I often do web scraping and API retrieval involving tens of thousands of web pages. Originally I wrote in a shell language and used Awk one-liners extensively for the data processing, but it became ridiculously slow once most of the shell script was calls to Awk. So I started writing in Awk directly, with a few system()/getline calls out to external shell programs -- one of which is wget, to download the web page for processing. For example:
command = "wget -q -O- \"http:..\" > temp_file"
system(command)
This method works but requires (1) calling an external program, (2) saving to a temp file, and then (3) reading that file back into a variable using Gawk's readfile() extension (or getline). That's a lot of I/O and maintenance overhead.
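For comparison, a command pipe lets getline read wget's stdout directly, which drops the temp file but still spawns one external process per page; a minimal sketch:

    # Sketch: slurp wget's stdout through a command pipe (no temp file)
    cmd = "wget -q -O- \"" url "\""
    page = ""
    while ((cmd | getline line) > 0)
        page = page line "\n"
    close(cmd)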
Ideally I wanted a way to load a web page directly into a variable, without calling an external program or saving/reading anything to/from a file. That is what the following awget() function does.
It borrows heavily from Peteris Krumins's "get_youtube_vids.awk" script, which showed the way.
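Under the hood this relies on Gawk's built-in networking: a special file name of the form /inet/tcp/0/host/port opens a TCP socket that works with the |& coprocess operator. A minimal raw fetch, with a placeholder host, looks like:

    # Minimal sketch of a raw HTTP GET over gawk's TCP special file
    # (www.example.com is a placeholder host)
    svc = "/inet/tcp/0/www.example.com/80"
    print "GET / HTTP/1.0\r\nHost: www.example.com\r\n\r\n" |& svc
    while ((svc |& getline line) > 0)
        print line
    close(svc)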
Note the example URL is a redirect to en.wikipedia.org, which the code detects and handles correctly.
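For reference, the redirect arrives as an HTTP response carrying a Location header, roughly like this (exact status code and target may differ):

    HTTP/1.1 301 Moved Permanently
    Location: http://en.wikipedia.org/wiki/Awk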
Any suggestions or ideas on how to improve the function, or where it might fail, are welcome.
------------------
BEGIN {
    url = "http://www.wikipedia.org/wiki/Awk"
    s = awget(url)
    print s
}
# awget (replicates "wget -q -O- http:...")
# Download a URL and return its contents as a string.
# Handles redirects.
#
# Adapted from Peteris Krumins's "get_youtube_vids.awk":
# https://code.google.com/p/lawker/source/browse/fridge/gawk/www/get_youtube_vids.awk
#
function awget(url,    urlhost, urlrequest, c, i, a, p, f, j, output,
                       headerS, matches, inetfile, request, loop)
{
    # Parse URL into host and request path
    c = split(url, a, "/")
    urlhost = a[3]                          # www.domain.com
    for (i = 4; i <= c; i++)
        p[++f] = a[i]
    urlrequest = "/" join(p, 1, f, "/")     # /wiki/Feudalism

    # Gawk's special file /inet/tcp/0/host/port opens a TCP
    # connection; the 0 means "any local port"
    inetfile = "/inet/tcp/0/" urlhost "/80"
    request = "GET " urlrequest " HTTP/1.0\r\n"
    request = request "Host: " urlhost "\r\n\r\n"

    do {
        get_headers(inetfile, request, headerS)
        if ("Location" in headerS) {
            close(inetfile)
            # Follow the redirect: rebuild the connection and request
            if (match(headerS["Location"], /http:\/\/([^\/]+)(\/.+)/, matches)) {
                urlhost = matches[1]
                urlrequest = matches[2]
                inetfile = "/inet/tcp/0/" urlhost "/80"
                request = "GET " urlrequest " HTTP/1.0\r\n"
                request = request "Host: " urlhost "\r\n\r\n"
            }
            else {
                print "Failed 1 (" url "), unparsable Location header!" > "/dev/stderr"
                return -1
            }
        }
        loop++
    } while (("Location" in headerS) && loop < 5)

    if ("Location" in headerS) {
        print "Failed 2 (" url "), caught in Location loop!" > "/dev/stderr"
        return -1
    }

    # Read the response body
    while ((inetfile |& getline) > 0)
        output[++j] = $0
    close(inetfile)

    if (length(output) == 0)
        return -1
    return join(output, 1, j, "\n")
}
function get_headers(Inet, Request, headerS,    Matches, OLD_RS)
{
    delete headerS
    OLD_RS = RS    # save the global record separator

    print Request |& Inet

    # Get the HTTP status line, e.g. "HTTP/1.0 200 OK"
    if ((Inet |& getline) > 0) {
        headerS["_status"] = $2
    }
    else {
        print "Failed reading from the net. Quitting!" > "/dev/stderr"
        exit 1
    }

    # Header lines end in \r\n; a blank line ends the header block
    RS = "\r\n"
    while ((Inet |& getline) > 0) {
        # We could have used FS=": " to split, but header values may
        # themselves contain ": ", so a match() is safer
        if (match($0, /([^:]+): (.+)/, Matches))
            headerS[Matches[1]] = Matches[2]
        else
            break
    }
    RS = OLD_RS
}
# From the Gawk manual.
#
function join(array, start, end, sep,    result, i)
{
    if (sep == "")
        sep = " "
    else if (sep == SUBSEP)    # magic value
        sep = ""
    result = array[start]
    for (i = start + 1; i <= end; i++)
        result = result sep array[i]
    return result
}
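For the batch use case described at the top (tens of thousands of pages), a driver loop can feed awget() from a URL list; the file name urls.txt below is a hypothetical placeholder:

    # Hypothetical batch driver: one URL per line in "urls.txt"
    BEGIN {
        while ((getline url < "urls.txt") > 0) {
            page = awget(url)
            if (page != -1)
                print length(page), url
            else
                print "skipped: " url > "/dev/stderr"
        }
        close("urls.txt")
    }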