sync vs. async
Scrapy uses Twisted under the covers:
http://twistedmatrix.com
a nice discussion of asynchronous programming can be found here:
http://krondo.com/?p=1209
Try these two scripts out -- each one is invoked with a number and a
URL; the number is how many times to fetch that URL. You should see
that the larger the number of requests, the better the async model
performs.
-------8<---------sync-url.py----------8<----------
#!/usr/bin/python
import time
import urllib
def main(args):
limit = args[0]
url = args[1]
start = time.time()
for i in xrange(1, int(limit)):
urllib.urlopen(url)
print "finished request %s" % i
end = time.time()
print end - start
if __name__ == '__main__':
    # Everything after the script name: [count, url].
    from sys import argv
    main(argv[1:])
-------8<---------async-url.py----------8<----------
#!/usr/bin/python
from twisted.internet import reactor
from twisted.web.client import getPage
import time
# Ids of outstanding requests; cb() removes each id as its request
# completes and triggers shutdown() once this list empties.
jobs = list()
# Wall-clock start time; set in main() just before requests are issued.
start = 0
def shutdown():
global start
print time.time() - start
reactor.stop()
def cb(result, jobid):
print "finished request %d" % jobid
jobs.remove(jobid)
if not jobs:
shutdown()
def main(args):
    """Issue `limit` concurrent requests for `url` and record the start time.

    args is [limit, url] (both strings). The elapsed-time report and the
    reactor shutdown happen in cb()/shutdown() once every job finishes.

    NOTE(review): limit < 1 leaves `jobs` empty, so cb() never fires and
    the reactor runs forever -- acceptable for a demo, but worth knowing.
    """
    global start, jobs
    limit, url = args
    # range(1, limit + 1) yields exactly `limit` job ids; the original
    # range(1, limit) issued only limit - 1 requests (off by one).
    jobs = range(1, int(limit) + 1)
    start = time.time()
    for i in jobs:
        d = getPage(url)
        # addBoth runs cb on success AND failure: with addCallback alone,
        # one failed request would leave its id in `jobs` and the reactor
        # would never stop.
        d.addBoth(cb, i)
if __name__ == '__main__':
    from sys import argv
    # Schedule the requests first, then hand control to the event loop;
    # the reactor runs until cb()/shutdown() stops it.
    main(argv[1:])
    reactor.run()