#!/usr/bin/python
import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'project.settings')
# Must be set at the top, before any other Scrapy imports
##################################################
from scrapy.item import Item, Field

class DmozItem(Item):
    link = Field()

##################################################
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["patentstorm.us"]
    start_urls = []
    for i in range(2):
        start_urls.append('http://www.patentstorm.us/search/index/q/battery/page/' + str(i+1) + '.html')

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//td/a')
        items = []
        for site in sites:
            item = DmozItem()
            item['link'] = site.select('@href').extract()
            items.append(item)
        return items

##################################################
from scrapy import project, signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy.xlib.pydispatch import dispatcher
from multiprocessing.queues import Queue
import multiprocessing

class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        # collect every item that makes it through the pipeline
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        # hand the collected items back to the parent process
        self.result_queue.put(self.items)

##################################################
# Usage
if __name__ == "__main__":
    result_queue = Queue()
    crawler = CrawlerWorker(DmozSpider(), result_queue)
    crawler.start()
    for item in result_queue.get():
        print item
What am I doing wrong?
Thanks!
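For comparison, here is a minimal sketch of the same item-collection wiring without the extra process, using only the calls the script above already makes (CrawlerProcess, scrapy.conf.settings, the item_passed signal) and reusing the DmozSpider defined above. It assumes it is acceptable for the crawl to block the calling script; the names collected and on_item are mine, introduced just for the illustration:

from scrapy import signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy.xlib.pydispatch import dispatcher

collected = []                     # illustrative name, not from the original script

def on_item(item):
    # item_passed fires for each item that clears the item pipeline
    collected.append(item)

dispatcher.connect(on_item, signals.item_passed)

crawler = CrawlerProcess(settings)
crawler.install()
crawler.configure()
crawler.crawl(DmozSpider())
crawler.start()                    # blocks until the crawl finishes
crawler.stop()

for item in collected:
    print item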
args_for_scrapy = [
    'scrapy', 'runspider', 'bugimporters/main.py',
    '-a', 'input_filename=%s' % (args.input,),
    '-s', 'TELNETCONSOLE_ENABLED=0',
    '-s', 'WEBSERVICE_ENABLED=0',
    '-s', 'FEED_FORMAT=jsonlines',
    '-s', 'FEED_URI=%s' % (args.output,),
    '-s', 'CONCURRENT_REQUESTS_PER_DOMAIN=1',
    '-s', 'CONCURRENT_REQUESTS=200',
    '-s', 'DEPTH_PRIORITY=1',
    '-s', 'SCHEDULER_DISK_QUEUE=scrapy.squeue.PickleFifoDiskQueue',
    '-s', 'SCHEDULER_MEMORY_QUEUE=scrapy.squeue.FifoMemoryQueue',
]
return scrapy.cmdline.execute(args_for_scrapy)

I'm guessing scrapy.cmdline.execute(args_for_scrapy) is how you called the spider. Can you explain the args parameters? What purpose do they serve? Thank you.
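For reference, here is my reading of those arguments, written as the same call with comments added. The values are copied from the snippet above; args is assumed to be the caller's own argparse namespace and is not defined here, so treat this as an annotated sketch rather than a runnable script:

import scrapy.cmdline

scrapy.cmdline.execute([
    'scrapy', 'runspider', 'bugimporters/main.py',    # run a standalone spider file, no Scrapy project required
    '-a', 'input_filename=%s' % (args.input,),        # -a passes a keyword argument to the spider's __init__
    '-s', 'TELNETCONSOLE_ENABLED=0',                  # -s overrides one Scrapy setting for this run:
    '-s', 'WEBSERVICE_ENABLED=0',                     #   disable the telnet console and web service extensions
    '-s', 'FEED_FORMAT=jsonlines',                    #   feed export: serialize each item as one JSON line ...
    '-s', 'FEED_URI=%s' % (args.output,),             #   ... and write the feed to this location
    '-s', 'CONCURRENT_REQUESTS_PER_DOMAIN=1',         #   at most one request in flight per domain
    '-s', 'CONCURRENT_REQUESTS=200',                  #   up to 200 requests in flight overall
    '-s', 'DEPTH_PRIORITY=1',                         #   together with the two FIFO queues below,
    '-s', 'SCHEDULER_DISK_QUEUE=scrapy.squeue.PickleFifoDiskQueue',  # this switches the crawl from
    '-s', 'SCHEDULER_MEMORY_QUEUE=scrapy.squeue.FifoMemoryQueue',    # depth-first to breadth-first order
])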