Running Scrapy from script (Updated)


Wayne

Nov 16, 2012, 4:32:03 PM
to scrapy...@googlegroups.com
Hi,

I'm trying to run Scrapy from a Python script. These are the 2 tutorials I used:

http://tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/
http://snipplr.com/view/67006/using-scrapy-from-a-script/

I just can't get it to work. This is my code:

#!/usr/bin/python

import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'project.settings')
# Must be at the top before other imports

##################################################

from scrapy.item import Item, Field

class DmozItem(Item):
    link = Field()

##################################################

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["patentstorm.us"]

    start_urls = []
    for i in range(2):
        start_urls.append('http://www.patentstorm.us/search/index/q/battery/page/' + str(i+1) + '.html')

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//td/a')
        items = []
        for site in sites:
            item = DmozItem()
            item['link'] = site.select('@href').extract()
            items.append(item)
        return items

##################################################

from scrapy import project, signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy.xlib.pydispatch import dispatcher
from multiprocessing.queues import Queue
import multiprocessing

class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)

##################################################

# Usage
if __name__ == "__main__":
    result_queue = Queue()
    crawler = CrawlerWorker(DmozSpider(), result_queue)
    crawler.start()
    for item in result_queue.get():
        print item


What am I doing wrong??

Thanks!

Asheesh Laroia

Nov 17, 2012, 2:18:01 AM
to scrapy-users
Excerpts from Wayne's message of Fri Nov 16 16:32:03 -0500 2012:
> Hi,
>
> I'm trying to run Scrapy from a Python script. These are the 2
> tutorials I used:
> http://tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/
> http://snipplr.com/view/67006/using-scrapy-from-a-script/

FWIW, this is what I do:

https://github.com/openhatch/oh-bugimporters/blob/master/bugimporters/main.py#L25

I just skip the Scrapy project infrastructure at the moment and go
straight to runspider.
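
For context, the core of that main.py is a call to scrapy.cmdline.execute; a minimal sketch of the same idea is below. The file name, spider argument and feed values here are placeholders, not the exact ones from that repo:

import scrapy.cmdline

# Run a standalone spider file without a Scrapy project.
# '-a' passes a keyword argument to the spider's __init__,
# '-s' overrides a Scrapy setting for this run only.
scrapy.cmdline.execute([
    'scrapy', 'runspider', 'my_spider.py',   # path to the spider module (placeholder)
    '-a', 'input_filename=input.json',       # spider argument (placeholder)
    '-s', 'FEED_FORMAT=jsonlines',           # export items as JSON lines
    '-s', 'FEED_URI=items.jsonlines',        # where to write them (placeholder)
])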

-- Asheesh.

Steven Almeroth

Nov 17, 2012, 10:10:44 AM
to scrapy...@googlegroups.com
> I just can't get it to work.

What is the problem?

Wayne

Nov 17, 2012, 4:55:17 PM
to scrapy...@googlegroups.com
I don't understand where I should put my spider code and how to call it in the main function.

Thanks!

Wayne

Nov 19, 2012, 2:13:14 AM
to scrapy...@googlegroups.com
I'm looking at this part of the main function:

    args_for_scrapy = ['scrapy',
                       'runspider',
                       'bugimporters/main.py',
                       '-a', 'input_filename=%s' % (args.input,),
                       '-s', 'TELNETCONSOLE_ENABLED=0',
                       '-s', 'WEBSERVICE_ENABLED=0',
                       '-s', 'FEED_FORMAT=jsonlines',
                       '-s', 'FEED_URI=%s' % (args.output,),
                       '-s', 'CONCURRENT_REQUESTS_PER_DOMAIN=1',
                       '-s', 'CONCURRENT_REQUESTS=200',
                       '-s', 'DEPTH_PRIORITY=1',
                       '-s', 'SCHEDULER_DISK_QUEUE=scrapy.squeue.PickleFifoDiskQueue',
                       '-s', 'SCHEDULER_MEMORY_QUEUE=scrapy.squeue.FifoMemoryQueue',
                       ]
    return scrapy.cmdline.execute(args_for_scrapy)

I'm guessing scrapy.cmdline.execute(args_for_scrapy) is how you call the spider. Can you explain the args parameters? What are they for?

Thank you.

Wayne

Nov 19, 2012, 2:51:17 AM
to scrapy...@googlegroups.com
I also found out that using scrapy.cmdline.execute(args) just stops the script once the spider has run. Nothing after that call is executed, which defeats the purpose of running it from a script. That's probably why someone developed a workaround using multiprocessing.
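
For illustration, a minimal sketch of that kind of workaround (the spider file name is a placeholder): if cmdline.execute runs in a child process, the exit it triggers only ends the child, and the parent script keeps going.

from multiprocessing import Process
import scrapy.cmdline

def run_spider():
    # execute() ends the process when the crawl finishes (as observed above),
    # but inside a child process that only terminates the child.
    scrapy.cmdline.execute(['scrapy', 'runspider', 'my_spider.py'])  # placeholder file

if __name__ == '__main__':
    p = Process(target=run_spider)
    p.start()
    p.join()
    print 'crawl finished, the rest of the script still runs'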

Best.

ssslock

Nov 19, 2012, 10:47:44 PM
to scrapy...@googlegroups.com
I use another way to call a spider from a script, which serves me well on Scrapy 0.16; hope it helps.




start the crawler
====================================
import sys

from scrapy import log
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
from scrapy.settings import CrawlerSettings

from crawler import settings as settings_   # the settings module (settings.py) for Scrapy

def start(spider_name):
    log.start(loglevel='DEBUG')
    settings = CrawlerSettings(settings_)
    crawler = CrawlerProcess(settings)
    crawler.target_spider_name = spider_name            # an arg which will be delivered to my spider launcher
    crawler.install()
    crawler.configure()
    crawler.start()
    sys.exit(0)
==========================================

Then launch the spider in something like an extension that receives the engine_started signal.
Remember to register this module in the EXTENSIONS setting (see the sketch after the code below).
=========================================
class SpiderController(object):
    def __init__(self, crawler):
        self.crawler = crawler
        crawler.signals.connect(self.launch_spiders, signal=signals.engine_started)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def launch_spiders(self):
        spider_settings = all_spiders[self.crawler.target_spider_name]   # my own config for different spiders
        spider = MySpider(spider_settings)                               # the Spider instance
        self.crawler.engine.open_spider(spider, close_if_idle=False)     # close_if_idle is True by default; set to False and I will take care of it with some other scripts
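
For completeness, the EXTENSIONS entry could look like this in settings.py; the module path crawler.extensions.SpiderController is only an assumption about where the class lives:

# settings.py -- register the extension (module path is an assumption)
EXTENSIONS = {
    'crawler.extensions.SpiderController': 500,
}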

Wayne

Nov 20, 2012, 12:03:12 AM
to scrapy...@googlegroups.com
Thanks ssslock. I'll try this out and let you know what I find.

Wayne

Nov 21, 2012, 11:35:50 AM
to scrapy...@googlegroups.com
So after lots of research, I wrote this script using this source: http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python
But I got a bunch of errors, including this one: pickle.PicklingError: Can't pickle <function <lambda> at 0x03513170>: it's not found as scrapy.contrib.linkextractors.sgml.<lambda>

Here's my code. It seems that the multiprocessing approach no longer works with Scrapy 0.16?

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector 
from scrapy import project, signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy.xlib.pydispatch import dispatcher
from multiprocessing.queues import Queue
from multiprocessing import Process

class MypItem(Item):
    name = Field()
    link = Field() 
 
class MySpider1(CrawlSpider):
    count = 0
    name = 'patent'
    allowed_domains = ['patentstorm.us']
   
    rules = (
        Rule(SgmlLinkExtractor(allow=('inventors-patents')), follow='True'),
        Rule(SgmlLinkExtractor(allow=('description\.html')), callback='parse_item'),
    )
    
    def parse_item(self, response):
        self.count = self.count + 1
        print '################   ' + str(self.count)
        
        hxs = HtmlXPathSelector(response)
        item = MypItem()
        item['name'] = hxs.select('//h1/div/text()').extract()
        item['link'] = response.url
        return item
        
class CrawlerWorker(Process):
    def __init__(self, spider, results):
        Process.__init__(self)
        self.results = results

        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)

if __name__ == "__main__":
    # The part below can be called as often as you want
    results = Queue()
    crawler = CrawlerWorker(MySpider1(), results)
    crawler.start()
    for item in results.get():
        print item
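
For what it's worth, one hedged guess at that PicklingError: on platforms where multiprocessing has to pickle the Process object, the spider instance stored on it carries SgmlLinkExtractor rules that contain lambdas, and lambdas can't be pickled. Under that assumption, a sketch of a possible rework (reusing the imports from the code above) is to pass the spider class instead of an instance and build everything inside run(), so nothing unpicklable has to cross the process boundary:

class CrawlerWorker(Process):
    def __init__(self, spider_cls, results):
        Process.__init__(self)
        self.spider_cls = spider_cls   # keep only picklable state on the Process object
        self.results = results
        self.items = []

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        # Build the crawler and the spider inside the child process,
        # so the unpicklable pieces never need to be pickled.
        crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            crawler.install()
        crawler.configure()
        dispatcher.connect(self._item_passed, signals.item_passed)
        crawler.crawl(self.spider_cls())
        crawler.start()
        crawler.stop()
        self.results.put(self.items)

# usage: crawler = CrawlerWorker(MySpider1, results)   # pass the class, not an instance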

ssslock

Nov 22, 2012, 5:36:46 AM
to scrapy...@googlegroups.com
I've only been using Scrapy for a month, and I just got my spider to work.
The only thing I know about this is that when I try to start multiple spiders on one crawler, I receive a warning asking me to use some scrapyd settings instead.