Re: Scrapy: defining multiple rules

800 views
Skip to first unread message

Steven Almeroth

unread,
Feb 16, 2013, 2:44:51 PM2/16/13
to scrapy...@googlegroups.com
There is no link on the sfbay homepage that satisfies this regex, "index\d00\.html", perhaps you meant, "index\d+\.html".  Also, it is generally bad practice to alter spider attributes inside the callbacks since two callbacks can be running at "the same time".

Try this code:

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import HtmlXPathSelector

    from myspider.items import CraigslistSampleItem

    class MySpider(CrawlSpider):
        name = "craigs"
        allowed_domains = ["sfbay.craigslist.org"]
        start_urls = ["http://sfbay.craigslist.org/"]

        rules = (
           Rule(SgmlLinkExtractor(allow=("index\d+\.html")), callback="parse_items_2", follow=True),
           Rule(SgmlLinkExtractor(allow=('\/npo')), callback="parse_items_1"),
           )

        def __init__(self, *a, **kw):
            super(MySpider, self).__init__(*a, **kw)

        def parse_items_1(self, response):
            print response.url
            items = []
            hxs = HtmlXPathSelector(response)
            item = CraigslistSampleItem()
            titles = hxs.select("//div")

            for title in titles:
                item["title"] = title.select("//li/a/text()").extract()
                item["link"] = title.select("//li/a/@href").extract()
                print item["title"], item["link"]
                items.append(item)

            return items

        def parse_items_2(self, response):
            print response.url
            items = []
            hxs = HtmlXPathSelector(response)
            item = CraigslistSampleItem()
            titles = hxs.select("//p")

            for title in titles:
                item["title"] = title.select("a/text()").extract()
                item["link"] = title.select("a/@href").extract()
                print item["title"]
                items.append(item)

            return items


On Tuesday, February 12, 2013 8:41:39 PM UTC-6, Srikanth Maru wrote:

As a part of learning, I'm trying to use scrapy to crawl through domain 'sfbay.craigslist.org' and my requirement is as follows,

for start_url = http://sfbay.craigslist.org/ use parse_items_1 to parse and identify the links and allow only the links with http://sfbay.craigslist.org/npo to follow.

for links in http://sfbay.craigslist.org/npo use parse_items_2 to parse and identify the links matching the regex index\d00\.html

But my rule definition is incorrect: either all the pages crawled from the main domain use only one parse callback (parse_items_1), or, if I remove follow=True, it doesn't crawl multiple pages.

Code:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from myspider.items import CraigslistSampleItem


class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/"]

    rules = (
       Rule(SgmlLinkExtractor(allow=("index\d00\.html")), callback="parse_items_2", follow= True),
       Rule(SgmlLinkExtractor(allow=('\/npo')), callback="parse_items_1"),
       )

    def __init__(self, *a, **kw):
        super(MySpider, self).__init__(*a, **kw)
        self.items = []
        self.item = CraigslistSampleItem()

    def parse_items_1(self, response):
        print response.url
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div")
        for title in titles:
            self.item ["title"] = title.select("//li/a/text()").extract()
            self.item ["link"] = title.select("//li/a/@href").extract()
            print self.item["title"], self.item["link"]
            self.items.append(self.item)
        return self.items

    def parse_items_2(self, response):
        print response.url
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        for title in titles:
            self.item ["title"] = title.select("a/text()").extract()
            self.item ["link"] = title.select("a/@href").extract()
            print self.item["title"]
            self.items.append(self.item)
        return self.items




Reply all
Reply to author
Forward
0 new messages