There is no link on the sfbay homepage that satisfies the regex "index\d00\.html"; perhaps you meant "index\d+\.html". Also, it is generally bad practice to mutate spider attributes inside the callbacks, since two callbacks can be running at "the same time".
Try this code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from myspider.items import CraigslistSampleItem
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["sfbay.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/"]
rules = (
Rule(SgmlLinkExtractor(allow=("index\d+\.html")), callback="parse_items_2", follow=True),
Rule(SgmlLinkExtractor(allow=('\/npo')), callback="parse_items_1"),
)
def __init__(self, *a, **kw):
super(MySpider, self).__init__(*a, **kw)
def parse_items_1(self, response):
print response.url
items = []
hxs = HtmlXPathSelector(response)
item = CraigslistSampleItem()
titles = hxs.select("//div")
for title in titles:
item["title"] = title.select("//li/a/text()").extract()
item["link"] = title.select("//li/a/@href").extract()
print item["title"], item["link"]
items.append(item)
return items
def parse_items_2(self, response):
print response.url
items = []
hxs = HtmlXPathSelector(response)
item = CraigslistSampleItem()
titles = hxs.select("//p")
for title in titles:
item["title"] = title.select("a/text()").extract()
item["link"] = title.select("a/@href").extract()
print item["title"]
items.append(item)
return items
On Tuesday, February 12, 2013 8:41:39 PM UTC-6, Srikanth Maru wrote:
As a part of learning, I'm trying to use scrapy to crawl through domain 'sfbay.craigslist.org' and my requirement is as follows,
for start_url = http://sfbay.craigslist.org/ use parse_items_1 to parse and identify the links and allow only the links with http://sfbay.craigslist.org/npo to follow.
for links in http://sfbay.craigslist.org/npo use parse_items_2 to parse and identify the links with xpath index\d00\.html
But my rule definition is incorrect, either all the pages crawled from the main domain use only one parse i.e. parse_items_1
or since I removed follow=true, it doesn't crawl multiple pages.
Code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from myspider.items import CraigslistSampleItem
# NOTE(review): this is the asker's original code, quoted for discussion.
# Known problems (see the answer above):
#   * allow=("index\d00\.html") matches no link on the sfbay homepage;
#     "index\d+\.html" was probably intended.
#   * self.items / self.item are spider attributes mutated inside the
#     callbacks — unsafe, since two callbacks can run at "the same time",
#     and every appended entry aliases the single shared item instance.
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["sfbay.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/"]
rules = (
# This regex matches nothing on the homepage (see note above).
Rule(SgmlLinkExtractor(allow=("index\d00\.html")), callback="parse_items_2", follow= True),
Rule(SgmlLinkExtractor(allow=('\/npo')), callback="parse_items_1"),
)
def __init__(self, *a, **kw):
super(MySpider, self).__init__(*a, **kw)
# Shared mutable state on the spider — the root of the problem.
self.items = []
self.item = CraigslistSampleItem()
def parse_items_1(self, response):
print response.url
hxs = HtmlXPathSelector(response)
titles = hxs.select("//div")
for title in titles:
# "//li/a" is absolute: it selects from the whole document on
# every iteration, not relative to the current <div>.
self.item ["title"] = title.select("//li/a/text()").extract()
self.item ["link"] = title.select("//li/a/@href").extract()
print self.item["title"], self.item["link"]
# Appends the same object repeatedly; all entries end up equal.
self.items.append(self.item)
return self.items
def parse_items_2(self, response):
print response.url
hxs = HtmlXPathSelector(response)
titles = hxs.select("//p")
for title in titles:
self.item ["title"] = title.select("a/text()").extract()
self.item ["link"] = title.select("a/@href").extract()
print self.item["title"]
# Same shared-instance problem as parse_items_1.
self.items.append(self.item)
return self.items