import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from africanstudies.items import AfricanstudiesItem
from scrapy.contrib.linkextractors import LinkExtractor
class DmozSpider(CrawlSpider):
name = "africanstudies"
allowed_domains = ["northwestern.edu"]
start_urls = [
"http://www.northwestern.edu/african-studies/about/"
]
def parse(self, response):
for sel in response.xpath('//div[2]/div[1]'):
item = AfricanstudiesItem()
item['url'] = response.url
item['title'] = sel.xpath('div[3]/*[@id="green_title"]/text()').extract()
item['desc'] = sel.xpath('div[4]/*').extract()
yield item
--
You received this message because you are subscribed to the Google Groups "scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to scrapy-users...@googlegroups.com.
To post to this group, send email to scrapy...@googlegroups.com.
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from africanstudies.items import AfricanstudiesItem
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
class AfricanstudiesSpider(CrawlSpider):
name = "africanstudies"
allowed_domains = ["northwestern.edu/african-studies"]
start_urls = [
"http://www.northwestern.edu/african-studies/about/"
]
rules = (Rule(LinkExtractor(allow=(r)),callback='parse_links',follow=True),)
def parse_links(self, response):
sel = scrapy.Selector(response)
for href in sel.xpath('//a/@href').extract():
url = urlparse.urljoin(response.url, href)
yield Request(url, callback = self.parse_items,)
def parse_items(self, response):
self.log('Hi, this is an item page! %s' % response.url)
for sel in response.xpath('//div[2]/div[1]'):
item = AfricanstudiesItem()
item['url'] = response.url
item['title'] = sel.xpath('div[3]/*[@id="green_title"]/text()').extract()
item['desc'] = sel.xpath('div[4]/*').extract()
yield item
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from africanstudies.items import AfricanstudiesItem
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
import urlparse
class AfricanstudiesSpider(CrawlSpider):
name = "africanstudies"
allowed_domains = ["northwestern.edu/african-studies"]
start_urls = [
"http://www.northwestern.edu/african-studies/about/"
]
rules = (Rule(LinkExtractor(allow=(r'')),callback='parse_links',follow=True),)
def parse_links(self, response):
links = response.xpath('//a/@href').extract()
for link in links:
url = urlparse.urljoin(response.url, link)import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from africanstudies.items import AfricanstudiesItem
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
import urlparse
class AfricanstudiesSpider(CrawlSpider):
name = "africanstudies"
allowed_domains = ["northwestern.edu"]
start_urls = [
"http://www.northwestern.edu/african-studies/about/"
]
rules = (Rule(LinkExtractor(allow=(r'african-studies')),callback='parse_links',follow=True),)
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from africanstudies.items import AfricanstudiesItem
class MySpider(CrawlSpider):
start_urls = ['http://www.northwestern.edu/african-studies']
rules = (
Rule(LinkExtractor(allow='african-studies'), follow=True, callback='parse_item'),
)
def parse_item(self, response):
self.log('Hi, this is an item page! %s' % response.url)
item = AfricanstudiesItem()
item['url'] = response.url
return itema single regular expression (or list of regular expressions) that the (absolute) urls must match in order to be extracted. If not given (or empty), it will match all links.