from scrapy.spider import BaseSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse
from scrapy.http import Request

from dirbot.items import WebsiteLoader
class DindexSpider(BaseSpider):
    """Spider for newslookup.com category index pages.

    Flow:
      parse()         -- index page: grab each publish-time cell, start an
                         item, follow the cell's link to the article page.
      parse_article() -- article page: add name/url to the item, follow on.
      parse_item()    -- final hop: emit the item carried through meta.
    """

    name = "dindex"
    # Desktop Chrome UA string so the site serves its normal markup.
    USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36")
    # Start URL taken from the crawl log; add more categories as needed.
    start_urls = ["http://www.newslookup.com/Business/"]
    # NOTE(review): `rules` only has an effect on CrawlSpider subclasses,
    # not BaseSpider -- kept (empty) for backward compatibility.
    rules = ()

    def parse(self, response):
        """Index-page callback: yield one article Request per time cell."""
        self.log("Scraping: " + response.url)
        for cell in response.xpath('//td[@class="stime3"]'):
            loader = WebsiteLoader(response=response, selector=cell)
            loader.add_xpath('publish_date', 'text()')
            item = loader.load_item()
            # BUG FIX: Request(url=...) needs a string, not a selector
            # (TypeError in the log: "Request url must be str or unicode,
            # got HtmlXPathSelector"). Extract the href text and resolve
            # it against the current page URL.
            hrefs = cell.xpath('a/@href').extract()
            if not hrefs:
                continue  # time cell without a link: nothing to follow
            yield Request(url=response.urljoin(hrefs[0]),
                          meta={'item': item},
                          callback=self.parse_article)

    def parse_article(self, response):
        """Article-page callback: fill in name/url, then follow the link."""
        self.log("scraping: " + response.url)
        for site in response.xpath('//td[@class="article"]'):
            loader = WebsiteLoader(response=response, selector=site)
            loader.add_xpath('name', 'a/text()')
            loader.add_xpath('url', 'a/@href')
            item = loader.load_item()
            # BUG FIX: the original yielded Request(url=times, ...) where
            # `times` does not exist in this method; follow the extracted
            # link instead and carry the item forward via meta so
            # parse_item() can read response.meta['item'].
            hrefs = site.xpath('a/@href').extract()
            if not hrefs:
                continue  # article cell without a link: skip it
            yield Request(url=response.urljoin(hrefs[0]),
                          meta={'item': item},
                          callback=self.parse_item)

    def parse_item(self, response):
        """Final callback: emit the item accumulated across requests."""
        item = response.meta['item']
        # BUG FIX: the original yielded `il.load_item()` with `il`
        # undefined in this scope; yield the item from meta instead.
        yield item
2016-02-06 12:21:22 [scrapy] ERROR: Spider error processing <GET http://www.newslookup.com/Business/> (referer: None)
Traceback (most recent call last):
File "C:\Python27\lib\site-packages\scrapy-1.0.4-py2.7.egg\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "C:\Python27\lib\site-packages\scrapy-1.0.4-py2.7.egg\scrapy\spidermiddlewares\offsite.py", line 28, in process_spider_output
for x in result:
File "C:\Python27\lib\site-packages\scrapy-1.0.4-py2.7.egg\scrapy\spidermiddlewares\referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Python27\lib\site-packages\scrapy-1.0.4-py2.7.egg\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Python27\lib\site-packages\scrapy-1.0.4-py2.7.egg\scrapy\spidermiddlewares\depth.py", line 54, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\virtualenvs\[TextIndexer]\Scripts\example\dindex\dirbot-mysql\dirbot\spiders\dindex.py", line 32, in parse
yield Request(url=time, callback=self.parse_article)
File "C:\Python27\lib\site-packages\scrapy-1.0.4-py2.7.egg\scrapy\http\request\__init__.py", line 24, in __init__
self._set_url(url)
File "C:\Python27\lib\site-packages\scrapy-1.0.4-py2.7.egg\scrapy\http\request\__init__.py", line 57, in _set_url
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
TypeError: Request url must be str or unicode, got HtmlXPathSelector:
2016-02-06 12:21:22 [scrapy] INFO: Closing spider (finished)