'''Copied from Scrapy 1.03 docs at pdf page 15, section 2.3, Scrapy Tutorial.

Run this, as is, on Dmoz.
'''
import scrapy

from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    """Follow the dmoz.org directory-column links and scrape each listing."""

    name = "tutfollinks"
    allowed_domains = ["dmoz.org"]
    # NOTE(review): the pasted file had start_urls = [] — with no seed URLs the
    # spider schedules no requests and finishes immediately.  The crawl log
    # later in this paste shows the Python category page being fetched, so
    # seed the crawl with that URL.
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/",
    ]

    def parse(self, response):
        """Follow every category link found in the directory columns.

        Yields a scrapy.Request per link, handled by parse_dir_contents.
        """
        for href in response.css("ul.directory.dir-col > li > a::attr('href')"):
            # urljoin resolves the (possibly relative) href against the page URL.
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield one DmozItem (title, link, desc) per <li> listing on the page."""
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item
Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/usr/lib/pymodules/python2.7/scrapy/spiders/__init__.py", line 76, in parse
    raise NotImplementedError
NotImplementedError
2015-10-12 19:31:21 [scrapy] INFO: Closing spider (finished)
malikarumi@Tetuoan2:~/Projects/tutorial/tutorial/spiders$ cat dmoz_debug2.py��''' #this is the offending hidden character - by the way, 'delete' does not work to get rid of it
Copied from Scrapy 1.03 docs at pdf page 15, section 2.3, Scrapy Tutorial
Run this, as is, on Dmoz. It is dmoz_debug2, with the name of the spider 'dmoz'.
I changed this to iso-8859 per http://stackoverflow.com/questions/1067742/clean-source-code-files-of-invisible-characters.
'''
import scrapy

from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    """Follow the dmoz.org directory-column links and scrape each listing."""

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    # NOTE(review): the pasted file had start_urls = [] — with no seed URLs the
    # spider schedules no requests and finishes immediately.  The crawl log
    # later in this paste shows the Python category page being fetched, so
    # seed the crawl with that URL.
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/",
    ]

    def parse(self, response):
        """Follow every category link found in the directory columns.

        Yields a scrapy.Request per link, handled by parse_dir_contents.
        The NotImplementedError in the log means Scrapy fell back to the
        base Spider.parse — i.e. this method never made it into the class
        body, which is exactly what the collapsed/mangled paste would cause.
        """
        for href in response.css("ul.directory.dir-col > li > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield one DmozItem (title, link, desc) per <li> listing on the page."""
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item
malikarumi@Tetuoan2:~/Projects/tutorial/tutorial/spiders$ cat tutfollinksc.py
''' # this is my retyped copy, as you can see, without the offending character
This is tutfollinksc, the retyped spider in hopes of getting rid of hidden character
and not implemented error. It is in all respects identical to tutfollinks.
'''
import scrapy

from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    """Follow the dmoz.org directory-column links and scrape each listing."""

    # spider name changed so it does not collide with the spider in dmoz_debug2
    name = "tutlinkC_dmoz"
    allowed_domains = ["dmoz.org"]
    # NOTE(review): the pasted file had start_urls = [] — with no seed URLs the
    # spider schedules no requests and finishes immediately.  The crawl log
    # later in this paste shows the Python category page being fetched, so
    # seed the crawl with that URL.
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/",
    ]

    def parse(self, response):
        """Follow every category link found in the directory columns.

        Yields a scrapy.Request per link, handled by parse_dir_contents.
        """
        for href in response.css("ul.directory.dir-col > li > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield one DmozItem (title, link, desc) per <li> listing on the page."""
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item
malikarumi@Tetuoan2:~/Projects/tutorial/tutorial/spiders$ scrapy runspider tutfollinksc.py -o tutfollinks_dmoz_c.json
Traceback (most recent call last):
File "/usr/bin/scrapy", line 9, in <module> load_entry_point('Scrapy==1.0.3.post6-g2d688cd', 'console_scripts', 'scrapy')() File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 142, in execute cmd.crawler_process = CrawlerProcess(settings) File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 209, in __init__ super(CrawlerProcess, self).__init__(settings) File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 115, in __init__ self.spider_loader = _get_spider_loader(settings) File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 296, in _get_spider_loader return loader_cls.from_settings(settings.frozencopy()) File "/usr/lib/pymodules/python2.7/scrapy/spiderloader.py", line 30, in from_settings return cls(settings) File "/usr/lib/pymodules/python2.7/scrapy/spiderloader.py", line 21, in __init__ for module in walk_modules(name): File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 71, in walk_modules submod = import_module(fullpath) File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module __import__(name) File "/home/malikarumi/Projects/tutorial/tutorial/spiders/dmoz_debug2.py", line 1SyntaxError: Non-ASCII character '\xff' in file /home/malikarumi/Projects/tutorial/tutorial/spiders/dmoz_debug2.py on line 1, but no encoding declared; see http://python.org/dev/peps/pep-0263/ for details
malikarumi@Tetuoan2:~/Projects/tutorial/tutorial/spiders$ scrapy runspider tutfollinksc.py -o tutfollinks_dmoz_c.json
2015-10-15 21:27:42 [scrapy] INFO: Scrapy 1.0.3.post6+g2d688cd started (bot: tutorial)2015-10-15 21:27:42 [scrapy] INFO: Optional features available: ssl, http112015-10-15 21:27:42 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'tutorial.spiders', 'FEED_FORMAT': 'json', 'SPIDER_MODULES': ['tutorial.spiders'], 'FEED_URI': 'tutfollinks_dmoz_c.json', 'BOT_NAME': 'tutorial'}2015-10-15 21:27:42 [scrapy] INFO: Enabled extensions: CloseSpider, FeedExporter, TelnetConsole, LogStats, CoreStats, SpiderState2015-10-15 21:27:42 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats2015-10-15 21:27:42 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware2015-10-15 21:27:42 [scrapy] INFO: Enabled item pipelines: 2015-10-15 21:27:42 [scrapy] INFO: Spider opened2015-10-15 21:27:42 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)2015-10-15 21:27:42 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:60232015-10-15 21:27:43 [scrapy] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/> (referer: None)2015-10-15 21:27:43 [scrapy] ERROR: Spider error processing <GET http://www.dmoz.org/Computers/Programming/Languages/Python/> (referer: None)
Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/usr/lib/pymodules/python2.7/scrapy/spiders/__init__.py", line 76, in parse
    raise NotImplementedError
NotImplementedError
2015-10-15 21:27:43 [scrapy] INFO: Closing spider (finished)2015-10-15 21:27:43 [scrapy] INFO: Dumping Scrapy stats:{'downloader/request_bytes': 264, 'downloader/request_count': 1, 'downloader/request_method_count/GET': 1, 'downloader/response_bytes': 7386, 'downloader/response_count': 1, 'downloader/response_status_count/200': 1, 'finish_reason': 'finished', 'finish_time': datetime.datetime(2015, 10, 16, 2, 27, 43, 759336), 'log_count/DEBUG': 2, 'log_count/ERROR': 1, 'log_count/INFO': 7, 'response_received_count': 1, 'scheduler/dequeued': 1, 'scheduler/dequeued/memory': 1, 'scheduler/enqueued': 1, 'scheduler/enqueued/memory': 1, 'spider_exceptions/NotImplementedError': 1, 'start_time': datetime.datetime(2015, 10, 16, 2, 27, 42, 970895)}2015-10-15 21:27:43 [scrapy] INFO: Spider closed (finished)malikarumi@Tetuoan2:~/Projects/tutorial/tutorial/spiders$