"downloader/exception_count": 1,
"downloader/exception_type_count/twisted.internet.error.ConnectionRefusedError": 1,
def _download_robots(self, robots_url, req_netloc, spider):
    """Schedule a download of *robots_url* through the crawler engine.

    Builds a high-priority Request for the robots.txt file and attaches
    success/error callbacks so the result (or failure) is recorded
    against *req_netloc*.

    :param robots_url: absolute URL of the robots.txt to fetch.
    :param req_netloc: netloc of the original request that triggered
        this robots.txt lookup; carried in the request meta so the
        callbacks can find the bookkeeping entry.
    :param spider: spider on whose behalf the download is made.
    """
    print("requested download: %s" % robots_url)
    robots_req = Request(
        robots_url,
        priority=self.DOWNLOAD_PRIORITY,
        # bypass_robots stops this request from itself being gated on
        # robots.txt, which would recurse indefinitely.
        meta={'bypass_robots': True, 'req_netloc': req_netloc}
    )
    dfd = self.crawler.engine.download(robots_req, spider)
    dfd.addCallback(self._download_success)
    dfd.addErrback(self._download_error, robots_req, spider)
def _download_error(self, failure, request, spider):
    """Errback for a failed robots.txt download attempt.

    If the failed attempt went through the nginx cache, mark the netloc
    as downloaded-for-nginx (presumably a direct retry is handled
    elsewhere — TODO confirm).  If the direct attempt failed too,
    derive an HTTP status for the cached robots rules: 401/403/404 are
    kept as-is, anything else (including non-HTTP failures such as
    connection errors) maps to 404 so fetching is allowed.  An empty
    rule set is cached under that status and the in-flight entry is
    removed.

    :param failure: Twisted Failure describing the download error.
    :param request: the robots.txt Request that failed.
    :param spider: spider the download was made for (unused here).
    """
    netloc = request.meta.get('req_netloc')
    url = urlparse_cached(request)
    # urlparse_cached returns a parsed-URL object, not a string, so the
    # original str + url concatenation raised TypeError; format instead.
    print('download error %s' % (url,))
    # Check if we have failed for nginx, we try directly
    if self._downloading_robots[netloc][0] == CacheLocation.nginx:
        self._downloading_robots[netloc] = CacheLocation.nginx, DownloadingStatus.downloaded
        print('download error nginx')
    else:
        print('download error direct')
        # We have failed directly too, check response codes and act accordingly
        if isinstance(failure.value, HttpError):
            http_status = failure.value.response.status
            # Only 401/403/404 carry meaning for robots handling; any
            # other HTTP error is treated as 404 (allow fetching).
            status = http_status if http_status in (401, 403, 404) else 404
            # status is an int; the original str + int concatenation
            # raised TypeError — use %s formatting.
            print('status http %s' % status)
        else:
            # Rest of failures, allow fetching ;)
            status = 404
            print('status failure %s' % status)
        # Make a reppy rule (empty body) and add it to the cache.
        rules = Rules(netloc, status, '', time() + self._cache_lifespan)
        self._robots_cache.add(rules)
        # Remove from downloading robots so future requests re-check the cache.
        self._downloading_robots.pop(netloc, None)