I am new to Scrapy and Python. To handle dynamically loaded (AJAX) content, I am driving the page with Selenium inside my crawl spider. The code I have written is:
import scrapy
from streetdirectory.items import StreetdirectoryItem
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class StdallurlsSpider(scrapy.Spider):
    name = "stdallurls"
    allowed_domains = ["streetdirectory.com"]
    start_urls = ['http://www.streetdirectory.com/businessfinder/company/All/All/A/']

    def __init__(self, *args, **kwargs):
        super(StdallurlsSpider, self).__init__(*args, **kwargs)
        # connect to the remote Selenium server (headless HtmlUnit browser)
        self.driver = webdriver.Remote(
            "http://127.0.0.1:4444/wd/hub",
            webdriver.DesiredCapabilities.HTMLUNITWITHJS)

    def parse(self, response):
        self.driver.get(response.url)
        self.driver.implicitly_wait(10)
        item = StreetdirectoryItem()  # instantiate the item class
        item['page'] = response.url
        finalurls = []
        while True:
            try:
                # the link that triggers the next AJAX load
                next = self.driver.find_element_by_xpath(
                    './/span[@class="ver_11 viewLink"]/a')
                print "-------------next------------", next
                next.click()
                urls = self.driver.find_elements_by_xpath('.//h3[@class="fleft"]/a')
                print "===============urls============", urls
                for url in urls:
                    url = url.get_attribute("href")
                    print "...................url.......................", url
                    finalurls.append(url)
            except NoSuchElementException:
                # no more "view more" link, so stop paging
                break
        item['urls'] = finalurls
        self.driver.quit()
        return item
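To rule out problems in the spider itself, I think a minimal standalone script (outside Scrapy) should be enough to test the connection to the Selenium server. This is just a sketch assuming the default hub URL and HtmlUnit capabilities:

    from selenium import webdriver

    # connect to the standalone Selenium server on its default port 4444
    driver = webdriver.Remote(
        "http://127.0.0.1:4444/wd/hub",
        webdriver.DesiredCapabilities.HTMLUNITWITHJS)
    driver.get("http://www.streetdirectory.com/businessfinder/company/All/All/A/")
    print driver.title
    driver.quit()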
My items.py for this is:
from scrapy.item import Item, Field


class StreetdirectoryItem(Item):
    page = Field()
    urls = Field()
When I try to run the crawl, I get this error:
resp = opener.open(request)
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 111] Connection refused>
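From what I can tell, Errno 111 means nothing accepted the connection on 127.0.0.1:4444, so maybe the Selenium standalone server is not actually running. A quick check (just a sketch, using the hub's status endpoint with plain urllib2) would be:

    import urllib2

    # raises URLError (Connection refused) if the Selenium server is down
    print urllib2.urlopen("http://127.0.0.1:4444/wd/hub/status").read()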
If anybody knows the solution, could they please let me know?
#python #scrapy #ajax #seleniumrc #web-crawler