I am new to Scrapy and Python. To handle dynamically loaded (AJAX) content, I am driving the page with Selenium inside my crawl spider. The code I have written is:
import scrapy
from streetdirectory.items import StreetdirectoryItem
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class StdallurlsSpider(scrapy.Spider):
    name = "stdallurls"
    allowed_domains = ["streetdirectory.com"]
    start_urls = ['http://www.streetdirectory.com/businessfinder/company/All/All/A/']

    def __init__(self, *args, **kwargs):
        super(StdallurlsSpider, self).__init__(*args, **kwargs)
        # connect to the remote Selenium server (headless HtmlUnit browser)
        self.driver = webdriver.Remote(
            "http://127.0.0.1:4444/wd/hub",
            webdriver.DesiredCapabilities.HTMLUNITWITHJS)

    def parse(self, response):
        self.driver.get(response.url)
        self.driver.implicitly_wait(10)
        item = StreetdirectoryItem()  # instantiate the item class
        item['page'] = response.url
        finalurls = []
        while True:
            try:
                # the link that triggers the next AJAX load
                next = self.driver.find_element_by_xpath(
                    './/span[@class="ver_11 viewLink"]/a')
                print "-------------next------------", next
                next.click()
                urls = self.driver.find_elements_by_xpath('.//h3[@class="fleft"]/a')
                print "===============urls============", urls
                for url in urls:
                    url = url.get_attribute("href")
                    print "...................url.......................", url
                    finalurls.append(url)
            except NoSuchElementException:
                # no more "view more" link, so stop paging
                break
        item['urls'] = finalurls
        self.driver.quit()
        return item
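To rule out problems in the spider itself, I think a minimal standalone script (outside Scrapy) should be enough to test the connection to the Selenium server. This is just a sketch assuming the default hub URL and HtmlUnit capabilities:

    from selenium import webdriver

    # connect to the standalone Selenium server on its default port 4444
    driver = webdriver.Remote(
        "http://127.0.0.1:4444/wd/hub",
        webdriver.DesiredCapabilities.HTMLUNITWITHJS)
    driver.get("http://www.streetdirectory.com/businessfinder/company/All/All/A/")
    print driver.title
    driver.quit()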
My items.py for this is:
from scrapy.item import Item, Field


class StreetdirectoryItem(Item):
    page = Field()
    urls = Field()
When I try to run the crawl, I get this error:
resp = opener.open(request)
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 111] Connection refused>
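From what I can tell, Errno 111 means nothing accepted the connection on 127.0.0.1:4444, so maybe the Selenium standalone server is not actually running. A quick check (just a sketch, using the hub's status endpoint with plain urllib2) would be:

    import urllib2

    # raises URLError (Connection refused) if the Selenium server is down
    print urllib2.urlopen("http://127.0.0.1:4444/wd/hub/status").read()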
If anybody knows the solution, could they please let me know?
#python #scrapy #ajax #seleniumrc #web-crawler