I am learning to scrape with Selenium and Scrapy. I have a page with a list of links. I want to click the first link, visit that page and crawl its items, then come back to the main page (the previous page with the list of links), click the second link, crawl it, and repeat until all the desired links are done. All I could do was click the first link, and then my crawler stops. What can I do to crawl the second link and the remaining ones?
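Conceptually, the flow I am after is something like this (just a sketch of the idea, not my real code; the locators and URL here are placeholders):

from selenium import webdriver

driver = webdriver.Firefox()
driver.get('https://example.com/list-page')  # placeholder for my list page

# Re-find the links on every pass: after driver.back() the page is
# reloaded, so element references from the previous pass go stale.
total = len(driver.find_elements_by_xpath('//a'))  # placeholder locator
for i in range(total):
    links = driver.find_elements_by_xpath('//a')  # placeholder locator
    links[i].click()   # open the i-th detail page
    # ... crawl the items on the detail page here ...
    driver.back()      # return to the list page for the next link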
My spider looks like this:
import scrapy
from scrapy import Request
from scrapy.selector import Selector
from selenium import webdriver

from myproject.items import northshoreSpiderItem  # adjust to the real items module


class NorthshoreSpider(scrapy.Spider):
    name = 'northshore'

    def __init__(self):
        self.driver = webdriver.Firefox()
        # WebDriverWait wait = new WebDriverWait(driver, 10);
        # wait.until(ExpectedConditions.frameToBeAvailableAndSwitchToIt('win0divPAGECONTAINER'))

    def parse(self, response):
        self.driver.get('https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL')
        selector = Selector(response)
        while True:
            # "Next" control that pages through the job list
            next_link = self.driver.find_element_by_xpath('//*[@id="HRS_APPL_WRK_HRS_LST_NEXT"]')
            try:
                # Each job opening ID sits in a PSEDITBOX_DISPONLY span
                for link in selector.css('span.PSEDITBOX_DISPONLY').re(r'.*>(\d+)<.*'):
                    abc = ('https://eapplicant.northshore.org/psp/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL'
                           '?Page=HRS_CE_JOB_DTL&Action=A&JobOpeningId=' + link +
                           '&SiteId=1&PostingSeq=1')
                    yield Request(abc, callback=self.parse_listing_page,
                                  headers={"X-Requested-With": "XMLHttpRequest"},
                                  dont_filter=True)
                next_link.click()
            except:
                break
        # self.driver.close()

    def parse_dynamiccontain(self, response):
        # (currently not wired into the crawl)
        selector = Selector(response)
        url = selector.xpath('//div[@id="win0divPAGECONTAINER"]').extract()
        yield Request(url, callback=self.parse_listing_page,
                      headers={"X-Requested-With": "XMLHttpRequest"}, dont_filter=True)

    def parse_listing_page(self, response):
        selector = Selector(response)
        item = northshoreSpiderItem()
        print response
        item['CompanyName'] = "NorthShore University Health System"
        item['JobDetailUrl'] = response.url
        item['Title'] = selector.xpath('//*[@id="HRS_JO_WRK_POSTING_TITLE$0"]/text()').extract()
        item['Internaljobid'] = selector.xpath(".//*[@id='HRS_JO_WRK_HRS_JOB_OPENING_ID$0']/text()").extract()
        item['City'] = selector.xpath(".//*[@id='HRS_CE_WRK2_HRS_CE_JO_LCTNS$0']/text()").extract()
        item['State'] = selector.xpath(".//*[@id='HRS_CE_WRK2_HRS_CE_JO_LCTNS$0']/text()").extract()
        item['PositionType'] = selector.xpath(".//*[@id='HRS_CE_WRK2_HRS_FULL_PART_TIME$0']/text()").extract()
        item['Country'] = "US"
        item['Zipcode'] = "00000"
        item['Description'] = selector.xpath('//*[@id="HRS_JO_PDSC_VW_DESCRLONG$0"]/p[1]').extract()
        yield item
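For reference, northshoreSpiderItem just declares the fields used above, roughly like this (assumed definition; the field names are taken from the code):

import scrapy

class northshoreSpiderItem(scrapy.Item):
    CompanyName = scrapy.Field()
    JobDetailUrl = scrapy.Field()
    Title = scrapy.Field()
    Internaljobid = scrapy.Field()
    City = scrapy.Field()
    State = scrapy.Field()
    PositionType = scrapy.Field()
    Country = scrapy.Field()
    Zipcode = scrapy.Field()
    Description = scrapy.Field()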
Note:
At the moment it repeatedly crawls the first page of jobs only, not all pages. I build each job details URL by appending the internal job ID, and from the detail page I extract the job details and description. Can anyone guide me on how to crawl all pages of this site?
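My suspicion is that selector is built once from the original Scrapy response, so every pass of the while loop re-reads the first page even after next_link.click(). Here is a sketch of what I think parse should do instead, re-parsing self.driver.page_source on each pass (assuming that is the right approach):

from scrapy import Request
from scrapy.selector import Selector
from selenium.common.exceptions import NoSuchElementException

def parse(self, response):
    self.driver.get(response.url)
    while True:
        # Parse whatever page Selenium is currently showing, not the
        # original response, so each pass sees the new page of results.
        selector = Selector(text=self.driver.page_source)
        for link in selector.css('span.PSEDITBOX_DISPONLY').re(r'.*>(\d+)<.*'):
            url = ('https://eapplicant.northshore.org/psp/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL'
                   '?Page=HRS_CE_JOB_DTL&Action=A&JobOpeningId=' + link +
                   '&SiteId=1&PostingSeq=1')
            yield Request(url, callback=self.parse_listing_page,
                          headers={"X-Requested-With": "XMLHttpRequest"},
                          dont_filter=True)
        try:
            self.driver.find_element_by_xpath('//*[@id="HRS_APPL_WRK_HRS_LST_NEXT"]').click()
        except NoSuchElementException:
            break  # no "Next" control left, so this was the last page

I suspect a WebDriverWait after the click (for the next page of results to render) would also be needed, since the paging looks AJAX-driven, but I am not sure how to fit it in.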
Thanks