Greetings,

I have searched far and wide and worked through the Scrapy documentation, but am really struggling to understand how to read a cookie value correctly. I found this hit on Stack Overflow, http://stackoverflow.com/questions/8708346/access-session-cookie-in-scrapy-spiders, but when I use:

from scrapy.http.cookies import CookieJar

class MySpider(BaseSpider):
    def parse(self, response):
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)

what should I do now to iterate through the cookies, or to retrieve the value of a specific cookie? I have spent a day trying to figure this out myself but have had no success, and am really hoping that someone has a simple example of reading a cookie.

Thanks in advance!
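(For anyone landing on this thread with the same question, here is a minimal sketch of reading values back out of that jar. It assumes Scrapy's CookieJar wrapper keeps the underlying cookielib jar in its .jar attribute, which is the case in the versions I have looked at, and 'sessionid' is just a made-up cookie name for illustration:

from scrapy.http.cookies import CookieJar

def parse(self, response):
    cookie_jar = response.meta.setdefault('cookie_jar', CookieJar())
    cookie_jar.extract_cookies(response, response.request)
    # The wrapped cookielib jar is iterable and yields Cookie objects.
    for cookie in cookie_jar.jar:
        self.log('cookie %s=%s (domain %s)' % (cookie.name, cookie.value, cookie.domain))
    # Pull out one specific cookie, e.g. a hypothetical 'sessionid'.
    session_value = next(
        (c.value for c in cookie_jar.jar if c.name == 'sessionid'), None)
    self.log('sessionid value: %s' % session_value)

This is only a sketch of the approach, not a definitive answer.)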
Steven,
Thank you for looking through this. I had not noticed the cookie expiry, but I am sure that must be right. Would this have been done to prevent the user's login cookie details from persisting on the client?
In any case, I managed to work around this by reading the Set-Cookie header values instead.
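In essence the workaround is just to read the raw Set-Cookie response headers in the callback and pull the value out with a regex. A stripped-down sketch of the idea (this sits inside the spider and needs import re; user_id is the cookie my site happens to set, yours will differ):

def check_login_response(self, response):
    # Every Set-Cookie header the server sends back is available on the response;
    # getlist() returns all of them, not just the last one.
    for header in response.headers.getlist('Set-Cookie'):
        match = re.search(r'user_id=([^;]+)', header)
        if match:
            self.log('user_id cookie value: %s' % match.group(1))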
I have posted my full code below in case this is useful to someone …
from scrapy.contrib.spiders.init import InitSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest, Request
from scrapy.exceptions import CloseSpider
from myinitspider.items import courtcase
import re
import urllib, urllib2, cookielib
class myinitspider(InitSpider):
    name = "myinitspider"
    domain_name = "<my website to crawl>.com"
    start_urls = ["http://www.<my website to crawl>.com/ListSummary.aspx?FolderID=3232"]
    first_page = "http://www.<my website to crawl>.com/"
    login_page = "http://www.<my website to crawl>.com/handlers/login.ashx"
    courtcase_urls = []  # all court case URLs on one page
    nextpagehref = [2]
    eventtargetvar = ""
    eventargumentvar = ""
    counter = 0
    login_failed = False
    def init_request(self):
        return Request(url=self.first_page, dont_filter=True, callback=self.login)
    def login(self, response):
        """Generate a login request."""
        return [FormRequest(url=self.login_page,
                            formdata={'un': 'my username', 'pw': 'my password', 'rm': 'on'},
                            callback=self.check_login_response)]
    def check_login_response(self, response):
        # Check every Set-Cookie header returned by the login handler;
        # getlist() is needed because there can be more than one.
        for header in response.headers.getlist('Set-Cookie'):
            if re.search('user_id=0', header):
                self.login_failed = True
                self.log('Login failed, closing spider: %s' % header)
                # Raising CloseSpider is the supported way to stop the crawl from a callback.
                raise CloseSpider('LOGIN FAILED')
        if not self.login_failed:
            return self.initialized()
        else:
            self.log('Login Failed is TRUE %s' % self.login_failed)
            return
    # Getting the court case page urls from each main page
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        nextpagehref = hxs.select('//a[contains(text(),"Next")]').re(r"'(.+?)'")
        eventtargetvar = str(nextpagehref[0])
        eventargumentvar = str(nextpagehref[1])
        # Extract court case urls from the page and parse each one for item data.
        # (The loop variable is named courtcase_url so it does not shadow the imported courtcase item class.)
        self.courtcase_urls.extend(hxs.select("//a[contains(@class,'h2-L06-link')]/@href").extract())
        for courtcase_url in self.courtcase_urls:
            yield Request("http://www.<my website to crawl>.com/%s" % courtcase_url,
                          meta={'page': eventargumentvar,
                                'originalurl': courtcase_url},
                          callback=self.parse_courtcase)
        # Follow pagination up to the counter limit.
        if self.counter < 4000:
            self.counter = self.counter + 1
            # Follow the next page link.
            yield FormRequest.from_response(response,
                                            formdata={'__EVENTTARGET': eventtargetvar,
                                                      '__EVENTARGUMENT': eventargumentvar},
                                            dont_click=True,
                                            callback=self.parse)
    # Getting the required content from each court case page
    def parse_courtcase(self, response):
        hxs = HtmlXPathSelector(response)
        case = courtcase()
        case['page'] = response.meta['page']
        case['originalurl'] = response.meta['originalurl']
        case['url'] = response.url
        # Try to select the date
        raw = hxs.select("//p[contains(text(), 'Date:')]/following-sibling::p[position()=1]/text()").extract()
        if raw:
            match = re.search(r"'(.+?)'", str(raw))
            if match:
                case['date'] = match.group(1)
            else:
                case['date'] = "Regex match failed"
        else:
            raw = hxs.select("//text()[contains(., 'Date:')]").extract()
            # raw = hxs.select("//div[@id='ctl00_midCPHL_ils_cbArticleText']/text()[3]").extract()
            if raw:
                match = re.search(r": (.+?)'", str(raw))
                if match:
                    case['date'] = match.group(1)
                else:
                    case['date'] = "Regex match failed"
            else:
                case['date'] = "XPath select failed"
                raw = "XPath select failed"
        # self.log('XPath Selected Date = %s' % raw)
        # self.log('RegEx Processed Date = %s' % case['date'])
        # [.... any more item scraping code required ....]
        return case
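For completeness: the spider imports a courtcase item from myinitspider.items, whose definition is not shown in the post. Based on the fields the spider assigns, it would look something like the sketch below (the field list is an assumption inferred from the code above):

from scrapy.item import Item, Field

class courtcase(Item):
    # Fields assigned in parse() / parse_courtcase() above.
    page = Field()
    originalurl = Field()
    url = Field()
    date = Field()
    # ... plus whatever other fields the remaining scraping code fills in.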