from scrapy import log
from scrapy.http import FormRequest, Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider

from myproject.items import MyItem
class MySpiderLogin(BaseSpider):
    """Spider that authenticates through a login form, then follows
    item links from the post-login page and scrapes each item.

    Flow: parse (submit login form) -> after_login (verify auth)
          -> parse_logged (collect links) -> parse_item (build items).
    """

    name = 'spiderlogin'

    def parse(self, response):
        """Submit the login form found on the landing page.

        Returns a list with one FormRequest; Scrapy schedules it and
        calls ``after_login`` with the resulting response.

        NOTE(review): ``formnumber=1`` selects the *second* form on the
        page (0-indexed) — confirm against the real login page markup.
        Credentials are hard-coded placeholders; move them to spider
        arguments or settings before production use.
        """
        return [FormRequest.from_response(
            response,
            formnumber=1,
            formdata={'username': 'user', 'password': 'pass'},
            callback=self.after_login,
        )]

    def after_login(self, response):
        """Verify authentication succeeded before crawling further."""
        # Check login succeeded before going on.
        if "authentication failed" in response.body:
            # BUG FIX: the original referenced ``log`` without importing
            # it, so this branch raised NameError instead of logging.
            self.log("Login failed", level=log.ERROR)
            return
        # BUG FIX: the original returned unconditionally here, so
        # parse_logged/parse_item were unreachable and the spider
        # stopped right after a *successful* login. Hand the logged-in
        # response to the link collector to continue the crawl.
        return self.parse_logged(response)

    def parse_logged(self, response):
        """Collect item links from the post-login page and request each."""
        self.log("\nHi, I'm collecting links in this page! %s\n" % response.url)
        hxs = HtmlXPathSelector(response)
        # The "Rules" to get links — placeholder XPath, adjust per site.
        url_list = hxs.select('//somethingtoget/@href').extract()
        for item_url in url_list:
            yield Request(url=item_url, callback=self.parse_item)

    def parse_item(self, response):
        """Build a MyItem from a single item page."""
        self.log("\nHi, I'm parsing in this page! %s\n" % response.url)
        hxs = HtmlXPathSelector(response)
        item = MyItem()
        # extract() returns a list of matches; downstream pipeline is
        # presumably expected to handle the list form — verify.
        item['desc'] = hxs.select('//somethingtogetdescription').extract()
        return item