from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import FormRequest, Request
from scrapy.selector import HtmlXPathSelector
from scrapy import log
from spider.items import Item
#from spider.settings import JsonWriterPipeline
class MySpider(CrawlSpider):
    name = 'facebook'
    allowed_domains = ['facebook.com']
    start_urls = ['https://login.facebook.com/login.php']
    def parse(self, response):
        # fill in and submit the login form found on the start page
        return [FormRequest.from_response(response,
                    formname='login_form',
                    formdata={'email': 'xxx',
                              'pass': 'xxx'},
                    callback=self.after_login)]
    def after_login(self, response):
        # check that the login succeeded before going on
        if "authentication failed" in response.body:
            self.log("Login failed", level=log.ERROR)
            return
        # logged in: request the page to scrape
        return Request(url="put some URL here",
                       callback=self.parse_items)
    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("some path here")
        items = []
        for title in titles:
            item = Item()
            # select relative to the current node ('title'), not the whole list
            item['friendName'] = title.select("some path here").extract().pop().encode('utf-8', 'strict')
            item['numberOffriends'] = title.select("some path here").extract().pop().encode('utf-8', 'strict')
            items.append(item)
        return items
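For reference, the Item class imported above has to declare the two fields used in parse_items. A minimal sketch of spider/items.py, assuming those are its only fields (the names come straight from the code above):

# spider/items.py -- minimal sketch; field names taken from parse_items
from scrapy.item import Item as BaseItem, Field

class Item(BaseItem):
    friendName = Field()
    numberOffriends = Field()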
So I'm storing the scraped data in a MySQL database (that's what the .pop().encode('utf-8', 'strict') calls are for: they turn each selector result into a plain UTF-8 string). For the moment I'm receiving garbage data; I still have to work on the XPaths and on the JavaScript embedded in the HTML tags.
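The MySQL part itself is done with an item pipeline. This is only a rough sketch of the idea, not my exact code: it assumes the MySQLdb driver is installed and that a friends table with these (made-up) columns already exists in a local facebook database.

# spider/pipelines.py -- rough sketch, assumes MySQLdb and an existing
# 'friends' table in a local 'facebook' database
import MySQLdb

class MySQLStorePipeline(object):
    def open_spider(self, spider):
        # one connection for the whole crawl
        self.conn = MySQLdb.connect(host='localhost', user='root',
                                    passwd='xxx', db='facebook',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query so values are escaped by the driver
        self.cursor.execute(
            "INSERT INTO friends (friend_name, number_of_friends) "
            "VALUES (%s, %s)",
            (item['friendName'], item['numberOffriends']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

The pipeline then gets enabled in settings.py with ITEM_PIPELINES = ['spider.pipelines.MySQLStorePipeline'] (a plain list in these older Scrapy versions; newer releases expect a dict with an order number).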
Hope this helps a little bit.