************************the items: items.py*********************************************
from scrapy.item import Item, Field
class ProjetvinnicolasItem(Item):
    """Container for one scraped wine row.

    Each attribute is a Scrapy ``Field``; the spider assigns to these keys
    by name, so the (French) spellings below must stay exactly as they are.
    """

    nomVin = Field()      # name suggests: wine name
    appelation = Field()  # name suggests: appellation
    millesime = Field()   # name suggests: vintage
    prix = Field()        # name suggests: price


# The spider imports ``Projetvinnicolas1Item`` from the items module, a name
# that was not defined here. Exposing it as an alias keeps both spellings
# importable without renaming the existing class.
Projetvinnicolas1Item = ProjetvinnicolasItem
************************the spider: test2.py*********************************************
#!/usr/bin/python
#-*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ProjetVinNicolas1.items import Projetvinnicolas1Item
import sys
import codecs
### Kludge to set default encoding to utf-8
# Python 2 only: neither the ``reload`` builtin nor
# ``sys.setdefaultencoding`` exists on Python 3.
# NOTE(review): the reload is presumably needed because the interpreter
# removes ``sys.setdefaultencoding`` at startup -- confirm for the target
# runtime. This changes the process-wide default used for implicit
# str/unicode conversions, so it affects every module, not just this one.
reload(sys)
sys.setdefaultencoding('utf-8')
class MySpider(CrawlSpider):
    """Crawl spider named ``vino`` that scrapes wine rows from nicolas.com.

    Starting from the listing URL in ``start_urls``, the rules follow links
    and hand matching pages to ``parse_items``, which builds one item per
    table row.
    """

    name = "vino"
    allowed_domains = ["nicolas.com"]
    # Scrapy seeds the crawl from the plural ``start_urls`` attribute; the
    # previous ``start_url`` spelling was silently ignored, so no request
    # was ever scheduled.
    # NOTE(review): the trailing "/" after ".html" looks unusual -- confirm
    # the site serves this exact URL.
    start_urls = ["http://www.nicolas.com/fr/commander_bordeaux.html/"]

    rules = (
        # Links extracted from the first <a> inside the
        # "glo_pagination_centre" div. No callback is given, so these pages
        # are only followed, not parsed.
        Rule(SgmlLinkExtractor(restrict_xpaths='//div[@class="glo_pagination_centre"]/a[1]')),
        # Links whose URL matches the pattern are parsed by ``parse_items``
        # and also followed for further links.
        Rule(SgmlLinkExtractor(allow=r'18_409~\d+~10\.htm'), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        """Extract one item per data row of the ``cpt_fav_table_commande`` table.

        Args:
            response: the downloaded page passed in by the crawl rule.

        Returns:
            A list of ``Projetvinnicolas1Item`` objects. Every field holds
            the list returned by ``extract()`` (empty when the cell has no
            matching text node).
        """
        hxs = HtmlXPathSelector(response)
        # position()>1 skips the first <tr> of the table.
        rows = hxs.select('//table[@class="cpt_fav_table_commande"]/tr[position()>1]')
        items = []
        # A distinct loop variable avoids rebinding the row collection
        # while it is being iterated.
        for row in rows:
            item = Projetvinnicolas1Item()
            item["nomVin"] = row.select('td[3]/a/text()').extract()
            item["appelation"] = row.select('td[5]/text()').extract()
            item["millesime"] = row.select('td[7]/text()').extract()
            item["prix"] = row.select('td[9]/b/text()').extract()
            items.append(item)
        return items