Hello Everyone,
I am new to Scrapy, so please bear with me.
I was able to crawl several web pages and get the data I needed by feeding the URLs directly into the start_urls variable, even thousands of them.
But now I have a crawler that I feed about 30,000 URLs (all on the same domain), and it stops after crawling only about 100. I just do not know why! :(
I tried googling and searching for it, but could not find a solution, so I was hoping someone could shed some light and help me with my problem.
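Could one of Scrapy's built-in limits be the cause? I am not sure which of these applies to my setup, but these are standard Scrapy settings that can stop or throttle a crawl early (the values below are only examples, not what I have configured):

CLOSESPIDER_PAGECOUNT = 100  # CloseSpider extension: stop after N pages
CLOSESPIDER_ITEMCOUNT = 100  # stop after N scraped items
CLOSESPIDER_TIMEOUT = 3600   # stop after N seconds
DEPTH_LIMIT = 2              # maximum crawl depth
LOG_LEVEL = 'DEBUG'          # verbose log, to see why requests are dropped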
Here is the script:
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
import MySQLdb
import logging
import datetime
import os
import time
from scrapy.spider import Spider
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from dirbot.items import Website
now = datetime.datetime.now()
LOG_FILENAME = os.environ['CRAWLER_LOG']+'PRINFOR_4_date_'+ str(now.day)+'_'+str(now.month)+'_'+str(now.year)+'_hour_'+str(now.hour)+'_'+str(now.minute)+'.log'
logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG)
db = MySQLdb.connect(host=os.environ['CRAWLER_HOST'], user=os.environ['CRAWLER_USER'],
                     passwd=os.environ['CRAWLER_PASS'], db=os.environ['CRAWLER_DB'])
cur = db.cursor()
class DmozSpider(Spider):
    name = "dmoz_prinfor_4"
    allowed_domains = ["prinfor.pt"]

    # Feed the links from the database into the crawler.
    cur.execute("select distinct(link) from main_links where loja_link_id=19 and type=%s", ["P"])
    start_urls = [row[0] for row in cur.fetchall()]

    # Only clear out the old products when there are links to crawl.
    if start_urls:
        cur.execute("delete from produtos where loja_cod_id=19")
        cur.execute("commit")
    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//body')
        items = []
        for site in sites:
            item = Website()
            item['description'] = site.xpath('//h2[@class="productName"]/text()').extract()
            item['price'] = site.xpath('//span[@id="our_price_display"]/text()').extract()
            item['ref'] = site.xpath('//p[@align="center"]/text()').extract()
            item['cat1'] = site.xpath('//span[@class="navigation_end"]/a/text()').extract()
            # Append an empty string to each field so the [0] accesses below
            # never fail on an empty extract() result.
            item['description'].append("")
            item['price'].append("")
            item['ref'].append("")
            item['cat1'].append("")
            items.append(item)
            # Normalise the extracted values.
            item['description'][0] = item['description'][0].strip()
            item['price'][0] = item['price'][0].strip()
            item['price'][0] = item['price'][0].replace(',', '.')
            item['price'][0] = item['price'][0].replace(' ', '')
            item['price'][0] = item['price'][0][:-1]  # drop the trailing currency symbol
            item['ref'][0] = item['ref'][0][12:]      # drop the fixed-length prefix
            item['description'][0] = item['description'][0].encode('utf-8', 'replace')
            # Compute the next index_id for this store.
            cur.execute("select MAX(index_id) from produtos where loja_cod_id=19")
            for row in cur.fetchall():
                if not row[0]:
                    index_id = 1
                else:
                    index_id = int(row[0]) + 1
            prod_cod_id = "PRINFOR-" + str(index_id)
            string_url = str(response.url)
            insert = ("INSERT INTO produtos (prod_cod_id, loja_cod_id, index_id, act, ref_num, name, prod, price_eur, price_ori, cur_ori, link, disp, cat_n1, cat_n2, cat_n3, cat_n4, new) "
                      "VALUES (%s, 19, %s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s, %s, %s, %s)")
            data = [prod_cod_id, index_id, "Y", item['ref'][0], item['description'][0], "", item['price'][0], "EUR", string_url, 'Verificar loja'.decode('utf-8'), item['cat1'][0], "", "", "", "Y"]
            print index_id
            try:
                cur.execute(insert, data)
                cur.execute("commit")
                logging.debug('Inserted the product link: ' + string_url)
            except MySQLdb.OperationalError:
                logging.debug('An error occurred on the product link: ' + string_url)
        return items

logging.debug('End of LOG.')
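In case it is relevant: I also wondered whether building start_urls at class-definition time could be part of the problem. Here is a minimal sketch of how I understand the same feeding could be done lazily through start_requests inside the spider class instead (start_requests is the standard Scrapy hook; the query and self.parse are the ones from my script above):

    def start_requests(self):
        # Fetch the links from the database and yield one Request per link,
        # instead of materialising the full list in start_urls up front.
        cur.execute("select distinct(link) from main_links where loja_link_id=19 and type=%s", ["P"])
        for row in cur.fetchall():
            yield scrapy.Request(row[0], callback=self.parse)

Would moving the feeding there make any difference, or should the 30,000 URLs in start_urls work just as well?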