Hello
I have a database containing a list of URLs.
I would like to crawl each URL to find its h1 and update the corresponding row in the database with the value of that h1,
but I don't understand how to retrieve the URL's value for use in the update query.
I tried url[0], but I get an error saying that url is not defined.
Thanks in advance for your help.
regards
class H1searchSpider(BaseSpider):
    """Crawl every URL stored in the ``urls`` table and store its first h1.

    ``start_requests`` reads the ``url`` column of the ``urls`` table and
    issues one request per row. ``parse`` extracts the ``//h1`` nodes of the
    downloaded page and writes the first one back to the ``h1`` column of the
    row the request originated from.
    """

    name = "h1search"

    def start_requests(self):
        """Open the database connection and yield one Request per stored URL.

        The URL exactly as stored in the database is attached to the request
        under ``meta['db_url']`` so that ``parse`` can find the matching row.
        Local variables of this method are not visible inside ``parse``; the
        request's ``meta`` dict is what carries data from one to the other.
        """
        self.db = MySQLdb.connect(
            host="localhost",
            user="root",
            passwd="",
            db="crawler_engine",
            charset="utf8",
            use_unicode=False,
        )
        cur = self.db.cursor()
        try:
            cur.execute("select url from urls")
            rows = cur.fetchall()
        finally:
            cur.close()

        for row in rows:
            db_url = row[0]
            yield Request(db_url, meta={"db_url": db_url})

    def parse(self, response):
        """Extract the page's h1 nodes and persist the first one.

        Returns a ``DmozItem`` whose ``'h1'`` field holds the list produced by
        ``hxs.select('//h1').extract()``. When that list is non-empty, the
        first entry is written to the ``h1`` column of the row whose ``url``
        equals the URL this request was created from.
        """
        hxs = HtmlXPathSelector(response)
        item = DmozItem()
        item['h1'] = hxs.select('//h1').extract()

        if item['h1']:
            # Prefer the URL recorded in start_requests: response.url can
            # differ from the stored value (e.g. after a redirect), in which
            # case the WHERE clause would match no row. Fall back to
            # response.url for requests that did not come from start_requests.
            db_url = response.meta.get("db_url", response.url)

            cursor = self.db.cursor()
            try:
                # Parameterized query: the driver escapes the values, so
                # quotes in the scraped h1 can neither break the statement
                # nor inject SQL.
                cursor.execute(
                    "update urls set h1 = %s where url = %s",
                    (item['h1'][0], db_url),
                )
                self.db.commit()
            finally:
                cursor.close()

        return item