div from xpath returns empty

66 views
Skip to first unread message

Jordan Rodrigues

unread,
Jun 27, 2016, 5:20:49 AM6/27/16
to scrapy-users
Hi there everyone, good evening!

I used Scrapy to crawl some songs from this site: http://www.vagalume.com.br/ (this is how vagalume.json was generated)

The idea now is to crawl the same songs I got from the site above on this other site: https://www.letras.mus.br

I tried to read the data from vagalume.json and search for each song on that second site, but the div selected by my XPath comes back empty.

I think the reason is that the spider finishes reading the search page before the server returns the query results. I'm not sure, though. What can I do about it?

Here is the code (the current parse method is the one I was using for debugging):

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MusicItem(scrapy.Item):
    """Scrapy item holding the data scraped for a single song."""

    # Song title.
    name = scrapy.Field()
    # Artist credited for the song.
    author = scrapy.Field()
    # Lyrics text.
    lyrics = scrapy.Field()




import scrapy
import json

from Letras.items import MusicItem

class LetrasSpider(scrapy.Spider):
    """Spider that scrapes song name, author and lyrics from letras.mus.br.

    ``parse`` is the default callback for ``start_urls`` and only logs the
    search-result containers it finds. ``parse2`` builds one search request
    per entry of a local ``vagalume.json`` file; nothing in this class
    references ``parse2`` as a callback, so it has to be wired in explicitly
    to be used.
    """

    name = "letras"
    allowed_domains = ["letras.mus.br"]
    start_urls = [
        "https://www.letras.mus.br/?q=peter%20hollens%20misty%20mountains"
    ]

    def cleanString(self, text):
        """Return *text* reduced to alphanumerics and spaces.

        Alphanumeric characters are kept, every whitespace character becomes
        a single ``' '``, and every other character is dropped.
        """
        return "".join(
            c if c.isalnum() else " "
            for c in text
            if c.isalnum() or c.isspace()
        )

    def retrieveLyrics(self, response):
        """Concatenate the cleaned text of every selector in *response*.

        Despite its name, *response* is an iterable of selectors (see
        ``parseOneMusic``). Each cleaned fragment is followed by one space,
        so a non-empty result ends with a trailing space.
        """
        return "".join(
            self.cleanString(sel.extract()) + " " for sel in response
        )

    def retrieveMusicName(self, response):
        """Return the cleaned ``h1`` text under *response*, or ``''`` if absent.

        *response* is the selector for the title block, not a full response.
        """
        return self.cleanString(
            response.xpath('h1/text()').extract_first(default='')
        )

    def retrieveAuthor(self, response):
        """Return the cleaned ``h2/a`` text under *response*, or ``''`` if absent.

        *response* is the selector for the title block, not a full response.
        """
        return self.cleanString(
            response.xpath('h2/a/text()').extract_first(default='')
        )

    def parseOneMusic(self, response):
        """Yield one ``MusicItem`` built from a song page.

        Pages without the expected title block are skipped with a warning
        instead of raising while extracting the name and author.
        """
        lyrics = self.retrieveLyrics(response.xpath('//div[@class="g-pr g-sp"]/div[@class="cnt-letra p402_premium"]/article/p/text()'))
        header = response.xpath('//div[@class="cnt-head cnt-head--l"]/div[@class="cnt-head_title"]')
        if not header:
            self.logger.warning("No title block found at %s; skipping", response.url)
            return

        item = MusicItem()
        item['name'] = self.retrieveMusicName(header)
        item['author'] = self.retrieveAuthor(header)
        item['lyrics'] = lyrics

        yield item

    def parseOneAuthor(self, response):
        """Yield a request to ``parseOneMusic`` for each song link in the list."""
        for href in response.xpath('//ul[@class="cnt-list"]/li/a[1]/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parseOneMusic)

    def parseQuery(self, response):
        """Follow the first search result to ``parseOneMusic``.

        Returns a single ``scrapy.Request``, or ``None`` when the selector
        matches nothing.
        """
        # Select the link's href and take one string: Request needs a single
        # URL, not a list of serialized <a> elements.
        # NOTE(review): the ``gsc-`` class and the ``tbody`` step look like they
        # were copied from a browser-rendered DOM; presumably they are absent
        # from the raw HTML the spider downloads -- confirm with
        # ``scrapy shell``.
        url = response.css('.gsc-expansionArea > div:nth-child(1) > div:nth-child(1) > table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(2) > div:nth-child(1) > a:nth-child(1)::attr(href)').extract_first()
        self.logger.info("ParseQuery url = {0}".format(url))
        if not url:
            return None
        # Resolve relative links the same way parseOneAuthor does.
        return scrapy.Request(response.urljoin(url), callback=self.parseOneMusic)

    def treatAuthorName(self, authorName):
        """Return *authorName* lower-cased with spaces replaced by hyphens."""
        return authorName.lower().replace(" ", "-")

    def parse(self, response):
        """Log every child ``div`` of the search-results container.

        Nothing is yielded, so this callback produces no items or requests.
        """
        # NOTE(review): the id "cse-search-results" suggests the results are
        # injected client-side by a search widget; if so, the container is
        # empty in the downloaded HTML and this loop never runs -- confirm.
        for href in response.xpath('//div[@class="wrapper"]/div[@id="all"]/div[@id="cnt_top"]/div[@id="res_busca"]/div[@id="resultado"]/div[@class="all"]/div[@id="cse-search-results"]/div'):
            self.logger.info("LoggingParse href = {0}\n".format(href))

    def parse2(self, response):
        """Yield one search request per entry of ``vagalume.json``.

        *response* is not used. Each entry must provide ``"author"`` and
        ``"name"`` keys.
        """
        # Load everything up front so the file is closed before the generator
        # suspends on its first yield.
        with open('vagalume.json') as vagalume_file:
            vagalumeJson = json.load(vagalume_file)

        for vagalumeItem in vagalumeJson:
            # NOTE(review): author/name are interpolated unescaped; a value
            # containing '&' or '#' would alter the query -- consider
            # URL-encoding.
            url = "https://www.letras.mus.br/?q={0} {1}".format(vagalumeItem["author"], vagalumeItem["name"])
            yield scrapy.Request(url, callback=self.parseQuery)



Reply all
Reply to author
Forward
0 new messages