Python - Scraping myrecipes.com using Scrapy

53 views
Skip to first unread message

hby01

unread,
Jan 22, 2016, 10:47:00 AM1/22/16
to scrapy-users
I am trying to scrape recipe website: myrecipes.com to extract recipe details to be stored in an Sqlite db in my recipe android application. So far, I was able to obtain data regarding recipe ingredients, instructions, servings, nutrients etc... I'm trying to obtain data relating to the time it takes a recipe to be made. The issue is the not all recipes have the same time info., some might include: Total time or prep time or cook time but not always. 

To deal with this I use the following code on the html snippet below:

 # Recipe time
        duration_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div[@class = "field-collection-container clearfix"]')

        for duration_node in duration_nodes:
            try:
                path = duration_node.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div/div[@class = "field-recipe-time"]/div/div/span[1]/text()').extract()
                if path == 'Prep: ':
                    recipe['prep_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
                elif path == 'Cook: ':
                    recipe['cook_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
                elif path == 'Total: ':
                    recipe['total_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
            except:
                continue
     
        


<div class="panel-pane pane-entity-field pane-node-field-recipe-time recipe-time">
  
        <h2 class="pane-title">Recipe Time</h2>
    
  
  <div class="pane-content">
    <div class="field-collection-container clearfix">
  <div class="field-recipe-time">
    <div class="field-collection-view clearfix view-mode-recipe-time">
<div class="recipe-time-info">
  <span class="recipe-time-text">Prep: </span>
  <span class="recipe-time-duration">25 Minutes</span>
</div>
</div>  </div>
  <div class="field-recipe-time">
    <div class="field-collection-view clearfix view-mode-recipe-time field-collection-view-final">
<div class="recipe-time-info">
  <span class="recipe-time-text">Cook: </span>
  <span class="recipe-time-duration">45 Minutes</span>
</div>
</div>  </div>
</div>  </div>

  
  </div>

The full code is:
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.

from myrecipes.items import MyrecipesRecipe, Ingredient, Nutrients

class MyrecipesSpider(CrawlSpider):
    name = "myrecipes" # name of the spider to be used when crawling
    allowed_domains = ["myrecipes.com"] # where the spider is allowed to go

    def parse(self, response):
        sel = Selector(response) # the selector
        recipe = MyrecipesRecipe()

        # Name
        recipe['name'] = sel.xpath("substring-before(//title/text(),' Recipe')").extract()

        # Cuisine
        recipe['cuisine'] = "Indian"
        
        # Ingredients
        ingredients = []
        ingredient_nodes = sel.xpath('//*[@class = "panel-pane pane-entity-field pane-node-field-ingredients"]/div/div')

        for ingredient_node in ingredient_nodes:
            try:
                name = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "name"]/text()').extract()
                quantity = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "amount"]/text()').extract()
            except:
                continue

            ingredient = Ingredient()
            ingredient['name'] = name
            ingredient['quantity'] = quantity
            ingredients.append(ingredient)

        recipe['ingredients'] = ingredients
        
        # Directions
        instructions = []
        instruction_nodes = sel.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]')

        for instruction_node in instruction_nodes:
            try:
                instruction_step = instruction_node.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]/*/text()').extract()
            except:
                continue
            instructions.append(instruction_step)
        
        recipe['instructions'] = instructions
        
        # Nutritional Info
        nutrients = []
        nutrient_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-nutrition-data"]/div/div[@itemprop = "nutrition"]')

        for nutrient_node in nutrient_nodes:
            try:
                name = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains (@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/text()').extract()
                quantity = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains(@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/span/text()').extract()
            except:
                continue
            
            nutrient = Nutrients()
            nutrient['name'] =  name
            nutrient['quantity'] = quantity
            nutrients.append(nutrient)
        nutrient_name = []
        x = nutrients[0].get('name')
        for i in x:
            if i != "\n":
                nutrient_name.append(i)
        nutrients[0]['name'] = nutrient_name
        
        recipe['nutrients'] = nutrients
        
        # Recipe time
        duration_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div[@class = "field-collection-container clearfix"]')

        for duration_node in duration_nodes:
            try:
                path = duration_node.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div/div[@class = "field-recipe-time"]/div/div/span[1]/text()').extract()
                if path == 'Prep: ':
                    recipe['prep_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
                elif path == 'Cook: ':
                    recipe['cook_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
                elif path == 'Total: ':
                    recipe['total_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
            except:
                continue
     
        
        # Number of Servings
        recipe['servings'] = sel.xpath("substring-after(//div[@class = 'panel-pane pane-entity-field pane-node-field-yield']/div[@class = 'pane-content']/div[@itemprop = 'yield']/div[@class = 'field-yield']/text(), ': ')").extract()

        return recipe



When I run the code, the output does not register and I can't figure why. I suspect the issue is with the if-else statements after testing. Any help would be appreciated.



Reply all
Reply to author
Forward
0 new messages