How to return "NA" in Python?

1,736 views
Skip to first unread message

Todd Tucker

unread,
Jul 9, 2013, 7:42:39 AM7/9/13
to scrap...@googlegroups.com
Hi: I am new to Scraperwiki and Python, and trying to figure out how to return "NA" or something similar when there is no item on a scraped webpage that meets my cssselect specifications.

In my code below, I am scraping a double-nested set of webpages. When I scrape a sub-page that does not have a value for the cssselect attribute, it simply copies the value of the last scraped page for which there was a value.

Any tips? Thanks! Todd


import scraperwiki
import urlparse
import lxml.html
import urllib
# scrape_table function: gets passed an individual page to scrape
def scrape_table(root):
    rows = root.cssselect("h2") # selects all <h2> blocks, and then we create a dictionary
    record = {}
    
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Title'] = table_cells[0].text_content()
            table_cellsurls = table_cells[0].cssselect("a")
#grab the href=" attribute and put that in 'CaseURL'
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
#creates variable 'caselink'which is a URL adding the href attribute tothe end of the italaw.com/base url            
            caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()
            print caselinkurl
#create another variable containing the scraped contents from that URL
#Turn the webpage string into an lxml object
            caseroots = lxml.html.fromstring(caselinkurl)
# Use Mozilla to identify appropriate or unique case selectors
#put all the certain tags on the page into a list
            ids=caseroots.cssselect("div div div div a")
            #turns out that the data i want is third and second instance. BUT THE PROBLEM I HAVE IS THAT IT COPIES THE PREVIOUS ROW IF NULL.
            for i in ids:
                if len(ids)>=2:
                    record['Rules']=ids[2].text_content()
                    record['Treaty']=ids[3].text_content()
                else:
                    return None
                    #record['Rules']="NA"
                    #record['Treaty']="NA"
                    #pass
                    #print "None"
    # As you can see, i have experimented with different ways of returning nothing.
            pars = caseroots.cssselect("span.'case-doc-details'")


            for par in pars:

                for i in pars:                
                    pars1=pars[0].cssselect("a")
                    if len(pars1)>=0:
                        record['DetailsURL']=pars1[0].attrib.get('href')
                    else:
                        return None
#                        record['DetailsURL']="NA"
                        #pass
                        #print "None"
                 #Create a third level of scrape.
                    caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars1[0].attrib.get('href')).read()
                    print caselinkurl2
                    caseroots2=lxml.html.fromstring(caselinkurl2)
                    pars2=caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                    for i in pars2:
                        if len(pars2)>=0:
                            record['Doc Date']=pars2[0].text_content()
                            #record['Doc Date format']=pars2[0].attrib.get('content')
                        else:
                            #print "None"
                            return None
#                            record ['Doc Date']="NA"
                    pars3=caseroots2.cssselect("div.'field-item even' span.'file' a")
                    for i in pars3:        
                        if len(pars3)>=0:    
                            record['Doc Type Link']=pars3[0].attrib.get('href')
                            record['Doc Type']=pars3[0].text_content()    
                        else:
                            return None

                    pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    for i in pars4:
                        if len(pars4)>=0:

                            record['Claimant Nominee']=pars4[0].text_content()
                        else:
                            return None

                    pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    for i in pars5:
                        if len(pars5)>=0:
                          
                            record['Respondent Nominee']=pars5[0].text_content()
                        else:
   
                            return None
                          
                    pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    for i in pars6:
                        if len(pars6)>=0:
                          
                            record['President']=pars6[0].text_content()
                        else:
                         
                            return None
                         
# Print out the data we've gathered into the record variable, followed by '-----'
            print record, '------------'
# Finally, save the record to the datastore - 'title' is our unique key
            scraperwiki.sqlite.save(["Title"],record)
            
# scrape_and_look_for_next_link function: scrapes page, converts to lxml object then calls the scrape_table function
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)


#START HERE:
# Entry point: the "cases by respondent" index page, listing all cases
# (field_case_respondent_tid=All disables the respondent filter).
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)

Todd Tucker

unread,
Jul 10, 2013, 12:06:43 PM7/10/13
to scrap...@googlegroups.com

Answered my own question here.

For each query that might turn up a null value, use something like this:

        for par in pars:

            pars1 = pars[0].cssselect("a")

            # Test the list we are about to index (pars1), and do it outside
            # any `for i in pars1` loop: if pars1 is empty that loop never
            # runs, so an else branch inside it can never record "None".
            if len(pars1) == 0:
                record['DetailsURL'] = "None"
            else:
                record['DetailsURL'] = pars1[0].attrib.get('href')
Reply all
Reply to author
Forward
0 new messages