How to return "NA" in Python?

1,736 views
Skip to first unread message

Todd Tucker

unread,
Jul 9, 2013, 7:42:39 AM7/9/13
to scrap...@googlegroups.com
Hi: I am new to Scraperwiki and Python, and trying to figure out how to return "NA" or something similar when there is no item on a scraped webpage that meets my cssselect specifications.

In my code below, I am scraping a double-nested set of webpages. When I scrape a sub-page that does not have a value for the cssselect attribute, it simply copies the value of the last scraped page for which there was a value.

Any tips? Thanks! Todd


import scraperwiki
import urlparse
import lxml.html
import urllib
# scrape_table function: gets passed an individual page to scrape
def scrape_table(root):
    rows = root.cssselect("h2") # selects all <h2> blocks, and then we create a dictionary
    record = {}
    
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Title'] = table_cells[0].text_content()
            table_cellsurls = table_cells[0].cssselect("a")
#grab the href=" attribute and put that in 'CaseURL'
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
#creates variable 'caselink'which is a URL adding the href attribute tothe end of the italaw.com/base url            
            caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()
            print caselinkurl
#create another variable containing the scraped contents from that URL
#Turn the webpage string into an lxml object
            caseroots = lxml.html.fromstring(caselinkurl)
# Use Mozilla to identify appropriate or unique case selectors
#put all the certain tags on the page into a list
            ids=caseroots.cssselect("div div div div a")
            #turns out that the data i want is third and second instance. BUT THE PROBLEM I HAVE IS THAT IT COPIES THE PREVIOUS ROW IF NULL.
            for i in ids:
                if len(ids)>=2:
                    record['Rules']=ids[2].text_content()
                    record['Treaty']=ids[3].text_content()
                else:
                    return None
                    #record['Rules']="NA"
                    #record['Treaty']="NA"
                    #pass
                    #print "None"
    # As you can see, i have experimented with different ways of returning nothing.
            pars = caseroots.cssselect("span.'case-doc-details'")


            for par in pars:

                for i in pars:                
                    pars1=pars[0].cssselect("a")
                    if len(pars1)>=0:
                        record['DetailsURL']=pars1[0].attrib.get('href')
                    else:
                        return None
#                        record['DetailsURL']="NA"
                        #pass
                        #print "None"
                 #Create a third level of scrape.
                    caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars1[0].attrib.get('href')).read()
                    print caselinkurl2
                    caseroots2=lxml.html.fromstring(caselinkurl2)
                    pars2=caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                    for i in pars2:
                        if len(pars2)>=0:
                            record['Doc Date']=pars2[0].text_content()
                            #record['Doc Date format']=pars2[0].attrib.get('content')
                        else:
                            #print "None"
                            return None
#                            record ['Doc Date']="NA"
                    pars3=caseroots2.cssselect("div.'field-item even' span.'file' a")
                    for i in pars3:        
                        if len(pars3)>=0:    
                            record['Doc Type Link']=pars3[0].attrib.get('href')
                            record['Doc Type']=pars3[0].text_content()    
                        else:
                            return None

                    pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    for i in pars4:
                        if len(pars4)>=0:

                            record['Claimant Nominee']=pars4[0].text_content()
                        else:
                            return None

                    pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    for i in pars5:
                        if len(pars5)>=0:
                          
                            record['Respondent Nominee']=pars5[0].text_content()
                        else:
   
                            return None
                          
                    pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    for i in pars6:
                        if len(pars6)>=0:
                          
                            record['President']=pars6[0].text_content()
                        else:
                         
                            return None
                         
# Print out the data we've gathered into the record variable, followed by '-----'
            print record, '------------'
# Finally, save the record to the datastore - 'title' is our unique key
            scraperwiki.sqlite.save(["Title"],record)
            
# scrape_and_look_for_next_link function: scrapes page, converts to lxml object then calls the scrape_table function
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)


#START HERE:
# Entry point: the "cases by respondent" index page, listing all cases
# (field_case_respondent_tid=All disables the respondent filter).
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)

Todd Tucker

unread,
Jul 10, 2013, 12:06:43 PM7/10/13
to scrap...@googlegroups.com

Answered my own question here.

For each query that might turn up a null value, use something like this:

        for par in pars:

            pars1 = pars[0].cssselect("a")

            # Test the list we are about to index (pars1), and do it outside
            # any `for i in pars1` loop: if pars1 is empty that loop never
            # runs, so an else branch inside it can never record "None".
            if len(pars1) == 0:
                record['DetailsURL'] = "None"
            else:
                record['DetailsURL'] = pars1[0].attrib.get('href')
Reply all
Reply to author
Forward
0 new messages