URL Check

3 views
Skip to first unread message

Chris

unread,
Jan 1, 2013, 10:16:15 AM1/1/13
to qda_...@googlegroups.com
all:

A small Python script that checks whether the links on a web page can be opened. No big deal:

code:

# coding=utf-8
import urllib
from sgmllib import SGMLParser
import urlparse
import socket
import os
socket.setdefaulttimeout(5)

class URLLister(SGMLParser):  
    '''get all url and add to sequence'''  
    def reset(self): 
        self.urls = []
        SGMLParser.reset(self)        

    def start_a(self, attrs):
        """Get url from sequence"""
        urlList = [v for k, v in attrs if k=='href']
        if urlList:
            self.urls.extend(urlList)
    
    def getHTML(self,targetUrl):
        """Get page content"""
        sockPage=urllib.urlopen(targetUrl) 
        HTML=sockPage.read()  
        sockPage.close()  
        return HTML


def getUrls(targetUrl):
    """Return the links found on the page at targetUrl.

    Downloads the page, collects the href of each <a> tag with URLLister,
    then passes every href through addHttp together with the parsed
    targetUrl and returns the resulting list.
    """
    lister = URLLister()
    lister.feed(lister.getHTML(targetUrl))

    hrefs = lister.urls
    lister.close()

    # Parse the page URL once; it is the base for every link on the page.
    pageParts = urlparse.urlparse(targetUrl)
    # Slice assignment rewrites the same list object in place.
    hrefs[:] = [addHttp(href, pageParts) for href in hrefs]
    return hrefs

def addHttp(url,urlTup):
    """Resolve a link found on a page into a complete URL.

    url    -- href value as it appeared in the page (absolute or relative).
    urlTup -- parsed URL of the page the link came from; its ``scheme``,
              ``netloc`` and ``path`` attributes form the base URL.

    Returns the absolute URL. Absolute links come back unchanged, and links
    with a scheme that cannot be joined (e.g. ``mailto:``) are returned
    as-is.
    """
    # Rebuild the page URL from the attributes this function has always
    # relied on, so any object exposing scheme/netloc/path keeps working.
    baseUrl = urlTup.scheme + "://" + urlTup.netloc + urlTup.path
    # urljoin keeps the relative part of the link (the old code dropped it
    # and returned the page URL itself) and also handles "../x", "//host/x",
    # "?query" and "#fragment" forms.
    return urlparse.urljoin(baseUrl, url)

def urlCheck(url):
    try:
        url_ = urllib.urlopen(url)
    except:
        return 1
    print url , ' -----------  Check OK'
    return 0

if __name__ == "__main__":
    targetUrl = "http://www.cnbeta.com"
    logfile = r'ErrorLog.log'
    icount = 0
    urls = getUrls(targetUrl)
    if os.path.exists(logfile):
        os.remove(logfile)
    for url in urls:
        #print 'Check ......' ,url
        if urlCheck(url):
            log_f = open(logfile,'w')
            log_f.write(url + ' ----ERROR----')
            log_f.close()
            icount =+ 1
            print url , 'Loading time out ------- ERROR \n'
            
    print '-----------------Finished-----------------'
    print '-----------------Sumary-------------------'
    
    print 'Total url             : ' , len(urls)
    print 'unavaliable url count : ' , icount
    print 'Successful url count  : ' , len(urls) - icount




Reply all
Reply to author
Forward
0 new messages