#coding=utf-8
import urllib
from sgmllib import SGMLParser
import urlparse
import socket
import os
socket.setdefaulttimeout(5)
class URLLister(SGMLParser):
'''get all url and add to sequence'''
def reset(self):
self.urls = []
SGMLParser.reset(self)
def start_a(self, attrs):
"""Get url from sequence"""
urlList = [v for k, v in attrs if k=='href']
if urlList:
self.urls.extend(urlList)
def getHTML(self,targetUrl):
"""Get page content"""
sockPage=urllib.urlopen(targetUrl)
HTML=sockPage.read()
sockPage.close()
return HTML
def getUrls (targetUrl):
"""Get url and deal with it ,return """
parser = URLLister();
HTML=parser.getHTML(targetUrl);
parser.feed(HTML);
urlList = parser.urls
parser.close()
urlTup = urlparse.urlparse(targetUrl) #Parse URL
for i in range(len(urlList)):
urlList[i] = addHttp(urlList[i],urlTup)
return urlList
def addHttp(url,urlTup):
"""Convert to complete url """
if url.startswith("http"):return url
rootUrl = urlTup.scheme + "://" + urlTup.netloc
if url.startswith("/"):
fullUrl = rootUrl + url
else:
fullUrl = rootUrl + urlTup.path
return fullUrl
def urlCheck(url):
try:
url_ = urllib.urlopen(url)
except:
return 1
print url , ' ----------- Check OK'
return 0
if __name__ == "__main__":
logfile = r'ErrorLog.log'
icount = 0
urls = getUrls(targetUrl)
if os.path.exists(logfile):
os.remove(logfile)
for url in urls:
#print 'Check ......' ,url
if urlCheck(url):
log_f = open(logfile,'w')
log_f.write(url + ' ----ERROR----')
log_f.close()
icount =+ 1
print url , 'Loading time out ------- ERROR \n'
print '-----------------Finished-----------------'
print '-----------------Sumary-------------------'
print 'Total url : ' , len(urls)
print 'unavaliable url count : ' , icount
print 'Successful url count : ' , len(urls) - icount