[初學者] 如何把python baidu搜尋結果 改成 google搜尋結果

132 views
Skip to first unread message

Klose Lan

unread,
Jan 16, 2018, 9:37:40 PM1/16/18
to python.tw
小弟因為工作上需要,剛學 python 兩三天,剛好爬到一篇符合目前需求的文章,只是小弟不懂如何把 baidu 改成 google。有沒有大大可以幫忙一下?感激不盡。

小弟有自己嘗試修改過,但看不懂關於關鍵字搜索的那一塊。

#coding:utf-8
import time
import requests
import urllib.parse
from bs4  import BeautifulSoup as BS
import re
from openpyxl import load_workbook
from openpyxl import Workbook
from collections import Counter

# Wall-clock start, used at the end of the script to report total runtime.
start_time = time.time()

# Load the Excel workbook and select the worksheet by its title (both the
# path and the sheet title may be changed to suit the input file).
wb1 = load_workbook(r"C:\Users\Jackal\Desktop\1.xlsx")
# Workbook.get_sheet_by_name() is deprecated in openpyxl; indexing the
# workbook by sheet title is the supported equivalent.
sheet1 = wb1["工作表1"]

# Collect every value of column C as a stripped string, skipping the
# '姓名' header cell.
professors = [
    str(cell.value).strip()
    for cell in sheet1['C']
    if cell.value != '姓名'
]
length = len(professors)

# Prefix of the Baidu search URL; the URL-quoted keyword is appended to it.
# baidu_search() reads this name, so it has to exist at module level.
root_url = 'https://www.baidu.com/s?wd='

# HTTP headers sent with every search request (desktop Chrome user agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/48.0.2564.48 Safari/537.36',
}
nums = ['1', '2', '3', '4', '5', '6', '7', '8', '9']
# Local parts (the text before '@') that are rejected when picking an
# e-mail address out of a search result.
filterlist = ['service', 'webmaster', 'news', 'new', 'fjptwhw', 'support',
              'cnsaiwai', 'PKU_NanoST', 'mysf', 'zjseu', 'chinayznew', 'kf',
              'admin', 'sp', 'wlwnews', 'anhuixinwen', 'xiaoyuan', 'xinwen',
              'fuwu', 'contrib', 'xmt', 'sinamedia', 'rm', 'm8000', 'hnqnw',
              'info', 'zghtbs', 'web', 'jjrbtg', 'zgkyb', 'shihappy', 'pic',
              'cnxwzx', 'gaoling_news', 'emuchnet', 'changjiang', 'leifu',
              'admission', 'office', 'administrator', 'aco', 'andong', 'baike',
              'bgs', 'bravetim', 'buaazhangjun', 'cbtql', 'cksp', 'cs', 'fuhuimin',
              'fxxsqy', 'jdb', 'jiangyue2012', 'jinxin', 'job', 'jsu', 'jubao', 'kefu',
              'kepu', 'kjdbcb', 'kjxb0731-160', 'master', 'maths', 'nchkdxb', 'newmedia',
              'rcb', 'rencaiban', 'rczp', 'renshichu', 'rsc', 'shengxch', 'sjing', 'sla_office',
              'swxrs', 'tougao', 'wanhoo', 'wbb037', 'Webmaster', 'wlwh', 'xcb', 'xiaohuzz', 'xwzx',
              'yjrddh', 'youthhb', 'yx1005', 'zgzxyx', 'zhouh', 'zzbgbb', 'zyang', 'zuaa', '360baike']

# E-mail address matcher. The negative lookaheads reject domains that start
# with 163, 126, qq or vip.
pattern = re.compile(r"\b\w+([.+-]\w+)*@(?!163)(?!126)(?!qq)(?!vip)\w+([.-]\w+)*\.\w+([.-]\w+)*\b", re.M | re.I)
# Matches any single non-ASCII character (used to detect Chinese characters).
regexp = re.compile(r'[^\x00-\x7f]')
def _clean_email(raw):
    """Return the plain-ASCII e-mail address contained in *raw*, or None.

    The word-character class used by ``pattern`` is Unicode-aware, so a
    match can carry Chinese characters glued to either end of the address.
    The text is split on every non-ASCII character and the first piece that
    is itself a complete address is returned.
    """
    for piece in regexp.split(raw):
        if '@' in piece and pattern.fullmatch(piece):
            return piece
    return None


def baidu_search(word):
    """Search for *word* and collect e-mail addresses from the result page.

    The name is combined with several "e-mail" keywords in turn; the first
    keyword whose result page yields at least one acceptable address wins
    and the remaining keywords are not queried.

    Relies on the module-level names ``root_url`` (search URL prefix, which
    must be defined before this function is called), ``headers``,
    ``pattern``, ``regexp`` and ``filterlist``.

    Args:
        word: The name to search for.

    Returns:
        A one-entry dict ``{word: emails}`` where ``emails`` is a non-empty
        list of address strings (duplicates are kept), or ``{word: 'Null'}``
        (the string ``'Null'``) when no address was found.
    """
    # Adjust the search keywords here.
    keywords = [word + suffix for suffix in (" 信箱", " email", " Email", " E-mail")]

    emails = []
    for keyword in keywords:
        url = root_url + urllib.parse.quote(keyword)  # quote() encodes as UTF-8
        r = requests.get(url, headers=headers)
        soup = BS(r.text, 'html.parser')

        for result in soup.find_all('div', class_="result c-container "):
            found = pattern.search(str(result))
            if found is None:
                continue
            # Strip any non-ASCII characters captured around the address
            # before filtering, so the local-part check sees the real name.
            email = _clean_email(found.group())
            if email is None or email.split('@')[0] in filterlist:
                continue
            emails.append(email)
            if len(emails) > 7:
                break

        if emails:
            return {word: emails}

    # No keyword produced an address.
    return {word: 'Null'}

if __name__ == "__main__":
    # Final result: maps each name to the e-mail address chosen for it.
    resultSet = {}
    for name in professors:
        res_dict = baidu_search(name)
        print(res_dict)
        emails = res_dict[name]
        if isinstance(emails, str):
            # baidu_search() returns the placeholder string 'Null' when
            # nothing was found; feeding it to Counter would count its
            # characters and store 'l' instead of the placeholder.
            resultSet[name] = emails
        else:
            # The address that appears most often is taken as the answer.
            resultSet[name] = Counter(emails).most_common(1)[0][0]

    sheet1['F1'].value = '郵箱'
    # Write one result per data row; data rows start at row 2, below the
    # header row.
    for row in range(2, length + 2):
        professor = str(sheet1['C' + str(row)].value).strip()
        sheet1['F' + str(row)].value = resultSet[professor]

    # Save the results back into the same workbook.
    wb1.save(r"C:\Users\Jackal\Desktop\1.xlsx")
    end_time = time.time()
    print("進程共耗時:%d秒" % (end_time - start_time))

Reply all
Reply to author
Forward
0 new messages