2009/3/10 张沈鹏 <zsp...@gmail.com>:
> 不多说了
>
> 自己看
>
> 还没完成,有空再整理
>
> 小的词库词不全,缺少不少名人的人名
> 大词库有很多废弃词
>
> 需要merge整理一下
>
> 如果谁有空先做了吧:)
>
> .....
>
> 不过A4的分类词库质量不错
>
> --
> 张沈鹏
> http://zsp.javaeye.com/
>
--
张沈鹏
http://zsp.javaeye.com/
# Walk the Sogou Pinyin dictionary pages with ids begin..end (inclusive) and,
# for every page that offers a plain-text download, save the word list to a
# local file named "<id>.<categories>#<title>.txt".
begin = 290
end = 16219
url_template = "http://pinyin.sogou.com/dict/cell.php?id=%s"
down_link_template = """<a href="download_txt.php?id=%s">"""
txt_link = "http://pinyin.sogou.com/dict/download_txt.php?id=%s"
title_begin = "<title>"
title_end = "-细胞词库-搜狗拼音输入法</title>"
# Markers that delimit the run of category links inside a dictionary page.
# They are loop-invariant, so they are defined once here.
cat_begin_marker = """<a href="list.php">全部</a>"""
cat_end_marker = """<div class="cellbox lib">"""

for num in xrange(begin, end+1):
    print(num)
    url = url_template % num
    html = urlopen(url, timeout=60).read().decode("gb18030", "ignore")

    # Skip pages that carry no text-download link for this id.
    down_link = down_link_template % num
    if html.find(down_link) == -1:
        continue

    # The dictionary title is the text between "<title>" and the fixed
    # site suffix.
    title_start = html.find(title_begin) + len(title_begin)
    title_stop = html.find(title_end)
    title = html[title_start:title_stop]

    # Category names are the texts of the <a> tags between the two markers.
    # Dedicated local names are used so the `begin`/`end` range constants
    # are not overwritten by per-page offsets.
    cat_start = html.find(cat_begin_marker) + len(cat_begin_marker)
    cat_stop = html.find(cat_end_marker)
    cat = [link.string
           for link in BeautifulSoup(html[cat_start:cat_stop]).findAll('a')]

    # Build the download URL from the template instead of rebinding it.
    txt_url = txt_link % num
    print("%s %s" % (num, title))
    print(" ".join(cat))
    txt = urlopen(txt_url, timeout=60).read().decode("gb18030", "ignore")

    # "/" would be treated as a directory separator by open(); the original
    # replace("/", "/") was a no-op, so substitute a safe character.
    # NOTE(review): the original replacement character was probably lost in
    # transit (possibly a full-width slash) - confirm the preferred one.
    filename = ("%s.%s#%s.txt" % (num, "_".join(cat), title))
    filename = filename.encode("utf-8").replace("/", "_")

    # Encode explicitly rather than relying on the process-wide default
    # encoding when writing the decoded text.
    with open(filename, "w") as out:
        out.write(txt.encode("utf-8"))
#coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
begin = 1