试了下 Durus 对象数据库

Jiahua Huang

unread,

Nov 6, 2007, 9:11:41 AM11/6/07

to python. cn

晚饭前跟大妈聊了会 Durus ,
Durus 是 Quixote 团队的作品, 一个类似 ZODB 的轻量级对象数据库.

顺便测试了下 Durus 处理大数据库的情况

##用的倒排索引全文搜索类
class WordIndex:
    '''Simple inverted index for full-text search.

    The index is a mapping ``word -> {uid: frequency}``.  The outer mapping
    and the factory used for the inner per-word mappings are both supplied by
    the caller, so the same code works with plain dicts or with persistent
    containers such as a BTree.
    '''

    def __init__(self, wordDict=None, Dict=dict, commit=None):
        '''Set up the index.

        wordDict -- mapping used as the word index; a fresh dict is created
                    when omitted.
        Dict     -- zero-argument factory for the per-word ``{uid: freq}``
                    mappings (e.g. ``dict`` or a BTree class).
        commit   -- optional callable invoked after every add/delete.
        '''
        # Test against None instead of using ``wordDict or {}``: an empty
        # container handed in by the caller must be kept, and a ``{}``
        # default would be shared by every instance.
        if wordDict is None:
            wordDict = {}
        self.dWordsIndex = wordDict
        self.Dict = Dict
        self.commit = commit

    def addIndex(self, text, uid=None):
        '''Index ``text`` under ``uid`` and return the uid used.

        A falsy ``uid`` is replaced by ``_str2hash(text)``.  The text is
        tokenized with ``getWordFs`` (word -> frequency), and the frequency
        is stored for each word.
        '''
        dWordsIndex = self.dWordsIndex
        uid = uid or _str2hash(text)
        words = getWordFs(text)  # tokenize, with per-word frequency
        for word, f in words.items():
            if word not in dWordsIndex:
                dWordsIndex[word] = self.Dict()
            dWordsIndex[word][uid] = f
        if self.commit:
            self.commit()
        return uid

    def delIndex(self, text, uid=None):
        '''Remove ``uid`` from the entries of every word in ``text``.

        Returns False when tokenizing ``text`` yields nothing, otherwise the
        uid that was removed.  Per-word mappings that become empty are left
        in place.
        '''
        dWordsIndex = self.dWordsIndex
        uid = uid or _str2hash(text)
        words = getWords(text)
        if not words:
            return False
        for word in words:
            if word in dWordsIndex and uid in dWordsIndex[word]:
                del dWordsIndex[word][uid]
        if self.commit:
            self.commit()
        return uid

    def query(self, QueryString):
        '''Placeholder for a query language; not implemented (returns None).

        @TODO:
        '''
        pass

    def searchIndex(self, text):
        '''Search for ``text`` and return the matching uids.

        Returns ``[]`` when the query has no tokens or none of them is in
        the index; otherwise a ``set`` of the uids listed under every query
        word that is present in the index.
        '''
        words = getWords(text)  # tokenize
        if not words:
            return []
        lookup = self.dWordsIndex.get
        # ``get`` yields None for words that were never indexed; those words
        # are skipped rather than forcing an empty result.
        found = [entry for entry in (lookup(word) for word in words)
                 if entry is not None]
        return self._intersectUids(found)

    @staticmethod
    def _intersectUids(dicts):
        '''Intersect the keys of the given mappings.

        Returns ``[]`` for an empty list, otherwise a ``set``.  The mappings
        are processed smallest first so the running result shrinks quickly.
        '''
        if not dicts:
            return []
        # Sort on length only; the mappings themselves are never compared.
        ordered = sorted(dicts, key=len)
        common = set(ordered[0])
        for other in ordered[1:]:
            if not common:
                break  # nothing left to intersect
            common.intersection_update(other)
        return common

    def _dumpIndex(self):
        '''Print the whole index in a dict-literal-like layout (debug aid).'''
        index = self.dWordsIndex
        print('{')
        # Prefer the lazy iteritems() when the container offers it.
        for word, uids in getattr(index, 'iteritems', index.items)():
            print("'%s' : %s," % (word, uids))
        print('}')

class _TextIndex(WordIndex):
    '''Demo of WordIndex: the inverted index plus a ``uid -> text`` store.'''

    def __init__(self, wordDict=None, textDict=None, Dict=dict, commit=None):
        '''Set up the word index and the text store.

        wordDict -- mapping used as the word index (fresh dict if omitted).
        textDict -- mapping ``uid -> text`` (fresh dict if omitted).
        Dict     -- factory for the per-word mappings of the index.
        commit   -- optional callable invoked after modifications.
        '''
        # Resolve the defaults here with ``is None`` so that empty containers
        # passed by the caller are kept and no default object is shared
        # between instances.
        if wordDict is None:
            wordDict = {}
        if textDict is None:
            textDict = {}
        WordIndex.__init__(self, wordDict, Dict, commit)
        self.dTextDict = textDict

    def add(self, text, uid=None):
        '''Store ``text`` and index it; return the uid used.

        A falsy ``uid`` is replaced by ``_str2hash(text)``.
        '''
        uid = uid or _str2hash(text)
        self.dTextDict[uid] = text
        self.addIndex(text, uid)
        return uid

    def delete(self, uid):
        '''Remove the text stored under ``uid`` and its index entries.

        Returns False when ``uid`` is unknown, True otherwise.
        '''
        dTextDict = self.dTextDict
        if uid not in dTextDict:
            return False
        self.delIndex(dTextDict[uid], uid)
        del dTextDict[uid]
        # delIndex() committed before the text itself was removed, so commit
        # once more to persist the removal from the text store.
        if self.commit:
            self.commit()
        return True

    def search(self, text):
        '''Return the uids matching ``text`` (see ``searchIndex``).'''
        return self.searchIndex(text)

    @staticmethod
    def _toUnicode(value):
        '''Decode byte strings as UTF-8 (ignoring errors); pass text through.'''
        if isinstance(value, bytes):
            return value.decode('utf8', 'ignore')
        return value

    def _search(self, text):
        '''Print every match with some context around the first query word.

        Returns False when nothing matches, None otherwise.
        '''
        rev = self.searchIndex(text)
        if not rev:
            return False
        # NOTE(review): ``cutword`` is not imported anywhere in the visible
        # source; presumably it is the tokenizer module -- confirm.
        word1 = self._toUnicode(cutword.cutword(text)[0])
        print(rev)
        for uid in rev:
            print('=== %s ===' % uid)
            body = self._toUnicode(self.dTextDict[uid])
            pos = body.rfind(word1)
            if pos < 0:
                pos = 0  # word not found verbatim: show the start of the text
            print(body[max(0, pos - 120):pos + 120])
            print('')

    def _words4uid(self, uid):
        '''Brute-force scan for comparison: print the words indexed for ``uid``.'''
        index = self.dWordsIndex
        # Prefer the lazy iteritems() when the container offers it.
        pairs = getattr(index, 'iteritems', index.items)()
        hits = ['%s' % word for word, uids in pairs if uid in uids]
        print(' '.join(hits))

    def _uids4word(self, word):
        '''Brute-force scan for comparison: print the uids whose text contains ``word``.'''
        texts = self.dTextDict
        pairs = getattr(texts, 'iteritems', texts.items)()
        hits = ['%s' % uid for uid, text in pairs if word in text]
        print(' '.join(hits))

##连接 Durus 数据库用
def getdb_durus():
    '''Connect to Durus and return a ``_TextIndex`` backed by it.

    Uses ``ClientStorage()`` with its default settings, i.e. it talks to a
    running Durus server.  The root object gets two BTrees, ``'TextDict'``
    and ``'WordsIndex'``, created (and committed) on first use.  The returned
    index builds its per-word mappings as BTrees and commits through the
    connection after every modification.
    '''
    from durus.client_storage import ClientStorage
    from durus.connection import Connection
    from durus import btree

    # Alternative without a server: open the database file directly.
    #   from durus.file_storage import FileStorage
    #   connection = Connection(FileStorage("testIndexWord.durus"))
    connection = Connection(ClientStorage())
    root = connection.get_root()
    Tree = btree.BTree

    def getdb(name):
        '''Return ``root[name]``, creating it as an empty BTree if missing.'''
        if name not in root:
            root[name] = Tree()
            connection.commit()
        return root[name]

    dTextDict = getdb('TextDict')
    dWordsIndex = getdb('WordsIndex')
    return _TextIndex(wordDict=dWordsIndex, textDict=dTextDict,
                      Dict=Tree, commit=connection.commit)

##用下边的生成随机字符串来填充数据库
import random
def _randomword(n=1, m=7):
    '''Return a random word of lowercase ASCII letters.

    The length is drawn with ``random.randrange(n, m)``, so it lies in
    ``[n, m)``; ``n >= m`` raises ValueError from ``randrange``.
    '''
    letters = 'abcdefghijklmnopqrstuvwxyz'
    length = random.randrange(n, m)
    return ''.join([random.choice(letters) for _ in range(length)])

def _randomtext(n=10, m=100):
    '''Return random "English" text: space-separated words of 1-2 letters.

    The number of words is drawn with ``random.randrange(n, m)``; each word
    comes from ``_randomword(1, 3)``.
    '''
    count = random.randrange(n, m)
    return ' '.join([_randomword(1, 3) for _ in range(count)])

def _randzhtext(n=10, m=100):
    '''Return random Chinese text as a UTF-8 encoded byte string.

    The number of characters is drawn with ``random.randrange(n, m)``; each
    character is a random code point in ``[19968, 21000)`` (U+4E00 upward).
    '''
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr() already covers all code points
    length = random.randrange(n, m)
    chars = [to_char(random.randrange(19968, 21000)) for _ in range(length)]
    return u''.join(chars).encode('utf8')

##用这个测试运行时间
import time
def _timeit(_src):
    '''Run ``_src`` once and return the elapsed wall-clock time in seconds.

    ``_src`` is normally a string of Python source.  It is compiled before
    the clock starts and executed in a snapshot of this module's globals, so
    it can use module-level names while its own assignments are discarded.
    Values that are not strings are converted with ``%s`` formatting first.
    A callable may be passed instead; it is called with no arguments.
    '''
    if callable(_src):
        _t0 = time.time()
        _src()
        return time.time() - _t0
    code = compile('%s' % (_src,), '<_timeit>', 'exec')
    # Explicit namespace: exec() cannot create function locals on Python 3,
    # and a copy keeps the snippet from overwriting module globals.
    namespace = dict(globals())
    _t0 = time.time()
    exec(code, namespace)
    return time.time() - _t0

## The snippets below use ``ti``; obtain it from the Durus-backed index.
ti = getdb_durus()
## Fill the database with random English text
## (progress line: counter and uid of each added article)
_timeit("for i in range(300000): "
        "print('%s %s' % (i, ti.add(_randomtext(10, 100), _randomword(2, 11))))")
## Fill the database with random Chinese text
_timeit("for i in range(300000): "
        "print('%s %s' % (i, ti.add(_randzhtext(10, 100), _randzhtext(2, 10))))")

Jiahua Huang

unread,

Nov 6, 2007, 9:18:50 AM11/6/07

to python. cn

插入 59W 条随机字符的文章后 SWAP 占用 500M,
然后客户端被 Linux 杀掉

重新连接,测试搜索

>>> _timeit(ti.search('东东'))
1.5020370483398438e-05
>>> _timeit(ti.search('东东'))
4.0531158447265625e-06

## 下边去掉了 print
>>> _timeit("ti._words4uid('东东')")
0.074419975280761719
>>> _timeit("ti._words4uid('东东')")
0.074203968048095703

>>> _timeit("ti._uids4word('东东')")
1.1149060726165771
>>> _timeit("ti._uids4word('东东')")
1.1026270389556885

Jiahua Huang

unread,

Nov 6, 2007, 9:20:06 AM11/6/07

to python. cn

这时的数据库文件大小为 1.6G

-rw-r--r-- 1 huahua huahua 1.6G 2007-11-06 20:37 testIndexWord.durus

Zoom.Quiet

unread,

Nov 6, 2007, 9:26:43 AM11/6/07

to pyth...@googlegroups.com, python-chinese列表, pyth...@googlegroups.com, zp...@googlegroups.com, cpug-ea...@googlegroups.com

很好，收录在维基:
http://wiki.woodpecker.org.cn/moin/Py4Database/LickDurus

On Nov 6, 2007 10:20 PM, Jiahua Huang <jhuang...@gmail.com> wrote:
> 这时的数据库文件大小为 1.6G
>
> -rw-r--r-- 1 huahua huahua 1.6G 2007-11-06 20:37 testIndexWord.durus
>
>
> >
>

--
'''Time is unimportant, only life important!
过程改进乃是开始催生可促生靠谱的人的组织!
'''http://zoomquiet.org
blog @ http://blog.zoomquiet.org/pyblosxom/
wiki @ http://wiki.woodpecker.org.cn/moin/ZoomQuiet
scrap @ http://floss.zoomquiet.org ; http://skm.zoomquiet.org
douban@ http://www.douban.com/people/zoomq/
好看簿 @ http://zoomq.haokanbu.com/
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Pls. usage OOo to replace M$ Office. http://zh.openoffice.org
Pls. usage 7-zip to replace WinRAR/WinZip. http://7-zip.org
You can get the truely Freedom 4 software.

Reply all

Reply to author

Forward