[xappy] r607 committed - xappy/datastructures.py,xappy/unittests/calc_hash.py: Add a...

0 views

Skip to first unread message

xa...@googlecode.com

unread,

Feb 24, 2010, 5:28:45 AM2/24/10

to xappy-...@googlegroups.com

Revision: 607
Author: boulton.rj
Date: Wed Feb 24 02:27:13 2010
Log: xappy/datastructures.py,xappy/unittests/calc_hash.py: Add a
ProcessedDocument.remove_term() method, and a brief test of it.
http://code.google.com/p/xappy/source/detail?r=607

Modified:
/trunk/ChangeLog
/trunk/xappy/datastructures.py
/trunk/xappy/unittests/calc_hash.py

=======================================
--- /trunk/ChangeLog Mon Feb 22 16:36:53 2010
+++ /trunk/ChangeLog Wed Feb 24 02:27:13 2010
@@ -1,3 +1,8 @@
+Wed Feb 24 10:26:38 GMT 2010 Richard Boulton <ric...@tartarus.org>
+
+ * xappy/datastructures.py,xappy/unittests/calc_hash.py: Add a
+ ProcessedDocument.remove_term() method, and a brief test of it.
+
Tue Feb 23 00:35:35 GMT 2010 Richard Boulton <ric...@tartarus.org>

* xappy/searchconnection.py: Fix the backwards compatibility
=======================================
--- /trunk/xappy/datastructures.py Wed Dec 30 15:00:21 2009
+++ /trunk/xappy/datastructures.py Wed Feb 24 02:27:13 2010
@@ -211,6 +211,41 @@
for pos in positions:
self._doc.add_posting(prefix + term, pos, 0)

+ def remove_term(self, field, term):
+ """Completely remove a term from the document.
+
+ - `field` is the field to remove the term from.
+ - `term` is the term to remove.
+
+ """
+ prefix = self._fieldmappings.get_prefix(field)
+ if len(term) > 0:
+ # We use the following check, rather than "isupper()" to ensure
+ # that we match the check performed by the queryparser, regardless
+ # of our locale.
+ if ord(term[0]) >= ord('A') and ord(term[0]) <= ord('Z'):
+ prefix = prefix + ':'
+
+ # Note - xapian currently restricts term lengths to about 248
+ # characters - except that zero bytes are encoded in two bytes, so
+ # in practice a term of length 125 characters could be too long.
+ # Xapian will give an error when commit() is called after such
+ # documents have been added to the database.
+ # As a simple workaround, we give an error here for terms over 220
+ # characters, which will catch most occurrences of the error early.
+ #
+ # In future, it might be good to change to a hashing scheme in this
+ # situation (or for terms over, say, 64 characters), where the
+ # characters after position 64 are hashed (we obviously need to do this
+ # hashing at search time, too).
+ if len(prefix + term) > 220:
+ raise errors.IndexerError("Field %r is too long: maximum length "
+ "220 - was %d (%r)" %
+ (field, len(prefix + term),
+ prefix + term))
+
+ self._doc.remove_term(prefix + term)
+
def get_terms(self, field):
"""Get the terms in a given field.

=======================================
--- /trunk/xappy/unittests/calc_hash.py Fri Feb 20 02:40:54 2009
+++ /trunk/xappy/unittests/calc_hash.py Wed Feb 24 02:27:13 2010
@@ -45,10 +45,15 @@
doc.id = "2"
pdoc4 = self.iconn.process(doc)

+ pdoc5 = self.iconn.process(doc)
+ pdoc5.add_term('a', 'some', 1)
+ pdoc5.remove_term('a', 'some')
+
self.assertEqual(len(pdoc1.calc_hash()), 40)
self.assertEqual(pdoc1.calc_hash(), pdoc2.calc_hash())
self.assertNotEqual(pdoc1.calc_hash(), pdoc3.calc_hash())
self.assertNotEqual(pdoc1.calc_hash(), pdoc4.calc_hash())
+ self.assertEqual(pdoc4.calc_hash(), pdoc5.calc_hash())