# HG changeset patch
# Parent 87844d4d20ef0909234da5958979d50180c4e54c
converted matched blocks to class to make the sample selection algorithm more self-documenting

diff -r 87844d4d20ef xappy/highlight.py
--- a/xappy/highlight.py	Tue Aug 03 17:22:49 2010 -0400
+++ b/xappy/highlight.py	Tue Aug 03 21:11:23 2010 -0400
@@ -73,6 +73,20 @@
             self._stemcache[word] = stem
             return stem
 
+
+class _Block(object):
+    """Represent a block of text after by-sentence tokenization.
+
+    Used to match streaks of related words when making a sample.
+    """
+    def __init__(self, start, end, nb_char, nb_term, selected):
+        self.start = start  # index of the block's first word (slice start)
+        self.end = end  # index one past the block's last word (slice end)
+        self.nb_char = nb_char  # character count of the block
+        self.nb_term = nb_term  # number of query-matching words in the block
+        self.selected = selected  # True once picked for the sample
+
+        
 class Highlighter(object):
     """Class for highlighting text and creating contextual summaries.
 
@@ -189,8 +203,9 @@
         words = self._split_text(text, True)
         self._query_to_stemmed_words(query)
 
-        # build blocks delimited by puncuation, and count matching words in each block
-        # blocks[n] is a block [firstword, endword, charcount, termcount, selected]
+        # build blocks delimited by punctuation, and count matching
+        # words in each block
+
         blocks = []
         start = end = count = blockchars = 0
 
@@ -202,14 +217,14 @@
                 end += 1
             elif words[end] in ',.;:?!\n':
                 end += 1
-                blocks.append([start, end, blockchars, count, False])
+                blocks.append(_Block(start, end, blockchars, count, False))
                 start = end
                 blockchars = 0
                 count = 0
             else:
                 end += 1
         if start != end:
-            blocks.append([start, end, blockchars, count, False])
+            blocks.append(_Block(start, end, blockchars, count, False))
         if len(blocks) == 0:
             return ''
 
@@ -217,9 +232,9 @@
         chars = 0
         for count in xrange(3, -1, -1):
             for b in blocks:
-                if b[3] >= count:
-                    b[4] = True
-                    chars += b[2]
+                if b.nb_term >= count:
+                    b.selected = True
+                    chars += b.nb_char
                     if chars >= maxlen: break
             if chars >= maxlen: break
 
@@ -230,10 +245,10 @@
-            if b[4]:
+            if b.selected:
                 if i != lastblock + 1:
                     words2.append('..')
-                words2.extend(words[b[0]:b[1]])
+                words2.extend(words[b.start:b.end])
                 lastblock = i
 
-        if not blocks[-1][4]:
+        if not blocks[-1].selected:
             words2.append('..')
 
         # trim down to maxlen