# HG changeset patch
# Parent 87844d4d20ef0909234da5958979d50180c4e54c
# NOTE(review): line breaks and indentation were reconstructed from a whitespace-collapsed copy of this patch; verify the context lines against the parent revision before applying.
converted matched blocks to class to make the sample selection algorithm more self-documenting

Every positional access on a block is converted, including the `b[4]` test
in the loop that assembles the sample: `_Block` defines no `__getitem__`,
so leaving that lookup in place would raise TypeError at runtime.

diff -r 87844d4d20ef xappy/highlight.py
--- a/xappy/highlight.py	Tue Aug 03 17:22:49 2010 -0400
+++ b/xappy/highlight.py	Tue Aug 03 21:11:23 2010 -0400
@@ -73,6 +73,20 @@
             self._stemcache[word] = stem
         return stem
 
+
+class _Block(object):
+    """Represent a block of text after by-sentence tokenization.
+
+    Used to match streaks of related words when making a sample.
+    """
+    def __init__(self, start, end, nb_char, nb_term, selected):
+        self.start = start
+        self.end = end
+        self.nb_char = nb_char
+        self.nb_term = nb_term
+        self.selected = selected
+
+
 class Highlighter(object):
     """Class for highlighting text and creating contextual summaries.
 
@@ -189,8 +203,9 @@
         words = self._split_text(text, True)
         self._query_to_stemmed_words(query)
 
-        # build blocks delimited by puncuation, and count matching words in each block
-        # blocks[n] is a block [firstword, endword, charcount, termcount, selected]
+        # build blocks delimited by punctuation, and count matching
+        # words in each block
+
         blocks = []
         start = end = count = blockchars = 0
 
@@ -202,14 +217,14 @@
                 end += 1
             elif words[end] in ',.;:?!\n':
                 end += 1
-                blocks.append([start, end, blockchars, count, False])
+                blocks.append(_Block(start, end, blockchars, count, False))
                 start = end
                 blockchars = 0
                 count = 0
             else:
                 end += 1
         if start != end:
-            blocks.append([start, end, blockchars, count, False])
+            blocks.append(_Block(start, end, blockchars, count, False))
 
         if len(blocks) == 0:
             return ''
@@ -217,9 +232,9 @@
         chars = 0
         for count in xrange(3, -1, -1):
             for b in blocks:
-                if b[3] >= count:
-                    b[4] = True
-                    chars += b[2]
+                if b.nb_term >= count:
+                    b.selected = True
+                    chars += b.nb_char
                     if chars >= maxlen: break
             if chars >= maxlen: break
 
@@ -230,10 +245,10 @@
-                if b[4]:
+                if b.selected:
                     if i != lastblock + 1:
                         words2.append('..')
-                    words2.extend(words[b[0]:b[1]])
+                    words2.extend(words[b.start:b.end])
                     lastblock = i
 
-        if not blocks[-1][4]:
+        if not blocks[-1].selected:
             words2.append('..')
 
         # trim down to maxlen