Modified:
trunk/src/esmre.py
trunk/test/test_esmre.py
Log:
Fix issue #10. Extract hints from simple groups and named groups.
(merged from branches/groups)
Modified: trunk/src/esmre.py
==============================================================================
--- trunk/src/esmre.py (original)
+++ trunk/src/esmre.py Wed Oct 1 13:10:36 2008
@@ -2,7 +2,7 @@
# encoding: utf-8
# esmre.py - clue-indexed regular expressions module
-# Copyright (C) 2007 Tideway Systems Limited.
+# Copyright (C) 2007-2008 Tideway Systems Limited.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -22,109 +22,202 @@
import esm
import threading
-def hints(regex):
- hints = [""]
- to_append = ""
+class InBackslashState(object):
+ def __init__(self, parent_state):
+ self.parent_state = parent_state
- group_level = 0
- in_class = False
- in_backslash = False
- in_braces = False
-
- for ch in regex:
- if in_backslash:
- in_backslash = False
-
- elif in_class:
- if ch == "]":
- in_class = False
-
- elif ch == "\\":
- in_backslash = True
-
- else:
- pass
+ def process_byte(self, ch):
+ return self.parent_state
+
+
+class InClassState(object):
+ def __init__(self, parent_state):
+ self.parent_state = parent_state
+
+ def process_byte(self, ch):
+ if ch == "]":
+ return self.parent_state
- elif group_level > 0:
- if ch == ")":
- group_level -= 1
-
- elif ch == "(":
- group_level += 1
-
- elif ch == "[":
- in_class = True
-
- elif ch == "\\":
- in_backslash = True
-
- else:
- pass
+ elif ch == "\\":
+ return InBackslashState(self)
- elif in_braces:
- if ch == "}":
- in_braces = False
-
- else:
- pass
+ else:
+ return self
+
+
+class InBracesState(object):
+ def __init__(self, parent_state):
+ self.parent_state = parent_state
+
+ def process_byte(self, ch):
+ if ch == "}":
+ return self.parent_state
else:
- if ch in "?*":
- to_append = ""
- hints.append("")
-
- elif ch in "+.^$":
- if to_append:
- hints[-1] += to_append
-
- to_append = ""
- hints.append("")
-
- elif ch == "(":
- if to_append:
- hints[-1] += to_append
-
- to_append = ""
- hints.append("")
- group_level += 1
-
- elif ch == "[":
- if to_append:
- hints[-1] += to_append
-
- to_append = ""
- hints.append("")
- in_class = True
-
- elif ch == "{":
- if to_append:
- hints[-1] += to_append[:-1]
-
- to_append = ""
- hints.append("")
- in_braces = True
-
- elif ch == "\\":
- if to_append:
- hints[-1] += to_append
-
- to_append = ""
- hints.append("")
- in_backslash = True
-
- elif ch == "|":
- return []
-
- else:
- if to_append:
- hints[-1] += to_append
-
- to_append = ch
+ return self
+
+
+class CollectingState(object):
+ def __init__(self):
+ self.hints = [""]
+
+ def process_byte(self, ch):
+ self.update_hints(ch)
+ return self.next_state(ch)
+
+ def bank_current_hint_with_last_byte(self):
+ self.hints.append("")
+
+ def bank_current_hint_and_forget_last_byte(self):
+ if isinstance(self.hints[-1], list):
+ del self.hints[-1]
+ else:
+ self.hints[-1] = self.hints[-1][:-1]
+
+ self.hints.append("")
+
+ def forget_all_hints(self):
+ self.hints = [""]
+
+ def append_to_current_hint(self, ch):
+ self.hints[-1] += ch
+
+ def update_hints(self, ch):
+ if ch in "?*{":
+ self.bank_current_hint_and_forget_last_byte()
+
+ elif ch in "+.^$([\\":
+ self.bank_current_hint_with_last_byte()
+
+ elif ch == "|":
+ self.forget_all_hints()
- if to_append:
- hints[-1] += to_append
+ else:
+ self.append_to_current_hint(ch)
+
+ def next_state(self, ch):
+ if ch == "(":
+ return StartOfGroupState(self)
+
+ elif ch == "[":
+ return InClassState(self)
+
+ elif ch == "{":
+ return InBracesState(self)
+
+ elif ch == "\\":
+ return InBackslashState(self)
+
+ elif ch == "|":
+ return self.alternation_state()
+
+ else:
+ return self
+
+ def alternation_state(self):
+ raise NotImplementedError
+
+
+class RootState(CollectingState):
+ def alternation_state(self):
+ raise StopIteration
+
+
+class StartOfGroupState(object):
+ def __init__(self, parent_state):
+ self.parent_state = parent_state
+
+ def process_byte(self, ch):
+ if ch == "?":
+ return StartOfExtensionGroupState(self.parent_state)
+ else:
+ return InGroupState(self.parent_state).process_byte(ch)
+
+
+class InGroupState(CollectingState):
+ def __init__(self, parent_state):
+ CollectingState.__init__(self)
+ self.parent_state = parent_state
+ self.had_alternation = False
+
+ def update_hints(self, ch):
+ if ch == ")":
+ if not self.had_alternation:
+ self.parent_state.hints.append(self.hints)
+ else:
+ CollectingState.update_hints(self, ch)
+
+ def next_state(self, ch):
+ if ch == ")":
+ return self.close_group_state()
+ else:
+ return CollectingState.next_state(self, ch)
+
+ def close_group_state(self):
+ return self.parent_state
+
+ def alternation_state(self):
+ self.had_alternation = True
+ return self
+
+
+class StartOfExtensionGroupState(object):
+ def __init__(self, parent_state):
+ self.parent_state = parent_state
+
+ def process_byte(self, ch):
+ if ch == "P":
+ return MaybeStartOfNamedGroupState(self.parent_state)
+ else:
+ return IgnoredGroupState(self.parent_state).process_byte(ch)
+
+
+class MaybeStartOfNamedGroupState(object):
+ def __init__(self, parent_state):
+ self.parent_state = parent_state
+
+ def process_byte(self, ch):
+ if ch == "<":
+ return InNamedGroupNameState(self.parent_state)
+ else:
+ return IgnoredGroupState(self.parent_state)
+
+
+class InNamedGroupNameState(object):
+ def __init__(self, parent_state):
+ self.parent_state = parent_state
+
+ def process_byte(self, ch):
+ if ch == ">":
+ return InGroupState(self.parent_state)
+ else:
+ return self
+
+
+class IgnoredGroupState(InGroupState):
+ def update_hints(self, ch):
+ pass
+
+
+def hints(regex):
+ state = RootState()
+
+ try:
+ for ch in regex:
+ state = state.process_byte(ch)
+
+ except StopIteration:
+ pass
- return [hint for hint in hints if hint]
+ def flattened(l):
+ for item in l:
+ if isinstance(item, list):
+ for i in flattened(item):
+ yield i
+ else:
+ yield item
+
+ return [hint for hint in flattened(state.hints) if hint]
def shortlist(hints):
Modified: trunk/test/test_esmre.py
==============================================================================
--- trunk/test/test_esmre.py (original)
+++ trunk/test/test_esmre.py Wed Oct 1 13:10:36 2008
@@ -2,7 +2,7 @@
# encoding: utf-8
# esmre_tests.py - tests for esmre module
-# Copyright (C) 2007 Tideway Systems Limited.
+# Copyright (C) 2007-2008 Tideway Systems Limited.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -24,51 +24,51 @@
class HintExtractionTests(unittest.TestCase):
def checkHints(self, expected_hints, regex):
- self.assertEqual(expected_hints, esmre.hints(regex))
+ self.assertEqual(set(expected_hints), set(esmre.hints(regex)))
def testSimpleString(self):
- self.checkHints(["yarr"], "yarr")
+ self.checkHints(["yarr"], r"yarr")
def testSkipsOptionalCharacter(self):
- self.checkHints(["dubloon"], "dubloons?")
+ self.checkHints(["dubloon"], r"dubloons?")
def testStartsNewStringAfterOptionalCharacter(self):
- self.checkHints(["ship", "shape"], "ship ?shape")
+ self.checkHints(["ship", "shape"], r"ship ?shape")
def testSkipsOptionalRepeatedCharacter(self):
- self.checkHints(["bristol", "fasion"], "bristol *fasion")
+ self.checkHints(["bristol", "fasion"], r"bristol *fasion")
def testIncludesRepeatedCharacterButStartsNewHint(self):
self.checkHints(["ava", "st me harties"],
- "ava+st me harties")
+ r"ava+st me harties")
- def testSkipsGroups(self):
+ def testSkipsGroupsWithAlternation(self):
self.checkHints(["Hoist the ", ", ye ", "!"],
- "Hoist the (mizzen mast|main brace), "
- "ye (landlubbers|scurvy dogs)!")
+ r"Hoist the (mizzen mast|main brace), "
+ r"ye (landlubbers|scurvy dogs)!")
def testSkipsAny(self):
self.checkHints(["After 10 paces, ", " marks the spot"],
- "After 10 paces, . marks the spot")
+ r"After 10 paces, . marks the spot")
def testSkipsOneOrMoreAny(self):
self.checkHints(["Hard to ", "!"],
- "Hard to .+!")
+ r"Hard to .+!")
def testSkipsNestedGroups(self):
- self.checkHints(["Squark!"],
- "Squark!( Pieces of (.+)!)")
+ self.checkHints(["Squark!", " Pieces of ", "!"],
+ r"Squark!( Pieces of (.+)!)")
def testSkipsCharacterClass(self):
self.checkHints(["r"],
- "[ya]a*r+")
+ r"[ya]a*r+")
def testRightBracketDoesNotCloseGroupIfInClass(self):
self.checkHints([":=", "X"],
- ":=([)D])X")
+ r":=([)D])X")
def testSkipsBackslashMetacharacters(self):
- self.checkHints(["Cap'n", " "],
+ self.checkHints(["Cap'n", " ", " Beard"],
r"Cap'n\b ([\S] Beard)")
def testBackslashBracketDoesNotCloseGroup(self):
@@ -80,7 +80,7 @@
r":=[)D\]]X")
def testSkipsMetacharactersAfterGroups(self):
- self.checkHints(["Yo ", " and a bottle of rum"],
+ self.checkHints(["Yo ", "ho ", " and a bottle of rum"],
r"Yo (ho )+ and a bottle of rum")
def testSkipsRepetionBraces(self):
@@ -91,7 +91,7 @@
self.checkHints([], r"rum|grog")
def testSkipMatchBeginning(self):
- self.checkHints(["The black perl"], "^The black perl")
+ self.checkHints(["The black perl"], r"^The black perl")
def testSkipMatchEnd(self):
self.checkHints(["Davey Jones' Locker"], r"Davey Jones' Locker$")
@@ -99,6 +99,48 @@
def testOnlyGroupGivesEmptyResult(self):
self.checkHints([], r"(rum|grog)")
+ def testGetsHintsFromGroups(self):
+ self.checkHints(["/"], r"([0-3][0-9]/[0-1][0-9]/[1-2][0-9]{3})")
+
+ def testSkipsOptionalGroups(self):
+ self.checkHints(["Shiver me timbers!"],
+ r"Shiver me timbers!( Arrr!)?")
+
+ def testSkipsMostExtensionGroups(self):
+ for regex in [
+ # set flag
+ r"(?i)(?L)(?m)(?s)(?u)(?x)",
+
+ # non-grouping paren
+ r"(?:foo)",
+
+ # previous named group
+ r"(?P=foo)",
+
+ # comment
+ r"(?#foo)",
+
+ # lookahead
+ r"(?=foo)",
+
+ # negative lookahead
+ r"(?!foo)",
+
+ # lookbehind
+ r"(?<=foo)",
+
+ # negative lookbehind
+ r"(?<!foo)",
+
+ # conditional match
+ r"(?(1)foo|bar)"]:
+
+ self.checkHints([], regex)
+
+ def testGetsHintsFromNamedGroup(self):
+ self.checkHints(
+ ["/"], r"(?P<date>[0-3][0-9]/[0-1][0-9]/[1-2][0-9]{3})")
+
class ShortlistTests(unittest.TestCase):
def checkShortlist(self, expected_shortlist, hints):