...
elif re.match("^DATASET:\s*(.+) ", line):
m=re.match("^DATASET:\s*(.+) ", line)
print m.group(1))
which is ugly because of the duplication, but I can't think of a nicer way
of doing this that will allow for a lot of these sorts of cases. Any
suggestions?
--
View this message in context: http://www.nabble.com/Best-way-to-extract-from-regex-in-if-statement-tp22878967p22878967.html
Sent from the Python - python-list mailing list archive at Nabble.com.
How about something like:
# Compile every candidate pattern once, up front, rather than once per line.
your_regexes = [
    re.compile('rx1'),
    re.compile('rx2'),
    # etc....
]

# NOTE: `lines` is not defined in this snippet; the surrounding code has to
# supply the iterable of text lines.
for line in lines:
    # Patterns are tried in list order.
    for rx in your_regexes:
        m = rx.match(line)
        if m:
            # Parenthesised single-argument print works as a Python 2
            # statement and as a Python 3 function call.
            print(m.group(1))
            # Stop at the first matching regex; leave this `break` out to
            # report every regex that matches the line.
            break
Untested, but seems to make sense
hth,
Jon
I've done this in the past with re-to-function pairings:
def action1(matchobj):
    """Print the first captured group of *matchobj*.

    matchobj -- match object produced by a pattern with at least one group.
    """
    # Parenthesised single-argument print works as a Python 2 statement and
    # as a Python 3 function call.
    print(matchobj.group(1))
def action2(matchobj):
    """Print the third captured group of *matchobj*.

    matchobj -- match object produced by a pattern with at least three groups.
    """
    # Parenthesised single-argument print works as a Python 2 statement and
    # as a Python 3 function call.
    print(matchobj.group(3))
# ... other actions to perform
# Ordered (compiled pattern, handler) pairs. A list, not a dict, so the
# order in which patterns are tried is fixed.
searches = [
    (re.compile(PATTERN1), action1),
    (re.compile(PATTERN2), action2),
    # other pattern-to-action pairs
]
# ...
line = ...  # placeholder kept from the original post: the text to examine
for regex, action in searches:
    m = regex.match(line)
    if m:
        # First matching pattern wins: run its handler and stop looking.
        action(m)
        break
else:
    # for/else: this branch runs only when the loop ended without `break`,
    # i.e. none of the patterns matched the line.
    no_match(line)
(note that that's a for/else loop, not an if/else pair)
-tkc
Or in case you want to handle each regexp differently, you can
construct a dict {regexp : callback_function} that picks the right
action depending on which regexp matched. As for how to populate the
dict, if most methods are short expressions, lambda comes in pretty
handy, e.g.
{
rx1: lambda match: match.group(1),
rx2: lambda match: sum(map(int, match.groups())),
...
}
If not, you can combine the handler definition with the mapping update
by using a simple decorator factory such as the following (untested):
def rxhandler(rx, mapping):
    """Return a decorator that registers a function as the handler for *rx*.

    rx      -- regular expression, passed to ``re.compile``.
    mapping -- mutable mapping that receives ``compiled pattern -> function``.

    The decorated function is returned unchanged, so it remains callable
    under its own name.
    """
    # Compile once, when the decorator is created; the compiled pattern is
    # the key stored in the mapping.
    rx = re.compile(rx)

    def deco(func):
        mapping[rx] = func
        return func

    return deco
# Registry filled in by the decorators below: compiled pattern -> handler.
d = {}


# Raw strings keep the regex backslash sequences (\s) away from Python's own
# string-escape processing; the pattern text itself is unchanged.
@rxhandler(r"^DATASET:\s*(.+) ", d)
def handle_dataset(match):
    """Handle a line matched by the DATASET pattern (body elided in the post)."""
    ...


@rxhandler(r"^AUTHORS:\s*(.+) ", d)
def handle_authors(match):
    """Handle a line matched by the AUTHORS pattern (body elided in the post)."""
    ...
HTH,
George
Sometimes I like to make a special class that saves the result:
class Reg(object):  # illustrative code, not tested
    """Remember the result of the most recent ``re.match`` call.

    This lets a match be tested inside an ``if``/``elif`` condition and then
    read back afterwards through the ``result`` attribute.
    """

    def __init__(self):
        # Defined from the start so ``result`` can be read before any
        # match() call; None is also what a failed match stores.
        self.result = None

    def match(self, pattern, line):
        """Match *pattern* at the start of *line* and store the outcome.

        Returns the match object, or None when the pattern does not match.
        The same value is kept in ``self.result``.
        """
        self.result = re.match(pattern, line)
        return self.result
Then your example would look something like:
save_re = Reg()
....
elif save_re.match("^DATASET:\s*(.+) ", line):
print save_re.result.group(1)
One word of caution: dicts are unsorted, so if more than one
regexp can match a given line, they either need to map to the
same function, or you need to use a list of regexp-to-functions
(see my previous post) for a determinate order.
-tkc
Since this idiom makes repeated references to the input line, I added
that to the constructor of the matching class.
By using __call__, I made the created object callable, taking the RE
expression as its lone argument and returning a boolean indicating
match success or failure. The result of the re.match call is saved in
self.matchresult.
By using __getattr__, the created object proxies for the results of
the re.match call.
I think the resulting code looks pretty close to the original C or
Perl idiom of cascading "elif (c=re_expr_match("..."))" blocks.
(I thought about caching previously seen REs, or adding support for
compiled REs instead of just strings - after all, this idiom usually
occurs in a loop while iterating over some large body of text. It turns
out that the re module already caches previously compiled REs, so I
left my caching out in favor of that already being done in the std
lib.)
-- Paul
import re
class REmatcher(object):
    """Callable regex tester bound to a single line of text.

    Calling the instance with a regular expression runs ``re.match`` against
    the stored line and returns True or False. The match object of the most
    recent call is kept in ``matchresult``, and attributes the instance does
    not have itself (``group``, ``groups``, ...) are looked up on it.
    """

    def __init__(self, sourceline):
        """Store *sourceline*, the text every later call is matched against."""
        self.line = sourceline
        # Initialise the match state up front. __getattr__ reads
        # self.matchresult, so if that attribute were missing the lookup
        # would re-enter __getattr__ and recurse without end.
        self.matchresult = None
        self.success = False

    def __call__(self, regexp):
        """Match *regexp* at the start of the stored line.

        Returns True when the pattern matches, False otherwise. The outcome
        is also recorded in ``matchresult`` and ``success``.
        """
        self.matchresult = re.match(regexp, self.line)
        self.success = self.matchresult is not None
        return self.success

    def __getattr__(self, attr):
        """Proxy attributes not found on the instance to the last match object.

        Raises AttributeError when there is no successful match to proxy to.
        """
        if attr == "matchresult":
            # Only reachable when the instance was created without running
            # __init__; fail cleanly instead of recursing.
            raise AttributeError(attr)
        if self.matchresult is None:
            raise AttributeError(
                "no successful match available to provide %r" % (attr,))
        return getattr(self.matchresult, attr)
This test:
# Sample input: one candidate string per line.
test = """\
ABC
123
xyzzy
Holy Hand Grenade
Take the pebble from my hand, Grasshopper
"""

outfmt = "'%s' is %s [%s]"
for line in test.splitlines():
    # One matcher per line; every branch below re-tests that same line.
    matchexpr = REmatcher(line)
    # In each branch, matchexpr.group() reads the match object stored by the
    # call in that branch's own condition, which has just succeeded.
    # Parenthesised single-argument print works on Python 2 and Python 3.
    if matchexpr(r"\d+$"):
        print(outfmt % (line, "numeric", matchexpr.group()))
    elif matchexpr(r"[a-z]+$"):
        print(outfmt % (line, "lowercase", matchexpr.group()))
    elif matchexpr(r"[A-Z]+$"):
        print(outfmt % (line, "uppercase", matchexpr.group()))
    elif matchexpr(r"([A-Z][a-z]*)(\s[A-Z][a-z]*)*$"):
        print(outfmt % (line, "a proper word or phrase",
                        matchexpr.group()))
    else:
        print(outfmt % (line, "something completely different", "..."))
Produces:
'ABC' is uppercase [ABC]
'123' is numeric [123]
'xyzzy' is lowercase [xyzzy]
'Holy Hand Grenade' is a proper word or phrase [Holy Hand Grenade]
'Take the pebble from my hand, Grasshopper' is something completely different [...]
That is quite similar to the one I use...
"""
Matcher class encapsulating a call to re.search for ease of use in conditionals.
"""
import re
class Matcher(object):
    r"""
    Matcher class

    Remembers the result of the most recent re.search so that it can be
    tested in a condition and its groups read back afterwards.

    m = Matcher()
    if m.search(r'add (\d+) (\d+)', line):
        do_add(m[0], m[1])
    elif m.search(r'mult (\d+) (\d+)', line):
        do_mult(m[0], m[1])
    elif m.search(r'help (\w+)', line):
        show_help(m[0])
    """

    def __init__(self):
        """
        Start with no search result.
        """
        # Defined up front so the attribute exists before the first search;
        # None is also what a failed search stores.
        self.value = None

    def search(self, r, s):
        """
        Do a regular expression search for r in s and return the match
        object, or None if it did not match.  The result is kept in
        self.value.
        """
        self.value = re.search(r, s)
        return self.value

    def __getitem__(self, n):
        """
        Return n'th matched () item.

        Items are counted from zero, so the first matched item is
        matcher[0].  n is shifted by one before being passed to the match
        object's group(), which means matcher[-1] is group(0), the whole
        match.  Raises AttributeError if the last search did not match.
        """
        return self.value.group(n + 1)

    def groups(self):
        """
        Return all the matched () items.

        Raises AttributeError if the last search did not match.
        """
        return self.value.groups()
--
Nick Craig-Wood <ni...@craig-wood.com> -- http://www.craig-wood.com/nick