Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

Loop in regex match

5 views
Skip to first unread message

Shmuel Metz

unread,
Feb 5, 2012, 11:13:17 AM2/5/12
to
I have an regex that appears to hang where I would expect it to fail
due to \d not matching a comma. If I remove the \d then the regex
correctly matches the partial string. the match that hangs is at line
452 of the script:

if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? \d )/xi)

The actual script is

extproc perl.exe -SW
# extproc F:\Perl\bin\perl -SW
#!/usr/bin/perl -W

use 5.010;
use Data::Dumper;
use Regexp::Common qw /net URI/;
use Socket;
use strict;
my $decOctetPat = qr/ \d |
[1-9] \d |
1 \d \d |
2 [0-4] \d |
25 [0-5]
/x;
my $IPv4addressPat = qr/ (?:$decOctetPat\.){3} $decOctetPat /x;
my $IPv6h16 = qr/[[:xdigit:]]{1,4}/;
my $IPv6ls32 = qr/ $IPv6h16 \: $IPv6h16 | $IPv4addressPat /x;
my $IPv6AddrPat = qr/ (?: (?: $IPv6h16 \: ){6} $IPv6ls32 ) |
(?: \:\: (?: $IPv6h16 \: ){5} $IPv6ls32 ) |
(?: (?: $IPv6h16 )? \:\: (?: $IPv6h16 \: ){4} $IPv6ls32 ) |
(?: (?: $IPv6h16 \: $IPv6h16 )? \:\: (?: $IPv6h16 \: ){3} $IPv6ls32 ) |
(?: (?: (?: $IPv6h16 \: ){2} $IPv6h16 )? \:\: (?: $IPv6h16 \: ){2} $IPv6ls32 ) |
(?: (?: (?: $IPv6h16 \: ){3} $IPv6h16 )? \:\: $IPv6h16 \: $IPv6ls32 ) |
(?: (?: (?: $IPv6h16 \: ){5} $IPv6h16 )? \:\: $IPv6ls32 ) |
(?: (?: (?: $IPv6h16 \: ){6} $IPv6h16 )? \:\: )
/x;
my $domainPat = qr/[[:alnum:]]+
[[:alnum:]-]*
(?:\. [[:alnum:]]+ [[:alnum:]-]*)*
/x;
my $addressLiteralPat = qr/\[
(?:$IPv4addressPat |
$IPv6AddrPat
)
\]
/x;

my $atextPat = qr"(?:[\w!#\$%&'*+/=?^`{|}~-]+)";
#y $FWS = qr/ (?:[ \t]*\15?\12)? [ \t]+ /x;
my $FWS = qr/ (?:[ \t]*\15?\n)? [ \t]+ /x;
my $atomPat = qr/$atextPat+/x;
my $ctext = '[\x21-\x27\x2A-\x5B\x5D-\x7E]';
my $quotedPairPat = qr/ \\ [\x20-\x7E] /x;

my $commentPat =qr/
\(
(?:$FWS?
(?:$ctext | $quotedPairPat | (?R))
)*
$FWS?
\)
/x;

my $CFWS = qr/
(?: (?:$FWS+ $commentPat)+ $FWS?) |
$FWS
/x;

my $dayPat = qr/$CFWS? (?<DAY>\d{1,2}) $CFWS?/x;

# per RFC 5322 case matters
my $day_of_weekPat = qr/
$CFWS
(?<DAY_OF_WEEK>
Mon |
Tue |
Wed |
Thu |
Fri |
Sat |
Sun
)
$CFWS?
/x;

my $dotStringPat = qr/$atextPat+ (?:\. $atextPat)*/x;
my $dtextPat = '[\x21-\x50\x54-\x7E]';

my $hourPat = qr/(?<HOUR>
(\d\d)
(?(?{$^N > 24})
(*FAIL)
)
)
/x;

my $idLeftPat = qr/$dotStringPat/;
my $LinkPat = qr/TCP | $atomPat/xi;
my $noFoldLiteralPat = qr/\[ $dtextPat* \]/x;
my $idRightPat = qr/$dotStringPat | $noFoldLiteralPat/x;
my $msgIdPat = qr/\s* \< $idLeftPat \@ $idRightPat \> \s*/x;

my $minutePat = qr/(?<MINUTE>
(\d\d)
(?(?{$^N > 59})
(*FAIL)
)
)
/x;

# per RFC 5322 case matters
my $monthPat = qr/
(?<MONTH>
Jan |
Feb |
Mat |
Apr |
May |
Jum |
Jul |
Aug |
Sep |
Oct |
Mov |
Dec
)
/x;

my $obs_zonePat = qr/
CDT
CST
EDT
EST
GMT
MDT
MST
PDT
PST
UT
[A-IK-Za-ik-z]
/x;

my $qtextPat = '[\x20-\x21\x23-\x5B\x5d-\x7E]';
my $QcontentPat = qr/$qtextPat | $quotedPairPat/x;
my $QuotedStringPat = qr/"$QcontentPat*"/;

my $rDNSstat =qr/
\s*
(?:\s* \( may \s be \s forged \) ) |
(?: \s* \( misconfigured \s sender \) ) |
(?: \s* RDNS \s failed)
/x;

# Non-5321 prefix to domain in TCP-INFO
my $RecLocalPat = '(?:(?:IDENT:)?[\w+-]+[\w\.+-]*@)?';

my $secondPat = qr/(?<SECOND>
(\d\d)
(?(?{$^N > 59})
(*FAIL)
)
)
/x;

# Malformed Received headers may have 'RDNS failed' after the IP address
# or a dotted quad without framing []
my $TCPinfoPat =qr/
(?<IP>$addressLiteralPat) (?:\s* RDNS \s failed)? |
(?<IP>$IPv4addressPat) |
(?:$RecLocalPat
(?<RDNS>$domainPat)
$FWS
(?<IP>$addressLiteralPat)
$rDNSstat*
)
/x;

my $time_of_day = qr/$hourPat : $minutePat (?: : $secondPat)?/x;

my $yearPat = qr/$FWS (?<YEAR>\d{2,4}) $FWS/x;

# RFC 5322 semantic constraint not applied in order to match malformed zones.
my $zonePat = qr/$FWS
(?<ZONE>
(?:
(?:
[+-]
\d\d\d\d
) |
$obs_zonePat
)
)
/x;

# RFC 5322 shows spaces in day and year, not here
my $datePat = qr/$dayPat $monthPat $yearPat/x;

# Received: FROM non-5321 tokens seen in the wild
my $Non5321DomainPat = qr/
\. |
$IPv4addressPat |
\d+
/x;

# Malformed Received headers may have a leading hyphen in a
# domain name, a period as a domain name or an address
# literal without TCPINFO. They may also have an IPv4
# address expressed as a hexadecimal, decimal or octal constant.
my $ExtendedDomainPat = qr/
(?:(?<HELO>-?$domainPat) \s (?<IP>$addressLiteralPat)) |
(?:(?<HELO>-?$domainPat) (?:$FWS \( $TCPinfoPat \))?) |
(?<IP>$addressLiteralPat) (?:$FWS \( $TCPinfoPat \))? |
$Non5321DomainPat (?:$FWS \( $TCPinfoPat \))?
/x;

my $localPartPat = qr/$dotStringPat | $QuotedStringPat/x;
my $MailboxPat = qr/(?<LOCAL_PART>$localPartPat) \@ (?<DOMAIN>$domainPat | $addressLiteralPat)/x;

my $protocolPat = qr/SMTP | ESMTP | $atomPat/xi;

# I don't expect to see source routing in the wild
my $RecPathPat = qr/
\<
(?:\@ $domainPat (?:, \@ $domainPat)* :)?
$MailboxPat
\>
/x;

# Can't use $RE{net}{domain} due to malformed domain names
my $RecHELOpat = "(?<HELO>(?:-?$domainPat)|" .
"\\.|" .
"(?:\\[$RE{net}{IPv4}\\])|" .
"$RE{net}{IPv4}|" .
"\\d+)";

# Road Runner Received: FROM
my $RRfromPat = qr/
(?<HELO>$RE{net}{IPv4})
\s+
\(
Forwarded-For:
\s
\[
(?<IP>$RE{net}{IPv4})
\]
\)
/x;

# QMAIL Received: FROM
my $QMfromPat = qr/(?<IP>$RE{net}{IPv4})
\s+
\(
\[
(?<RDNS>$domainPat)
\]
:
\d+
\s+
"
\w+
\s*
\[
(?<HELO>$domainPat)
\]
"
[^)]*
\)';
/x;

my $RecSrcPat = qr/$RecLocalPat
(?<RDNS>$domainPat)?
\s*
\[
(?<IP>$RE{net}{IPv4})
\]
\s*
(?:\(may\sbe\sforged\))?
\s*
(?:\(misconfigured\ssender\))?
\s*
(?:\s*RDNS\sfailed)?
/x;

# The RFC 5321 syntax for From-domain does not allow an address literal without
# TCP-info in parentheses, but Yahoo creates a Stamp in that format.
# Some software puts significant information in comments beyond the
# TCPINFO of the Extended-Domain.
my $RecFromPat = qr/^
FROM
$FWS
(?<FROM>
$ExtendedDomainPat |
(?:
(
\[
(?<IP>$IPv4addressPat)
\]
)
\s*
\(
HELO=$RecHELOpat
\)
) |

(?:(?<RDNS>$domainPat)
\s+
\(
\[
(?<IP>$IPv4addressPat)
\]
\s+
HELO=$RecHELOpat
\)
) |
(?:(?<RDNS>$domainPat)
\s+
\(
HELO
\s
$RecHELOpat
\)
\s+
\(
\[
(?<IP>$IPv4addressPat)
\]
\)
) |
$QMfromPat |
$RRfromPat
)
/xi;

# per RFC 5321 it's CFWS "BY" FWS Extended-Domain
# in the wild it's CFWS "BY" FWS Domain FWS '(' MTA ')'
my $RecByPat = qr!$CFWS
BY
$FWS
(?<BY1>
(?:$domainPat |
\[ $RE{net}{IPv4} \]
)
)
(?:
$FWS
\(
(?<BY2>[\s\w\./-]+)
\)
)?
!xi;
my $RecForPat = qr/$CFWS FOR $FWS (?: $RecPathPat | $MailboxPat)/xi;
my $RecIdPat = qr/$CFWS ID $FWS (?<ID>$atomPat | $msgIdPat)/xi;
my $RecViaPat = qr/$CFWS VIA $FWS (?<LINK>$LinkPat)/xi;

# m$ lookout violates RFC 5321 syntax
my $RecWithMS = qr/Microsoft \s+ (?:ESMTP|SMTP) (?:\s+ Server | SVC\(\d+(?:\.\d+)*\))/xi;
my $RecWithPat = qr/$CFWS
WITH $FWS
(?:
(?:ESMTP|SMTP) |
$RecWithMS |
NNFMP # Yahoo
)
(?:
$FWS
\(
SMTP
[\d\w\.-]*
\)
)?
/xi;

my $RecOptInfo = qr/
(?<VIA>$RecViaPat)?
(?<WITH>$RecWithPat)?
(?:$RecIdPat)?
(?<FOR>$RecForPat)?
/xi;

my $timePat = qr/$time_of_day $zonePat/x;

# RFC 5322 shows spaces in day and year, not here
my $date_timePat = qr/
(?: $day_of_weekPat [,])?
$datePat
$timePat
(?:$CFWS)?
/x;

my $RecPat = qr/^
$RecFromPat
$RecByPat
$RecOptInfo
$CFWS?
\;
# $date_timePat
$datePat
# $timePat
# $time_of_day
$hourPat
/x;

my @testheaders = (<<'EOF1',<<'EOF2',<<'EOF3',<<'EOF4',<<'EOF5',<<'EOF6',<<EOF7,<<EOF8);
from amethyst.nstc.com (majo...@amethyst.nstc.com [207.166.196.179]) by mail.acm.org (8.8.5/8.7.5) with ESMTP id AAA44952 for <Shm...@ACM.Org>; Wed, 6 Jan 1999 00:06:57 -0500
EOF1
(from majordomo@localhost)
by amethyst.nstc.com (8.9.1/8.9.1/nstc.com) id AAA16796
for freemail-outgoing; Wed, 6 Jan 1999 00:26:52 -0500
EOF2
from devel.nacs.net (IDENT:ro...@devel.nacs.net [207.166.192.85])
by amethyst.nstc.com (8.9.1/8.9.1/nstc.com) with ESMTP id AAA16789
for <free...@nstc.com>; Wed, 6 Jan 1999 00:26:49 -0500
EOF3
from relay1.mnsinc.com (relay1.mnsinc.com [206.55.3.25])
by devel.nacs.net (8.8.7/8.8.8) with ESMTP id WAA10733
for <free...@nstc.com>; Tue, 5 Jan 1999 22:44:35 -0500
EOF4
from U86 (u86.os2bbs.com [206.55.10.86])
by relay1.mnsinc.com (8.9.0/8.9.0) with SMTP id WAA29325
for <free...@nstc.com>; Tue, 5 Jan 1999 22:39:39 -0500 (EST)
EOF5
from localhost (localhost [127.0.0.1])
by lincoln-at-leros.patriot.net (Postfix) with ESMTP id 12BBE55E73
for <mari...@patriot.net>; Fri, 27 Jan 2012 09:23:59 -0500 (EST)
EOF6
from mail.acm.org [199.222.69.4] by piglet.toward.com with ESMTP
(SMTPD32-4.06) id AF982F028E; Wed, 06 Jan 1999 00:07:36 EDT
EOF7
from mail.acm.org [199.222.69.4] by piglet.toward.com with ESMTP
(foo) id AF982F028E; Wed, 06 Jan 1999 00:07:36 EDT
EOF8

msg("\n\@testheaders has " . scalar @testheaders . " lines\n");
foreach (@testheaders) {
msg("\n\t --> $_\n");
my $RecTest = qr/
$RecFromPat
$RecByPat?
$RecOptInfo
$CFWS?
\;
(?: $day_of_weekPat [,])?
$datePat
$timePat
(?:$CFWS)?
/xi;
if (/$RecTest/) {
msg("\nMatched \$RecTest\n");
foreach my $key (sort keys %+) {
print STDERR "\$+{$key}=$+{$key}\n";
}
msg("\n");
foreach my $key (sort keys %-) {
print STDERR "\$-{$key}=",grep defined, @{$-{$key}},"\n";
}
} else {
msg("\nDid not match\$RecTest\n");
}
# if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? (?<DAY>\d{1,2}) )/xi) {
# if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? \d{1,2} )/xi) {
if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? \d )/xi) {
msg("\nMatched $1\n");
# msg("\nDumper(\%+):\n");
# msg(Dumper(%+),"\n");
# msg("\nDumper(\%-):\n");
# msg(Dumper(%-),"\n");
foreach my $key (sort keys %+) {
print STDERR "\$+{$key}=$+{$key}\n";
}
msg("\n");
foreach my $key (sort keys %-) {
print STDERR "\$-{$key}=",grep defined, @{$-{$key}},"\n";
}
} else {
msg("\nDid not match\n");
}
msg("\n");
}

sub msg {
print STDERR @_;
}

1;
__END__


--
Shmuel (Seymour J.) Metz, SysProg and JOAT <http://patriot.net/~shmuel>

Unsolicited bulk E-mail subject to legal action. I reserve the
right to publicly post or ridicule any abusive E-mail. Reply to
domain Patriot dot net user shmuel+news to contact me. Do not
reply to spam...@library.lspace.org

Ben Morrow

unread,
Feb 5, 2012, 6:43:29 PM2/5/12
to

Quoth Shmuel (Seymour J.) Metz <spam...@library.lspace.org.invalid>:
> I have an regex that appears to hang where I would expect it to fail
> due to \d not matching a comma. If I remove the \d then the regex
> correctly matches the partial string. the match that hangs is at line
> 452 of the script:

The only usual reason for a regex to hang is because you have nested
quantifiers that require an exponential number of backtracks to check
all possibilities. The standard example would be something like

("a" x 50) =~ /(a+a+)+b/

which will not match, but will do a lot of backtracking before it
decides that. I can't see any obvious case of that in the code you
posted, but you may want to run the program under use re "debug" to get
some idea of where the match is getting stuck. Otherwise, you'll need to
start ripping bits out of the pattern and/or the string until you find
the problem.

If you do find that's the problem, the usual solution (assuming you
can't get rid of the nested quantifiers) is to use (?>) to limit the
backtracking. (You have to be careful, of course, to still allow
backtracking where it is needed.) 5.10 also has '++', '?+' and '*+'
variants of the quantifiers which never backtrack.

Ben

s...@netherlands.com

unread,
Feb 5, 2012, 9:24:33 PM2/5/12
to
On Sun, 05 Feb 2012 11:13:17 -0500, Shmuel (Seymour J.) Metz <spam...@library.lspace.org.invalid> wrote:

>I have an regex that appears to hang where I would expect it to fail
>due to \d not matching a comma. If I remove the \d then the regex
>correctly matches the partial string. the match that hangs is at line
>452 of the script:
>
>if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? \d )/xi)
>

I suspect the massive backtracking is being caused by the
usage of the combination of:

/$RecFromPat $RecByPat? $RecOptInfo/x

wherever it is being used like that.

This means there are underlying real world problems with the construction
(ie; lack of thought) of the core regex's.

I fixed your temporary problems, but will not try to debug the flawed massive
composite regex's in your program.

From the comments in the code, it looks as if you are trying to program to a standard.
That runs more the realm of validation first, data extraction second.
These things are easy to misconstrue when you try to mix and match when it comes to
quantifiers.

I was going to explicitly state the changes you should make, but it was to much work
given the flawwed basis. You should pick out the changes.

Good luck shmuel !
PS. I can donate my services for a donation!
(Donations are negotiable)
PPS. Be careful with (?R), it recurses to the beginning of the whole (composite?) regex,
although, I'm only %90 sure that escapes the enclosing qr// object.

-sln

-----------------
\(
(?:
(?> (?:\\[\s\S] | [^(\\)])+ )
| (?-1)

)*
\)
(?>
$RecFromPat
$RecByPat?
$RecOptInfo
)
$CFWS?
\;
(?: $day_of_weekPat [,])?
$datePat
$timePat
(?:$CFWS)?
/xi;
if (/$RecTest/) {
msg("\nMatched \$RecTest\n");
foreach my $key (sort keys %+) {
print STDERR "\$+{$key}=$+{$key}\n";
}
msg("\n");
foreach my $key (sort keys %-) {
print STDERR "\$-{$key}=",grep defined, @{$-{$key}},"\n";
}
} else {
msg("\nDid not match\$RecTest\n");
}
# if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? (?<DAY>\d{1,2}) )/xi) {
# if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? \d{1,2} )/xi) {
if (/((?> $RecFromPat $RecByPat? $RecOptInfo) \; $CFWS? \d )/xi) {

s...@netherlands.com

unread,
Feb 5, 2012, 9:27:57 PM2/5/12
to
The backtracking is massive.

-sln

Shmuel Metz

unread,
Feb 6, 2012, 10:51:38 AM2/6/12
to
In <udeui79a99fmv3kva...@4ax.com>, on 02/05/2012
at 06:24 PM, s...@netherlands.com said:

>I suspect the massive backtracking is being caused by the usage of
>the combination of:

>/$RecFromPat $RecByPat? $RecOptInfo/x

my $RecOptInfo = qr/
(?<VIA>$RecViaPat)?+
(?<WITH>$RecWithPat)?+
(?:$RecIdPat)?+
(?<FOR>$RecForPat)?+
/xi;

>wherever it is being used like that.

At Ben's suggestion I made some of the matches greedy and that seems
to have solved the problem. $CFWS was indeed one of the regexen I had
to change:

my $CFWS = qr/
(?: (?:$FWS+ $commentPat)++ $FWS?+) |
$FWS
/x;

>I fixed your temporary problems,

It looks like you changed the syntax of comments; \S will match
characters that were excluded from $ctext.

>From the comments in the code, it looks as if you are trying to
>program to a standard.

Theoretically. The Received header fields is described by RFC 5321,
but there's a lot of software that's non-compliant and I need to parse
the header fields that they generate as well as the compliant ones.
That forces this to be an exercise in rapid prototyping :-(

Shmuel Metz

unread,
Feb 6, 2012, 10:55:29 AM2/6/12
to
In <1qg309-...@anubis.morrow.me.uk>, on 02/05/2012
at 11:43 PM, Ben Morrow <b...@morrow.me.uk> said:

>Otherwise, you'll need to start ripping bits out of the pattern
>and/or the string until you find the problem.

I've been able to reproduce the problem using a much smaller pattern,
and I initially tried making things greedy $CFWS, but it was still
hanging

my $FWS = qr/ (?:[ \t]*+\R)? [ \t]++ /x;

my $ctext = '[\x21-\x27\x2A-\x5B\x5D-\x7E]';

my $quotedPairPat = qr/ \\ [\x20-\x7E] /x;

my $commentPat =qr/
\(
(?:$FWS?+
(?:$ctext | $quotedPairPat | (?R))
)*
$FWS?+
\)
/x;

my $CFWS = qr/
(?: (?:$FWS+ $commentPat)++ $FWS?+) |
$FWS
/x;

if (/( \; (?>$CFWS)? \d )/xi)

The other piece I needed was greedy matches in

my $RecOptInfo = qr/
(?<VIA>$RecViaPat)?+
(?<WITH>$RecWithPat)?+
(?:$RecIdPat)?+
(?<FOR>$RecForPat)?+
/xi;

Thanks.
0 new messages