Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

Perl5a11d Bug

0 views
Skip to first unread message

Michael H. Coen

unread,
Jul 28, 1994, 6:15:25 PM7/28/94
to
Actually, here's a much simper manisfestation of the bug:

---------------------------------------
File: t/bug2.pl

$_ = "<hi;Larry>";
if (($a, $b) = /<(.+);(.+)?>/) {
print "$a, $b\n";
}
---------------------------------------

---------------------------------------
mhcoen@double-chex>perl5 t/bug2 (outputs nothing)
mhcoen@double-chex>
---------------------------------------
mhcoen@double-chex>perl t/bug2
hi, Larry
---------------------------------------

Any chance of getting a patch for this?

- Michael

Michael H. Coen

unread,
Jul 28, 1994, 6:00:14 PM7/28/94
to
Here's a Perl 5 regular expression bug I just found. The addition of
the ? in the regular expression breaks it:

(Sorry the example is a little complex, but this is the simplest
manifestation of the bug I could find.)

---------------------------------------
File: t/bug2.pl

$_ = "<hi;there;Larry>";

if (($a, $b, $c, $d, $e) = /<([^;]+)(;[^;]+(;(.+)))?>/) {
print "$a, $d\n";
}

---------------------------------------
mhcoen@double-chex>perl5 t/bug2 (outputs nothing)
mhcoen@double-chex>
---------------------------------------
mhcoen@double-chex>perl t/bug2
hi, Larry
---------------------------------------

This is perl, version 5.0, Alpha 11 (unsupported)

$RCSfile: perl.c,v $$Revision: 5.0 $$Date: 92/08/07 18:25:50 $
Patch level: 0


More specifically, this is release alpha 11d.

- Michael

Larry Wall

unread,
Aug 4, 1994, 10:43:32 PM8/4/94
to
In article <319alt...@life.ai.mit.edu> mhc...@double-chex.ai.mit.edu (Michael H. Coen) writes:
: Actually, here's a much simper manisfestation of the bug:

Here's the patch for it. It should be applied to Alpha 12.

I basically had to rewrite the backtracking algorithm, because I was
going about it all wrong. The old code made the boneheaded assumption
that an inner match could only match one way. But in order to allow an
inner match to match multiple ways, you have to let the inner match
check whether the rest of the regexp matches, and that involves going
back up the parse tree by recursing deeper! It gets hairy when you
want to keep track of how many times you've matched at each of several
levels simultaneously, and the inner matches are managing the
backtracking of the outer matches.

I'm not smart enough to be happy about thinking that hard.

Larry

*** /scalpel/lwall/perl5alpha12/regcomp.h Fri Jun 24 21:13:12 1994
--- regcomp.h Thu Aug 4 16:32:23 1994
***************
*** 81,89 ****
#define CLOSE 28 /* num Analogous to OPEN. */
#define MINMOD 29 /* no Next operator is not greedy. */
#define GBOL 30 /* no Matches where last m//g left off. */
! #define IFMATCH 31 /* no Fails if the following matches. */
#define UNLESS 32 /* no Fails if the following matches. */
#define SUCCEED 33 /* no Return from a subroutine, basically. */

/*
* Opcode notes:
--- 81,90 ----
#define CLOSE 28 /* num Analogous to OPEN. */
#define MINMOD 29 /* no Next operator is not greedy. */
#define GBOL 30 /* no Matches where last m//g left off. */
! #define IFMATCH 31 /* no Succeeds if the following matches. */
#define UNLESS 32 /* no Fails if the following matches. */
#define SUCCEED 33 /* no Return from a subroutine, basically. */
+ #define WHILE 34 /* no Do curly processing and see if rest matches. */

/*
* Opcode notes:
***************
*** 110,116 ****
#ifndef DOINIT
extern char regarglen[];
#else
! char regarglen[] = {0,0,0,0,0,0,0,0,0,0,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0};
#endif

#ifndef DOINIT
--- 111,117 ----
#ifndef DOINIT
extern char regarglen[];
#else
! char regarglen[] = {0,0,0,0,0,0,0,0,0,0,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0,0};
#endif

#ifndef DOINIT
***************
*** 149,155 ****
MINMOD,
BOL,
BRANCH,
! END
};
#endif

--- 150,158 ----
MINMOD,
BOL,
BRANCH,
! BRANCH,
! END,
! WHILE
};
#endif

***************
*** 157,163 ****
#ifndef DOINIT
extern char varies[];
#else
! char varies[] = {BRANCH,BACK,STAR,PLUS,CURLY,CURLYX,REF,0};
#endif

/* The following always have a length of 1. */
--- 160,166 ----
#ifndef DOINIT
extern char varies[];
#else
! char varies[] = {BRANCH,BACK,STAR,PLUS,CURLY,CURLYX,REF,WHILE,0};
#endif

/* The following always have a length of 1. */
***************
*** 213,220 ****
--- 216,225 ----

#ifdef REGALIGN
#define NEXTOPER(p) ((p) + 4)
+ #define PREVOPER(p) ((p) - 4)
#else
#define NEXTOPER(p) ((p) + 3)
+ #define PREVOPER(p) ((p) - 3)
#endif

#define MAGIC 0234
*** /scalpel/lwall/perl5alpha12/regcomp.c Wed Aug 3 00:26:17 1994
--- regcomp.c Thu Aug 4 16:36:30 1994
***************
*** 580,586 ****
if ((flags&SIMPLE))
reginsert(CURLY, ret);
else {
! regtail(ret, regnode(SUCCEED));
reginsert(CURLYX,ret);
regtail(ret, regnode(NOTHING));
}
--- 580,586 ----
if ((flags&SIMPLE))
reginsert(CURLY, ret);
else {
! regtail(ret, regnode(WHILE));
reginsert(CURLYX,ret);
regtail(ret, regnode(NOTHING));
}
***************
*** 1480,1487 ****
--- 1480,1493 ----
case UNLESS:
p = "UNLESS";
break;
+ case IFMATCH:
+ p = "IFMATCH";
+ break;
case SUCCEED:
p = "SUCCEED";
+ break;
+ case WHILE:
+ p = "WHILE";
break;
default:
FAIL("corrupted regexp opcode");
*** /scalpel/lwall/perl5alpha12/regexec.c Sun Jul 24 23:26:51 1994
--- regexec.c Thu Aug 4 19:14:57 1994
***************
*** 60,65 ****
--- 60,81 ----
static char* regprogram = 0;
#endif

+ /* Current curly descriptor */
+ typedef struct curcur CURCUR;
+ struct curcur {
+ int parenfloor; /* how far back to strip paren data */
+ int cur; /* how many instances of scan we've matched */
+ int min; /* the minimal number of scans to match */
+ int max; /* the maximal number of scans to match */
+ int minmod; /* whether to work our way up or down */
+ char * scan; /* the thing to match */
+ char * next; /* what has to match after it */
+ char * lastloc; /* where we started matching this scan */
+ CURCUR * oldcc; /* current curly before we started this one */
+ };
+
+ static CURCUR* regcc;
+
typedef I32 CHECKPOINT;

CHECKPOINT
***************
*** 132,138 ****
--- 148,157 ----
register I32 tmp;
I32 minlen = 0; /* must match at least this many chars */
I32 dontbother = 0; /* how many characters not to try at end */
+ CURCUR cc;

+ regcc = &cc;
+
#ifdef DEBUGGING
regnarrate = debug & 512;
regprogram = prog->program;
***************
*** 714,768 ****
*reglastparen = n;
break;
case CURLYX: {
! int parenfloor = *reglastparen;
! ln = ARG1(scan); /* min to match */
! n = ARG2(scan); /* max to match */
! scan = NEXTOPER(scan) + 4;
reginput = locinput;
! n -= ln;
! while (ln-- > 0) {
! if (!regmatch(scan))
! return 0;
! if (reginput == locinput)
! return regmatch(next);
! locinput = reginput;
}
! if (minmod) {
! if (regmatch(next))
! return 1;
! while (n-- > 0) {
! if (!regmatch(scan))
! return 0;
! if (reginput == locinput)
! return regmatch(next);
! if (regmatch(next))
! return 1;
! locinput = reginput;
! }
! return 0;
}
! else {
! CHECKPOINT cp = regcppush(parenfloor);
! for (ln = 0; ln < n; ln++) {
! if (!regmatch(scan))
! break;
! if (reginput == locinput)
! break;
! locinput = reginput;
! regcppush(parenfloor);
! }
! while (ln-- >= 0) {
! reginput = regcppop();
! if (regmatch(next)) {
! regcpblow(cp);
! return 1;
! }
! }
! return 0;
}
- }
- /*NOTREACHED*/

case BRANCH: {
if (OP(next) != BRANCH) /* No choice. */
next = NEXTOPER(scan);/* Avoid recursion. */
--- 733,833 ----
*reglastparen = n;
break;
case CURLYX: {
! CURCUR cc;
! CHECKPOINT cp = savestack_ix;
! cc.oldcc = regcc;
! regcc = &cc;
! cc.parenfloor = *reglastparen;
! cc.cur = -1;
! cc.min = ARG1(scan);
! cc.max = ARG2(scan);
! cc.scan = NEXTOPER(scan) + 4;
! cc.next = next;
! cc.minmod = minmod;
! cc.lastloc = 0;
reginput = locinput;
! n = regmatch(PREVOPER(next)); /* start on the WHILE */
! regcpblow(cp);
! regcc = cc.oldcc;
! return n;
! }
! /* NOT REACHED */
! case WHILE: {
! /*
! * This is really hard to understand, because after we match
! * what we're trying to match, we must make sure the rest of
! * the RE is going to match for sure, and to do that we have
! * to go back UP the parse tree by recursing ever deeper. And
! * if it fails, we have to reset our parent's current state
! * that we can try again after backing off.
! */
!
! CURCUR* cc = regcc;
! n = cc->cur + 1;
! reginput = locinput;
!
! /* If degenerate scan matches "", assume scan done. */
!
! if (locinput == cc->lastloc) {
! regcc = cc->oldcc;
! ln = regcc->cur;
! if (regmatch(cc->next))
! return TRUE;
! regcc->cur = ln;
! regcc = cc;
! return FALSE;
}
!
! /* First just match a string of min scans. */
!
! if (n < cc->min) {
! cc->cur = n;
! cc->lastloc = locinput;
! return regmatch(cc->scan);
}
!
! /* Prefer next over scan for minimal matching. */
!
! if (cc->minmod) {
! regcc = cc->oldcc;
! ln = regcc->cur;
! if (regmatch(cc->next))
! return TRUE; /* All done. */
! regcc->cur = ln;
! regcc = cc;
!
! if (n >= cc->max) /* Maximum greed exceeded? */
! return FALSE;
!
! /* Try scanning more and see if it helps. */
! reginput = locinput;
! cc->cur = n;
! cc->lastloc = locinput;
! return regmatch(cc->scan);
}

+ /* Prefer scan over next for maximal matching. */
+
+ if (n < cc->max) { /* More greed allowed? */
+ regcppush(cc->parenfloor);
+ cc->cur = n;
+ cc->lastloc = locinput;
+ if (regmatch(cc->scan))
+ return TRUE;
+ regcppop(); /* Restore some previous $<digit>s? */
+ reginput = locinput;
+ }
+
+ /* Failed deeper matches of scan, so see if this one works. */
+ regcc = cc->oldcc;
+ ln = regcc->cur;
+ if (regmatch(cc->next))
+ return TRUE;
+ regcc->cur = ln;
+ regcc = cc;
+ return FALSE;
+ }
+ /* NOT REACHED */
case BRANCH: {
if (OP(next) != BRANCH) /* No choice. */
next = NEXTOPER(scan);/* Avoid recursion. */

0 new messages