Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[PATCH] string_str_index faster, take 2

7 views
Skip to first unread message

Luke Palmer

unread,
Apr 27, 2003, 3:44:16 AM4/27/03
to perl6-i...@perl.org
Alright, here's another string_str_index patch.

This one screams at 2.5 times the speed of Perl 5 for the single byte
encoding. Multibytes are implemented with a stupid, slow algorithm.

If anyone knows of a way to implement the Boyer-Moore algorithm with
multi byte encodings, please tell me or do so. (The only way I can
think of is with a hashtable, which would probably be slower than the
brute force method implemented here.)

This is still only my second patch, so considerations, critiques,
etc. are most welcome.

Luke

? string_str_index.diff
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/string.c,v
retrieving revision 1.124
diff -C4 -r1.124 string.c
*** string.c 16 Apr 2003 05:28:51 -0000 1.124
--- string.c 27 Apr 2003 07:35:51 -0000
***************
*** 316,382 ****
return s->encoding->decode(s->encoding->skip_forward(s->strstart, idx));
}
}

! /* This is a helper function for string_str_index. It is based on the
! * failure function (a lightweight sort of DFA), and runs in O(m + n)
! * instead of O(m n).
*/
static INTVAL
! string_str_index_internal(struct Parrot_Interp *interp, const STRING *str2,
! const STRING *str, UINTVAL start)
{
! Buffer* failure = NULL;
! UINTVAL s, i, t, len, len2;
!
! len = string_length(str);
! len2 = string_length(str2);
!
! /* Check whether we need the failure function */
! for (s = 1; s < len; s++) {
! if (string_ord(str, s) == string_ord(str, 0))
! break;
}

! /* We only allocate the failure buffer if we need it */
! if (s < len) {
! Parrot_block_DOD(interp);
! failure = new_buffer_header(interp);
! Parrot_allocate_zeroed(interp, failure, sizeof(UINTVAL) * (len + 1));
! Parrot_unblock_DOD(interp);
}
!
! /* Compute the rest of the failure function (if we ended up needing it) */
! t = 0;
! for (; s < len; s++) {
! while (t > 0 && string_ord(str, s) != string_ord(str, t))
! t = ((UINTVAL*)failure->bufstart)[t];
! if (string_ord(str, s) == string_ord(str, t)) {
! ((UINTVAL*)failure->bufstart)[s + 1] = ++t;
}
else {
! ((UINTVAL*)failure->bufstart)[s + 1] = 0;
! }
}

! /* Perform match */
! s = 0;
! for (i = start; i < len2; i++) {
! if (s == len)
! break;
! while (s > 0 && string_ord(str2, i) != string_ord(str, s))
! if (failure)
! s = ((UINTVAL*)failure->bufstart)[s];
! else
! s = 0;
! if (string_ord(str2, i) == string_ord(str, s))
! s++;
! }
!
! if (s == len)
! return i - s;
! else
! return -1;
}

/*=for api string string_str_index
* return the character position of s2 in s at or after start
--- 316,419 ----
return s->encoding->decode(s->encoding->skip_forward(s->strstart, idx));
}
}

! /* string_str_index_multibyte: Helper function for string_str_index.
! * This implements a naive substring search, but one that is guaranteed to
! * work for all encodings.
*/
static INTVAL
! string_str_index_multibyte(struct Parrot_Interp *interpreter,
! const STRING *str, const STRING *find, UINTVAL start)
{
! const void* const lastmatch =
! str->encoding->skip_backward((char*)str->strstart + str->strlen,
! find->encoding->characters(find, find->strlen));
! const void* const lastfind = (char*)find->strstart + find->strlen;
! const void* sp;
! const void* fp;
! const void* ip;
! INTVAL pos = start;
!
! sp = str->encoding->skip_forward(str->strstart, start);
! while (sp < lastmatch) {
! fp = find->strstart;
! ip = sp;
!
! for (; fp < lastfind; fp = find->encoding->skip_forward(fp, 1),
! ip = str->encoding->skip_forward(ip, 1)) {
! if (find->encoding->decode(fp) != str->encoding->decode(ip))
! break;
! }
!
! if (fp == lastfind) {
! return pos;
! }
!
! sp = str->encoding->skip_forward(sp, 1);
! pos++;
}

! return -1;
! }
!
! /* string_str_index_singlebyte: Helper function for string_str_index.
! * This is optimized for the simple case where both strings are in
! * encoding_singlebyte. It implements the Boyer-Moore string search
! * algorithm.
! */
! static INTVAL
! string_str_index_singlebyte(struct Parrot_Interp *interpreter,
! const STRING *str, const STRING *find, UINTVAL start)
! {
! const unsigned char* const find_strstart = find->strstart;
! const unsigned char* const str_strstart = str->strstart;
! const UINTVAL find_strlen = find->strlen;
! const UINTVAL str_strlen = str->strlen;
! const unsigned char* const lastmatch =
! str_strstart + str_strlen - find_strlen;
! UINTVAL* p;
! const unsigned char* cp;
! UINTVAL endct, pos;
! UINTVAL badshift[256];
!
! /* Prepare the bad shift buffer */
!
! for (p = &badshift[0] ; p < &badshift[256] ; p++) {
! *p = find_strlen;
}
!
! endct = 1;
! cp = find_strstart + find_strlen - 2;
! for (; cp >= find_strstart ; cp--, endct++) {
! if (endct < badshift[*cp]) {
! badshift[*cp] = endct;
! }
! }
!
! /* Perform the match */
!
! pos = start;
! cp = str_strstart + start;
! while (cp <= lastmatch) {
! register const unsigned char* sp = cp + find_strlen;
! register const unsigned char* fp = find_strstart + find_strlen;
!
! while (fp > find_strstart) {
! if (*--fp != *--sp)
! break;
! }
! if (*fp == *sp) {
! return pos;
}
else {
! register UINTVAL bsi = badshift[*sp];
! cp += bsi;
! pos += bsi;
! }
}

! return -1;
}

/*=for api string string_str_index
* return the character position of s2 in s at or after start
***************
*** 397,410 ****
return -1;
if (!s2 || !string_length(s2))
return -1;

/* if different transcode to s */
if (s->encoding != s2->encoding || s->type != s2->type)
s2 = string_transcode(interpreter, const_cast(s2), s->encoding,
s->type, NULL);

! return string_str_index_internal(interpreter, s, s2, start);
}



--- 434,455 ----
return -1;
if (!s2 || !string_length(s2))
return -1;

+ #if 0
/* if different transcode to s */
if (s->encoding != s2->encoding || s->type != s2->type)
s2 = string_transcode(interpreter, const_cast(s2), s->encoding,
s->type, NULL);
+ #endif

! if (s->encoding->index == enum_encoding_singlebyte &&
! s2->encoding->index == enum_encoding_singlebyte) {
! return string_str_index_singlebyte(interpreter, s, s2, start);
! }
! else {
! return string_str_index_multibyte(interpreter, s, s2, start);
! }
}



Index: t/op/string.t
===================================================================
RCS file: /cvs/public/parrot/t/op/string.t,v
retrieving revision 1.42
diff -C4 -r1.42 string.t
*** t/op/string.t 18 Feb 2003 12:39:21 -0000 1.42
--- t/op/string.t 27 Apr 2003 07:35:51 -0000
***************
*** 1,7 ****
#! perl -w

! use Parrot::Test tests => 99;
use Test::More;

output_is( <<'CODE', <<OUTPUT, "set_s_s|sc" );
set S4, "JAPH\n"
--- 1,7 ----
#! perl -w

! use Parrot::Test tests => 100;
use Test::More;

output_is( <<'CODE', <<OUTPUT, "set_s_s|sc" );
set S4, "JAPH\n"
***************
*** 1232,1239 ****
--- 1232,1263 ----
end
CODE
0
1234
+ -1
+ OUTPUT
+
+ output_is(<<'CODE',<<OUTPUT,"index, big, hard to match strings");
+ # Builds a 24th iteration fibonacci string (approx. 100K)
+ set S1, "a"
+ set S2, "b"
+ set I0, 0
+ LOOP:
+ set S3, S1
+ concat S1, S2, S3
+ set S2, S3
+ inc I0
+ lt I0, 24, LOOP
+
+ index I1, S1, S2
+ print I1
+ print "\n"
+
+ index I1, S1, S2, 50000
+ print I1
+ print "\n"
+ CODE
+ 46368
-1
OUTPUT

output_is(<<'CODE',<<OUTPUT,"num to string");

Steve Fink

unread,
Apr 27, 2003, 3:57:36 AM4/27/03
to Luke Palmer, perl6-i...@perl.org
On Apr-27, Luke Palmer wrote:
> Alright, here's another string_str_index patch.
>
> This one screams at 2.5 times the speed of Perl 5 for the single byte
> encoding. Multibytes are implemented with a stupid, slow algorithm.
>
> If anyone knows of a way to implement the Boyer-Moore algorithm with
> multi byte encodings, please tell me or do so. (The only way I can
> think of is with a hashtable, which would probably be slower than the
> brute force method implemented here.)
>
> This is still only my second patch, so considerations, critiques,
> etc. are most welcome.

Thanks, applied (mindlessly). The only comment I have is that unified
diffs are nicer (diff -u or cvs diff -u). As for the code itself, I
only verified that it still passes the tests.

0 new messages