* Changed simplified index-of to use correct BMH algorithm, for a theoretical better performance.

0 views
Skip to first unread message

l...@chromium.org

unread,
Dec 17, 2008, 3:56:41 AM12/17/08
to erik....@gmail.com, v8-...@googlegroups.com
Reviewers: Erik Corry,

Message:
Small review.

Description:
Minor changes, mostly cosmetic, in string search.

Please review this at http://codereview.chromium.org/14505

Affected files:
M src/runtime.cc


Index: src/runtime.cc
diff --git a/src/runtime.cc b/src/runtime.cc
index
24b19047eb564ff6b4c1842ed99dba46bb9112f2..b9f1f48ef2138a4179b5436b4572e70d851e6e49
100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -1151,15 +1151,14 @@ static inline int CharOccurence(int char_code) {
return bad_char_occurence[char_code % kBMAlphabetSize];
}

-// Restricted simplified Boyer-Moore string matching. Restricts tables to a
-// suffix of long pattern strings and handles only equivalence classes
-// of the full alphabet. This allows us to ensure that tables take only
-// a fixed amount of space.
+// Restricted simplified Boyer-Moore string matching.
+// Uses only the bad-shift table of Boyer-Moore and only uses it
+// for the character compared to the last character of the needle.
template <typename schar, typename pchar>
-static int BoyerMooreSimplified(Vector<const schar> subject,
- Vector<const pchar> pattern,
- int start_index,
- bool* complete) {
+static int BoyerMooreHorsepool(Vector<const schar> subject,
+ Vector<const pchar> pattern,
+ int start_index,
+ bool* complete) {
int n = subject.length();
int m = pattern.length();
// Only preprocess at most kBMMaxShift last characters of pattern.
@@ -1170,6 +1169,7 @@ static int BoyerMooreSimplified(Vector<const schar>
subject,
int badness = -m; // How bad we are doing without a good-suffix table.
int idx; // No matches found prior to this index.
pchar last_char = pattern[m - 1];
+ int last_char_shift = m - 1 - CharOccurence<schar, pchar>(last_char);
// Perform search
for (idx = start_index; idx <= n - m;) {
int j = m - 1;
@@ -1185,19 +1185,17 @@ static int BoyerMooreSimplified(Vector<const schar>
subject,
}
}
j--;
- while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
+ while (j >= 0 && pattern[j] == (subject[idx + j])) j--;
if (j < 0) {
*complete = true;
return idx;
} else {
- int bc_occ = CharOccurence<schar, pchar>(c);
- int shift = bc_occ < j ? j - bc_occ : 1;
- idx += shift;
+ idx += last_char_shift;
// Badness increases by the number of characters we have
// checked, and decreases by the number of characters we
// can skip by shifting. It's a measure of how we are doing
// compared to reading each character exactly once.
- badness += (m - j) - shift;
+ badness += (m - j) - last_char_shift;
if (badness > 0) {
*complete = false;
return idx;
@@ -1237,12 +1235,15 @@ static int BoyerMooreIndexOf(Vector<const schar>
subject,
return idx;
} else if (j < start) {
// we have matched more than our tables allow us to be smart about.
- idx += 1;
+ // Fall back on BMH shift.
+ idx += m - 1 - CharOccurence<schar, pchar>(last_char);
} else {
int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.
int bc_occ = CharOccurence<schar, pchar>(c);
int shift = j - bc_occ; // Bad-char shift.
- shift = (gs_shift > shift) ? gs_shift : shift;
+ if (gs_shift > shift) {
+ shift = gs_shift;
+ }
idx += shift;
}
} while (idx <= n - m);
@@ -1286,7 +1287,7 @@ static int SimpleIndexOf(Vector<const schar> subject,
badness++;
if (badness > 0) {
*complete = false;
- return (i);
+ return i;
}
if (subject[i] != pattern_first_char) continue;
int j = 1;
@@ -1357,7 +1358,7 @@ static int StringMatchStrategy(Vector<const schar>
sub,
bool complete;
int idx = SimpleIndexOf(sub, pat, start_index, &complete);
if (complete) return idx;
- idx = BoyerMooreSimplified(sub, pat, idx, &complete);
+ idx = BoyerMooreHorsepool(sub, pat, idx, &complete);
if (complete) return idx;
return BoyerMooreIndexOf(sub, pat, idx);
}


erik....@gmail.com

unread,
Dec 17, 2008, 4:07:18 AM12/17/08
to l...@chromium.org, v8-...@googlegroups.com
LGTM


http://codereview.chromium.org/14505/diff/1/2
File src/runtime.cc (right):

http://codereview.chromium.org/14505/diff/1/2#newcode1172
Line 1172: int last_char_shift = m - 1 - CharOccurence<schar,
pchar>(last_char);
Noun. occurence. Common misspelling of occurrence.

http://codereview.chromium.org/14505

Reply all
Reply to author
Forward
0 new messages