[PATCH 1/1] Avoid using a local window for splitting

7 views
Skip to first unread message

Zachary Dremann

unread,
Jun 2, 2023, 11:26:26 PM6/2/23
to bup-...@googlegroups.com, Zachary Dremann
The splitting code in bupsplit.c/_hashsplit.c was written as if it had
to handle the case where bytes were fed one by one, but it actually gets
the whole data chunk: there's no need to copy to a local window, when we
have the original data available: we can simply use the passed data to
fetch the byte that is leaving the current window. This should actually
be a fair bit faster.

Additionally, try to use __builtin_ctz for getting extra digest bits if
it is available.

Signed-off-by: Zachary Dremann <dre...@gmail.com>
---
lib/bup/_hashsplit.c | 49 +++++++++++++++++++++-----------------------
lib/bup/bupsplit.c | 20 ++++++++++++++----
lib/bup/bupsplit.h | 46 +++++++++++++++++++++++++++--------------
3 files changed, 70 insertions(+), 45 deletions(-)

On my machine, this changes this command:
`python -m timeit -r 10 -s 'from bup._helpers import HashSplitter' -s
'from os import urandom' -s 'import io' -s 'b = urandom(1000000)' 'files
= [io.BytesIO(b)]' 'h = HashSplitter(files, 14)' 'list(h)'`

from 3.29 msecs to 1.41 msecs

diff --git a/lib/bup/_hashsplit.c b/lib/bup/_hashsplit.c
index ade41318..0abe5ab0 100644
--- a/lib/bup/_hashsplit.c
+++ b/lib/bup/_hashsplit.c
@@ -523,8 +523,9 @@ static size_t HashSplitter_find_offs(unsigned int nbits,
// is ignored).

assert(nbits <= 32);
+ size_t offset = 0;

- PyThreadState *thread_state = PyEval_SaveThread();
+ Py_BEGIN_ALLOW_THREADS

// Compute masks for the two 16-bit rollsum components such that
// (s1_* | s2_*) is the mask for the entire 32-bit value. The
@@ -535,34 +536,30 @@ static size_t HashSplitter_find_offs(unsigned int nbits,
Rollsum r;
rollsum_init(&r);

- size_t count;
- for (count = 0; count < len; count++) {
- rollsum_roll(&r, buf[count]);
-
+ const unsigned char *end = buf + len;
+ const unsigned char *first_window_end = len > BUP_WINDOWSIZE ? buf + BUP_WINDOWSIZE : end;
+ const unsigned char *p;
+ for (p = buf; p < first_window_end; p++) {
+ rollsum_add(&r, 0, *p);
if ((r.s2 & s2_mask) == s2_mask && (r.s1 & s1_mask) == s1_mask) {
- uint32_t rsum = rollsum_digest(&r);
-
- rsum >>= nbits;
- /*
- * See the DESIGN document, the bit counting loop used to
- * be written in a way that shifted rsum *before* checking
- * the lowest bit, make that explicit now so the code is a
- * bit easier to understand.
- */
- rsum >>= 1;
- *extrabits = 0;
- while (rsum & 1) {
- (*extrabits)++;
- rsum >>= 1;
- }
-
- PyEval_RestoreThread(thread_state);
- assert(count < len);
- return count + 1;
+ goto found;
}
}
- PyEval_RestoreThread(thread_state);
- return 0;
+ for (; p < end; p++) {
+ // Safety: If this loop is entered, `first_window_end != end`, which can only occur if `end - start > BUP_WINDOWSIZE`,
+ // in which case, `first_window_end = start + BUP_WINDOWSIZE`, so accessing `p[-BUP_WINDOWSIZE]` is safe.
+ rollsum_add(&r, p[-BUP_WINDOWSIZE], *p);
+ if ((r.s2 & s2_mask) == s2_mask && (r.s1 & s1_mask) == s1_mask) {
+ goto found;
+ }
+ }
+ goto end;
+found:
+ *extrabits = rollsum_extra_digest_bits(rollsum_digest(&r), nbits);
+ offset = (p - buf) + 1;
+end:
+ Py_END_ALLOW_THREADS
+ return offset;
}

static PyObject *HashSplitter_iternext(HashSplitter *self)
diff --git a/lib/bup/bupsplit.c b/lib/bup/bupsplit.c
index add858d9..b884fe1f 100644
--- a/lib/bup/bupsplit.c
+++ b/lib/bup/bupsplit.c
@@ -34,13 +34,25 @@
#include <stdlib.h>
#include <stdio.h>

-uint32_t rollsum_sum(uint8_t *buf, size_t ofs, size_t len)
+
+uint32_t rollsum_sum(const uint8_t *buf, size_t ofs, size_t len)
{
- size_t count;
+ const uint8_t *p;
Rollsum r;
rollsum_init(&r);
- for (count = ofs; count < len; count++)
- rollsum_roll(&r, buf[count]);
+ const uint8_t *start = buf + ofs;
+ const uint8_t *end = buf + len;
+ const uint8_t *first_window_end = end - start > BUP_WINDOWSIZE ? start + BUP_WINDOWSIZE : end;
+
+ for (p = start; p < first_window_end; p++) {
+ rollsum_add(&r, 0, *p);
+ }
+ for (; p < end; p++) {
+ // Safety: If this loop is entered, `first_window_end != end`, which can only occur if `end - start > BUP_WINDOWSIZE`,
+ // in which case, `first_window_end = start + BUP_WINDOWSIZE`, so accessing `p[-BUP_WINDOWSIZE]` is safe.
+ rollsum_add(&r, p[-BUP_WINDOWSIZE], *p);
+ }
+
return rollsum_digest(&r);
}

diff --git a/lib/bup/bupsplit.h b/lib/bup/bupsplit.h
index 5b698d98..32e75951 100644
--- a/lib/bup/bupsplit.h
+++ b/lib/bup/bupsplit.h
@@ -45,16 +45,12 @@

typedef struct {
unsigned s1, s2;
- uint8_t window[BUP_WINDOWSIZE];
- int wofs;
} Rollsum;

static inline void rollsum_init(Rollsum *r)
{
r->s1 = BUP_WINDOWSIZE * ROLLSUM_CHAR_OFFSET;
r->s2 = BUP_WINDOWSIZE * (BUP_WINDOWSIZE-1) * ROLLSUM_CHAR_OFFSET;
- r->wofs = 0;
- memset(r->window, 0, BUP_WINDOWSIZE);
}

// These formulas are based on rollsum.h in the librsync project.
@@ -64,21 +60,41 @@ static inline void rollsum_add(Rollsum *r, uint8_t drop, uint8_t add)
r->s2 += r->s1 - (BUP_WINDOWSIZE * (drop + ROLLSUM_CHAR_OFFSET));
}

-// For some reason, gcc 4.3 (at least) optimizes badly if find_ofs()
-// is static and rollsum_roll is an inline function. Let's use a macro
-// here instead to help out the optimizer.
-#define rollsum_roll(r, ch) do { \
- rollsum_add((r), (r)->window[(r)->wofs], ch); \
- (r)->window[(r)->wofs] = ch; \
- (r)->wofs = ((r)->wofs + 1) % BUP_WINDOWSIZE; \
-} while (0)
-
static inline uint32_t rollsum_digest(Rollsum *r)
{
return (r->s1 << 16) | (r->s2 & 0xffff);
}
-
-uint32_t rollsum_sum(uint8_t *buf, size_t ofs, size_t len);
+
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_ctz)
+#define BUPSLIT_HAS_BUILTIN_CTZ 1
+#endif
+#endif
+
+static inline unsigned rollsum_extra_digest_bits(uint32_t digest, unsigned int nbits)
+{
+ digest >>= nbits;
+ /*
+ * See the DESIGN document, the bit counting loop used to
+ * be written in a way that shifted rsum *before* checking
+ * the lowest bit, make that explicit now so the code is a
+ * bit easier to understand.
+ */
+ digest >>= 1;
+#if BUPSLIT_HAS_BUILTIN_CTZ
+ // SAFETY: `~digest` will never be 0, since we shift in at least 1 zero (which becomes a one)
+ return __builtin_ctz(~digest);
+#else
+ unsigned extra_bits = 0;
+ while (digest & 1) {
+ extra_bits++;
+ digest >>= 1;
+ }
+ return extra_bits;
+#endif
+}
+
+uint32_t rollsum_sum(const uint8_t *buf, size_t ofs, size_t len);
int bupsplit_selftest(void);

#endif /* __BUPSPLIT_H */
--
2.41.0

Johannes Berg

unread,
Jun 5, 2023, 7:16:59 AM6/5/23
to Zachary Dremann, bup-...@googlegroups.com
On Fri, 2023-06-02 at 23:25 -0400, Zachary Dremann wrote:
> The splitting code in bupsplit.c/_hashsplit.c was written as if it had
> to handle the case where bytes were fed one by-one, but it actually gets
> the whole data chunk: there's no need to copy to a local window, when we
> have the original data available: we can simply use the passed data to
> fetch the byte that is leaving the current window. This should actually
> be a fair bit faster.
>
> Additionally, try to use __builtin_ctz for getting extra digest bits if
> it is available.

I think it'd be good to split this. The __builtin_ctz() is certainly
nice; I had even considered it myself but was too lazy to detect its
availability, etc.

The other part I'd need to review more closely. I'd especially be
looking at what happens at input-chunk boundaries, since we also read
the input in pieces (larger ones, but still pieces).

Either way, to me it feels like this shouldn't be a single patch.

johannes

Reply all
Reply to author
Forward
0 new messages