[PATCH 1/2] bloom: add open() alongside create(); make ShaBloom private

0 views
Skip to first unread message

Rob Browning

unread,
Mar 2, 2026, 6:24:47 PM (2 days ago) Mar 2
to bup-...@googlegroups.com
Make ShaBloom() private so that we can simplify initialization, having
it only accept an open file (instead of a file or a path), and
then have ShaBloom() take ownwership of the open file. Just get the
filename from the incoming file.name.

Provide a new bloom.create() function to replace the direct calls to
ShaBloom() so that all construction happens via either open() or
create(), which require different arguments.

Improve the handling of the open filter file by protecting it with an
ExitStack during setup.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/bloom.py | 75 +++++++++++++++++++++---------------------
lib/bup/cmd/bloom.py | 9 +++--
lib/bup/git.py | 2 +-
test/int/test_bloom.py | 20 +++++------
4 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/lib/bup/bloom.py b/lib/bup/bloom.py
index f54cc780..e8b90420 100644
--- a/lib/bup/bloom.py
+++ b/lib/bup/bloom.py
@@ -80,11 +80,13 @@ None of this tells us what max_pfalse_positive to choose.
Brandon Low <lost...@lostlogicx.com> 2011-02-04
"""

-import os, math, struct
+from contextlib import ExitStack
+import builtins, os, math, struct

from bup import _helpers
from bup.helpers import (debug1, debug2, log, mmap_read, mmap_readwrite,
mmap_readwrite_private, unlink)
+from bup.io import path_msg as pm


BLOOM_VERSION = 2
@@ -98,24 +100,21 @@ _total_steps = 0
bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add

-# FIXME: check bloom create() and ShaBloom handling/ownership of "f".
-# The ownership semantics should be clarified since the caller needs
-# to know who is responsible for closing it.

-class ShaBloom:
+class _ShaBloom:
"""Wrapper which contains data from multiple index files. """
- def __init__(self, filename, f=None, readwrite=False, expected=-1):
- self.closed = False
- self.name = filename
+ def __init__(self, file, *, readwrite=False, expected=-1):
+ # Takes ownership of file (eventually closing it)
+ assert file.name.endswith(b'.bloom')
self.readwrite = readwrite
- self.file = None
+ self.file = file
self.map = None
- assert(filename.endswith(b'.bloom'))
- if readwrite:
- assert(expected > 0)
- self.file = f = f or open(filename, 'r+b')
- f.seek(0)
-
+ if not readwrite:
+ self.map = mmap_read(self.file)
+ else:
+ assert set(file.mode) == set('r+b'), file.mode
+ assert expected > 0
+ file.seek(0)
# Decide if we want to mmap() the pages as writable ('immediate'
# write) or else map them privately for later writing back to
# the file ('delayed' write). A bloom table's write access
@@ -132,30 +131,25 @@ class ShaBloom:
# one bit flipped per memory page), let's use a "private" mmap,
# which defeats Linux's ability to flush it to disk. Then we'll
# flush it as one big lump during close().
- pages = os.fstat(f.fileno()).st_size // 4096 * 5 # assume k=5
+ pages = os.fstat(file.fileno()).st_size // 4096 * 5 # assume k=5
self.delaywrite = expected > pages
debug1('bloom: delaywrite=%r\n' % self.delaywrite)
if self.delaywrite:
self.map = mmap_readwrite_private(self.file, close=False)
else:
self.map = mmap_readwrite(self.file, close=False)
- else:
- self.file = f or open(filename, 'rb')
- self.map = mmap_read(self.file)
got = self.map[0:4]
if got != b'BLOM':
- log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
+ log(f'Warning: invalid BLOM header ({pm(got)}) in {pm(file.name)}\n')
self._init_failed()
return
ver = struct.unpack('!I', self.map[4:8])[0]
if ver < BLOOM_VERSION:
- log('Warning: ignoring old-style (v%d) bloom %r\n'
- % (ver, filename))
+ log(f'Warning: ignoring old-style (v{ver}) bloom {pm(file.name)}\n')
self._init_failed()
return
if ver > BLOOM_VERSION:
- log('Warning: ignoring too-new (v%d) bloom %r\n'
- % (ver, filename))
+ log(f'Warning: ignoring too-new (v{ver}) bloom {pm(file.name)}\n')
self._init_failed()
return

@@ -238,7 +232,7 @@ class ShaBloom:
return int(self.entries)


-def create(name, expected, delaywrite=None, f=None, k=None):
+def create(name, expected, delaywrite=None, k=None):
"""Create and return a bloom filter for `expected` entries."""
bits = int(math.floor(math.log(expected * MAX_BITS_EACH // 8, 2)))
k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
@@ -246,18 +240,25 @@ def create(name, expected, delaywrite=None, f=None, k=None):
log('bloom: warning, max bits exceeded, non-optimal\n')
bits = MAX_BLOOM_BITS[k]
debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
- f = f or open(name, 'w+b')
- f.write(b'BLOM')
- f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
- assert(f.tell() == 16)
- # NOTE: On some systems this will not extend+zerofill, but it does on
- # darwin, linux, bsd and solaris.
- f.truncate(16+2**bits)
- f.seek(0)
- if delaywrite != None and not delaywrite:
- # tell it to expect very few objects, forcing a direct mmap
- expected = 1
- return ShaBloom(name, f=f, readwrite=True, expected=expected)
+ with ExitStack() as ctx:
+ f = ctx.enter_context(builtins.open(name, 'w+b'))
+ f.write(b'BLOM')
+ f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
+ assert f.tell() == 16
+ # NOTE: On some systems this will not extend+zerofill, but it does on
+ # darwin, linux, bsd and solaris.
+ f.truncate(16+2**bits)
+ f.seek(0)
+ if delaywrite != None and not delaywrite:
+ # tell it to expect very few objects, forcing a direct mmap
+ expected = 1
+ ctx.pop_all()
+ return _ShaBloom(f, readwrite=True, expected=expected)
+
+
+def open(filename, *, readwrite=False, expected=-1):
+ return _ShaBloom(builtins.open(filename, 'r+b' if readwrite else 'rb'),
+ readwrite=readwrite, expected=expected)


def clear_bloom(dir):
diff --git a/lib/bup/cmd/bloom.py b/lib/bup/cmd/bloom.py
index c8ab090e..c437213c 100644
--- a/lib/bup/cmd/bloom.py
+++ b/lib/bup/cmd/bloom.py
@@ -31,7 +31,7 @@ def ruin_bloom(bloomfilename):
log(path_msg(bloomfilename) + '\n')
add_error('bloom: %s not found to ruin\n' % path_msg(bloomfilename))
return
- with bloom.ShaBloom(bloomfilename, readwrite=True, expected=1) as b:
+ with bloom.open(bloomfilename, readwrite=True, expected=1) as b:
b.map[16 : 16 + 2**b.bits] = b'\0' * 2**b.bits


@@ -41,7 +41,7 @@ def check_bloom(path, bloomfilename, idx):
if not os.path.exists(bloomfilename):
log('bloom: %s: does not exist.\n' % path_msg(rbloomfilename))
return
- with bloom.ShaBloom(bloomfilename) as b:
+ with bloom.open(bloomfilename) as b:
if not b.valid():
add_error('bloom: %r is invalid.\n' % path_msg(rbloomfilename))
return
@@ -70,7 +70,7 @@ def do_bloom(path, outfilename, k, force):
b = None
try:
if os.path.exists(outfilename) and not force:
- b = bloom.ShaBloom(outfilename)
+ b = bloom.open(outfilename)
if not b.valid():
debug1("bloom: Existing invalid bloom found, regenerating.\n")
b.close()
@@ -116,8 +116,7 @@ def do_bloom(path, outfilename, k, force):
else:
b, b_tmp = None, b
b_tmp.close()
- b = bloom.ShaBloom(outfilename, readwrite=True,
- expected=add_count)
+ b = bloom.open(outfilename, readwrite=True, expected=add_count)
if b is None: # Need all idxs to build from scratch
add += rest
add_count += rest_count
diff --git a/lib/bup/git.py b/lib/bup/git.py
index 37d38f36..7349be1f 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -712,7 +712,7 @@ class PackIdxList:
new_packs.sort(reverse=True, key=lambda x: len(x))
self.packs = new_packs
if self.bloom is None and os.path.exists(bfull):
- self.bloom = bloom.ShaBloom(bfull)
+ self.bloom = bloom.open(bfull)
try:
if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
self.do_bloom = True
diff --git a/test/int/test_bloom.py b/test/int/test_bloom.py
index c9021037..4a91b110 100644
--- a/test/int/test_bloom.py
+++ b/test/int/test_bloom.py
@@ -1,9 +1,9 @@

-import os
-import errno, platform, tempfile
-import logging
+import errno, logging, os, platform

from bup import bloom
+from bup.helpers import unlink
+

def test_bloom(tmpdir):
hashes = [os.urandom(20) for i in range(100)]
@@ -16,7 +16,7 @@ def test_bloom(tmpdir):
with bloom.create(tmpdir + b'/pybuptest.bloom', expected=100, k=k) as b:
b.add_idx(ix)
assert b.pfalse_positive() < .1
- with bloom.ShaBloom(tmpdir + b'/pybuptest.bloom') as b:
+ with bloom.open(tmpdir + b'/pybuptest.bloom') as b:
all_present = True
for h in hashes:
all_present &= (b.exists(h) or False)
@@ -28,19 +28,17 @@ def test_bloom(tmpdir):
assert false_positives < 10
os.unlink(tmpdir + b'/pybuptest.bloom')

- tf = tempfile.TemporaryFile(dir=tmpdir)
- with bloom.create(b'bup.bloom', f=tf, expected=100) as b:
- assert b.file == tf
+ with bloom.create(b'bup.bloom', expected=100) as b:
+ assert b.file.name == b'bup.bloom'
assert b.k == 5
+ unlink(b'bup.bloom')

# Test large (~1GiB) filter. This may fail on s390 (31-bit
# architecture), and anywhere else where the address space is
# sufficiently limited.
- tf = tempfile.TemporaryFile(dir=tmpdir)
skip_test = False
try:
- with bloom.create(b'bup.bloom', f=tf, expected=2**28,
- delaywrite=False) as b:
+ with bloom.create(b'bup.bloom', expected=2**28, delaywrite=False) as b:
assert b.k == 4
except EnvironmentError as ex:
(ptr_width, linkage) = platform.architecture()
@@ -49,3 +47,5 @@ def test_bloom(tmpdir):
+ str(ex))
else:
raise
+ finally:
+ unlink(b'bup.bloom')
--
2.47.3

Reply all
Reply to author
Forward
0 new messages