[PATCH 26/75] Include empty top level directories in get --rewrite tests

1 view
Skip to first unread message

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
A previous version of get --rewrite would drop empty top level
directories, a case which was not covered by the existing
tests. Adjust test-rewrite and test_get.py to --strip their saves
so that we can include empty root directories.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
test/ext/test-rewrite | 39 ++++++++++++++++-----------------
test/ext/test_get.py | 51 +++++++++++++++++++++----------------------
2 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/test/ext/test-rewrite b/test/ext/test-rewrite
index 82857942..17e9ccfb 100755
--- a/test/ext/test-rewrite
+++ b/test/ext/test-rewrite
@@ -53,10 +53,10 @@ WVPASS compare "$BUP_DIR" split "$BUP_DIR2" test

WVSTART make multiple saves
WVPASS bup index "$top/test/sampledata"
-WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
-WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
-WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
-WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
+WVPASS bup save -n save --strip "$top/test/sampledata"
+WVPASS bup save -n save --strip "$top/test/sampledata"
+WVPASS bup save -n save --strip "$top/test/sampledata"
+WVPASS bup save -n save --strip "$top/test/sampledata"

WVSTART rewrite to different split
WVPASS bup -d "$BUP_DIR" ls -l save
@@ -92,36 +92,35 @@ bup+()

# force a re-save of the testfile1 to get it w/o size
WVPASS bup index --fake-invalid "$top/test/sampledata/y/testfile1"
-WVPASS bup+ -d "$BUP_DIR" save -n save --strip-path="$top" \
- "$top/test/sampledata"
+WVPASS bup+ -d "$BUP_DIR" save -n save --strip "$top/test/sampledata"

# check that we get the "unknown" size out
-WVPASS bup+ -d "$BUP_DIR" ls -l save/latest/test/sampledata/y/testfile1 |
+WVPASS bup+ -d "$BUP_DIR" ls -l save/latest/y/testfile1 |
WVPASS grep -- -1122334455
# and that augmentation worked
-WVPASS bup -d "$BUP_DIR" ls -l save/latest/test/sampledata/y/testfile1 |
+WVPASS bup -d "$BUP_DIR" ls -l save/latest/y/testfile1 |
WVPASS grep -- 158664

# now rewrite again - and then the size should be correct even without augmentation
WVPASS bup -d "$BUP_DIR4" get --rewrite -s "$BUP_DIR" --append: save save2
-WVPASS bup+ -d "$BUP_DIR4" ls -l save/latest/test/sampledata/y/testfile1 |
+WVPASS bup+ -d "$BUP_DIR4" ls -l save/latest/y/testfile1 |
WVPASS grep -- 158664

# and again for the other kind of splitting
WVPASS bup -d "$BUP_DIR3" get --rewrite -s "$BUP_DIR" --append: save save2
-WVPASS bup+ -d "$BUP_DIR3" ls -l save2/latest/test/sampledata/y/testfile1 |
+WVPASS bup+ -d "$BUP_DIR3" ls -l save2/latest/y/testfile1 |
WVPASS grep -- 158664

WVSTART rewrite with excluded files
WVPASS bup -d "$BUP_DIR5" init
-WVPASS bup -d "$BUP_DIR5" get --rewrite -s "$BUP_DIR4" \
- --exclude-rx '^/test/sampledata/y/' --append save
+WVPASS bup -d "$BUP_DIR5" get --rewrite -s "$BUP_DIR4" --exclude-rx '^/y/' \
+ --append save
WVPASS extract_all "$BUP_DIR4" "save" "orig"
WVPASS extract_all "$BUP_DIR5" "save" "new"
-rm -rf "$tmpdir/restore/orig/"*"/test/sampledata/y/"
-# that rm -rf changed timestamps - just ignore them
-touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/orig/"*"/test/sampledata/"
-touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"*"/test/sampledata/"
+WVPASS rm -rf "$tmpdir/restore/orig/"*"/y/"
+# rm -rf changed timestamps - just ignore them
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/orig/"*"/"
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"*"/"
touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
WVPASS "$top/dev/compare-trees" "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
WVPASS rm -rf "$tmpdir/restore"
@@ -130,13 +129,13 @@ WVSTART "rewrite with excluded files (in repo)"
WVPASS git config -f "$BUP_DIR/config" bup.split.trees true
WVPASS git config -f "$BUP_DIR/config" bup.split.files legacy:14
WVPASS bup -d "$BUP_DIR" get --rewrite -s "$BUP_DIR" \
- --exclude-rx '^/test/sampledata/y/' --append: save save-new
+ --exclude-rx '^/y/' --append: save save-new
WVPASS extract_all "$BUP_DIR" "save" "orig"
WVPASS extract_all "$BUP_DIR" "save-new" "new"
-rm -rf "$tmpdir/restore/orig/"*"/test/sampledata/y/"
+rm -rf "$tmpdir/restore/orig/"*"/y/"
# that rm -rf changed timestamps - just ignore them
-touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/orig/"*"/test/sampledata/"
-touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"*"/test/sampledata/"
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/orig/"*"/"
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"*"/"
touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
WVPASS "$top/dev/compare-trees" "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
WVPASS rm -rf "$tmpdir/restore"
diff --git a/test/ext/test_get.py b/test/ext/test_get.py
index 75206972..4bd60edc 100644
--- a/test/ext/test_get.py
+++ b/test/ext/test_get.py
@@ -389,7 +389,7 @@ def _test_replace(get_disposition, src_info):
for item in (b'.tag/commit-2', b'src/latest', b'src'):
exr = run_get(get_disposition, b'--replace', (item, b'.tag/obj'),
given=ex_ref)
- validate_tagged_save(b'obj', getcwd() + b'/src',
+ validate_tagged_save(b'obj', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out)
verify_only_refs(heads=[], tags=(b'obj',))

@@ -404,7 +404,7 @@ def _test_replace(get_disposition, src_info):
+ ex_type + ' with ' + item_type)
exr = run_get(get_disposition, b'--replace', (item, b'obj'),
given=ex_ref)
- validate_save(b'obj/latest', getcwd() + b'/src',
+ validate_save(b'obj/latest', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out)
verify_only_refs(heads=(b'obj',), tags=[])

@@ -427,12 +427,12 @@ def _test_replace(get_disposition, src_info):
wvstart(get_disposition + ' --replace, implicit destinations')

exr = run_get(get_disposition, b'--replace', b'src')
- validate_save(b'src/latest', getcwd() + b'/src',
+ validate_save(b'src/latest', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out)
verify_only_refs(heads=(b'src',), tags=[])

exr = run_get(get_disposition, b'--replace', b'.tag/commit-2')
- validate_tagged_save(b'commit-2', getcwd() + b'/src',
+ validate_tagged_save(b'commit-2', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out)
verify_only_refs(heads=[], tags=(b'commit-2',))

@@ -502,7 +502,7 @@ def _test_ff(get_disposition, src_info):
for given in (None, (b'.tag/commit-1', b'obj'), (b'.tag/commit-2', b'obj')):
exr = run_get(get_disposition, b'--ff', (src, b'obj'), given=given)
wvpasseq(0, exr.rc)
- validate_save(b'obj/latest', getcwd() + b'/src',
+ validate_save(b'obj/latest', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out)
verify_only_refs(heads=(b'obj',), tags=[])

@@ -514,7 +514,7 @@ def _test_ff(get_disposition, src_info):
ex((b'find', b'get-dest/refs'))
ex((bup_cmd, b'-d', b'get-dest', b'ls'))

- validate_save(b'src/latest', getcwd() + b'/src',
+ validate_save(b'src/latest', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out)
#verify_only_refs(heads=('src',), tags=[])

@@ -605,7 +605,7 @@ def _test_append(get_disposition, src_info):
exr = run_get(get_disposition, b'--append', (item, b'obj'),
given=existing, rewrite=rewrite)
wvpasseq(0, exr.rc)
- validate_new_save(b'obj/latest', getcwd() + b'/src',
+ validate_new_save(b'obj/latest', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out,
rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])
@@ -621,7 +621,7 @@ def _test_append(get_disposition, src_info):
given=(b'.tag/commit-2', b'obj'),
rewrite=rewrite)
wvpasseq(0, exr.rc)
- validate_new_save(b'obj/latest', getcwd() + b'/src',
+ validate_new_save(b'obj/latest', b'/',
commit_1_id, tree_1_id, b'src-1', exr.out,
rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])
@@ -653,7 +653,7 @@ def _test_append(get_disposition, src_info):
(b'src/latest', False)):
exr = run_get(get_disposition, b'--append', item, rewrite=rewrite)
wvpasseq(0, exr.rc)
- validate_new_save(b'src/latest', getcwd() + b'/src', commit_2_id, tree_2_id,
+ validate_new_save(b'src/latest', b'/', commit_2_id, tree_2_id,
b'src-2', exr.out, rewrite=rewrite)
verify_only_refs(heads=(b'src',), tags=[])

@@ -736,7 +736,7 @@ def _test_pick_common(get_disposition, src_info, force=False):
given=given, rewrite=rewrite)
wvpasseq(0, exr.rc)
if rewrite:
- validate_tagged_save(b'obj', getcwd() + b'/src', None, None,
+ validate_tagged_save(b'obj', b'/', None, None,
b'src-2', exr.out)
else:
validate_new_tagged_commit(b'obj', commit_2_id, tree_2_id,
@@ -764,7 +764,7 @@ def _test_pick_common(get_disposition, src_info, force=False):
wvpasseq(0, exr.rc)
validate_clean_repo()
if rewrite:
- validate_tagged_save(b'obj', getcwd() + b'/src', None, None,
+ validate_tagged_save(b'obj', b'/', None, None,
b'src-2', exr.out)
else:
validate_new_tagged_commit(b'obj', commit_2_id, tree_2_id, exr.out)
@@ -780,7 +780,7 @@ def _test_pick_common(get_disposition, src_info, force=False):
wvpasseq(0, exr.rc)
ex((bup_cmd, b'-d', b'get-dest', b'ls', b'--commit-hash', b'obj'))
validate_clean_repo()
- validate_new_save(b'obj/latest', getcwd() + b'/src',
+ validate_new_save(b'obj/latest', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out,
rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])
@@ -794,7 +794,7 @@ def _test_pick_common(get_disposition, src_info, force=False):
rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_clean_repo()
- validate_new_save(b'obj/latest', getcwd() + b'/src',
+ validate_new_save(b'obj/latest', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out,
rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])
@@ -811,7 +811,7 @@ def _test_pick_common(get_disposition, src_info, force=False):
rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_clean_repo()
- validate_new_save(b'obj/latest', getcwd() + b'/src',
+ validate_new_save(b'obj/latest', b'/',
commit_1_id, tree_1_id, b'src-1', exr.out,
rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])
@@ -827,7 +827,7 @@ def _test_pick_common(get_disposition, src_info, force=False):
exr = run_get(get_disposition, flavor, b'src/latest', rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_clean_repo()
- validate_new_save(b'src/latest', getcwd() + b'/src',
+ validate_new_save(b'src/latest', b'/',
commit_2_id, tree_2_id, b'src-2', exr.out,
rewrite=rewrite)
verify_only_refs(heads=(b'src',), tags=[])
@@ -877,7 +877,7 @@ def _test_new_tag(get_disposition, src_info):
for item in (b'.tag/commit-2', b'src/latest', b'src'):
exr = run_get(get_disposition, b'--new-tag', (item, b'.tag/obj'))
wvpasseq(0, exr.rc)
- validate_tagged_save(b'obj', getcwd() + b'/src/', commit_2_id, tree_2_id,
+ validate_tagged_save(b'obj', b'/', commit_2_id, tree_2_id,
b'src-2', exr.out)
verify_only_refs(heads=[], tags=(b'obj',))

@@ -922,7 +922,7 @@ def _test_new_tag(get_disposition, src_info):
wvstart(get_disposition + ' --new-tag, implicit destinations')
exr = run_get(get_disposition, b'--new-tag', b'.tag/commit-2')
wvpasseq(0, exr.rc)
- validate_tagged_save(b'commit-2', getcwd() + b'/src/', commit_2_id, tree_2_id,
+ validate_tagged_save(b'commit-2', b'/', commit_2_id, tree_2_id,
b'src-2', exr.out)
verify_only_refs(heads=[], tags=(b'commit-2',))

@@ -994,14 +994,14 @@ def create_get_src():
mkdir(b'src')
open(b'src/unrelated', 'a').close()
ex((bup_cmd, b'-d', b'get-src', b'index', b'src'))
- ex((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'unrelated-branch', b'src'))
+ ex((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'unrelated-branch', b'--strip', b'src'))

ex((bup_cmd, b'-d', b'get-src', b'index', b'--clear'))
rmrf(b'src')
mkdir(b'src')
open(b'src/zero', 'a').close()
ex((bup_cmd, b'-d', b'get-src', b'index', b'src'))
- exr = exo((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'src', b'src'))
+ exr = exo((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'src', b'--strip', b'src'))
out = exr.out.splitlines()
tree_0_id = out[0]
commit_0_id = out[-1]
@@ -1012,12 +1012,13 @@ def create_get_src():

rmrf(b'src')
mkdir(b'src')
+ mkdir(b'src/empty-dir')
mkdir(b'src/x')
mkdir(b'src/x/y')
ex((bup_cmd + b' -d get-src random 1k > src/1'), shell=True)
ex((bup_cmd + b' -d get-src random 1m > src/x/2'), shell=True)
ex((bup_cmd, b'-d', b'get-src', b'index', b'src'))
- exr = exo((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'src', b'src'))
+ exr = exo((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'src', b'--strip', b'src'))
out = exr.out.splitlines()
tree_1_id = out[0]
commit_1_id = out[-1]
@@ -1033,7 +1034,7 @@ def create_get_src():
with open(b'src/tiny-file', 'ab') as f: f.write(b'xyzzy')
ex((bup_cmd, b'-d', b'get-src', b'index', b'src'))
ex((bup_cmd, b'-d', b'get-src', b'tick')) # Ensure the save names differ
- exr = exo((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'src', b'src'))
+ exr = exo((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'src', b'--strip', b'src'))
out = exr.out.splitlines()
tree_2_id = out[0]
commit_2_id = out[-1]
@@ -1041,13 +1042,11 @@ def create_get_src():
save_2 = exr.out.splitlines()[2]
rename(b'src', b'src-2')

- src_root = getcwd() + b'/src'
-
subtree_path = b'src-2/x'
- subtree_vfs_path = src_root + b'/x'
+ subtree_vfs_path = b'/x'

# No support for "ls -d", so grep...
- exr = exo((bup_cmd, b'-d', b'get-src', b'ls', b'-s', b'src/latest' + src_root))
+ exr = exo((bup_cmd, b'-d', b'get-src', b'ls', b'-s', b'src/latest'))
out = exr.out.splitlines()
subtree_id = None
for line in out:
@@ -1056,7 +1055,7 @@ def create_get_src():
assert(subtree_id)

# With a tiny file, we'll get a single blob, not a chunked tree
- tinyfile_path = src_root + b'/tiny-file'
+ tinyfile_path = b'/tiny-file'
exr = exo((bup_cmd, b'-d', b'get-src', b'ls', b'-s', b'src/latest' + tinyfile_path))
tinyfile_id = exr.out.splitlines()[0].split()[0]

--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/tree.py | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index ac4dcca6..a0c0c728 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -31,13 +31,19 @@ class TreeItem:
self.oid = oid
self.meta = meta or _empty_metadata
def __repr__(self):
- return f'<bup.tree.TreeItem object at 0x{id(self):x} name={self.name!r}>'
+ cls = self.__class__
+ return f'<{cls.__module__}.{cls.__name__} object at {hex(id(self))}' \
+ f' name={self.name!r} oid={self.oid.hex()}>'
def mangled_name(self):
return mangle_name(self.name, self.mode, self.gitmode)

class RawTreeItem(TreeItem):
def mangled_name(self):
return self.name
+ def __repr__(self):
+ cls = self.__class__
+ return f'<{cls.__module__}.{cls.__name__} object at {hex(id(self))}' \
+ f' name={self.name!r}>'

class SplitTreeItem(RawTreeItem):
__slots__ = 'first_full_name', 'last_full_name'
@@ -45,6 +51,11 @@ class SplitTreeItem(RawTreeItem):
super().__init__(name, GIT_MODE_TREE, GIT_MODE_TREE, treeid, None)
self.first_full_name = first
self.last_full_name = last
+ def __repr__(self):
+ cls = self.__class__
+ return f'<{cls.__module__}.{cls.__name__} object at {hex(id(self))}' \
+ f' first_full_name={self.first_full_name!r}' \
+ f' last_full_name={self.last_full_name!r}>'

def _abbreviate_tree_names(names):
"""Return a list of unique abbreviations for the given names."""
@@ -100,6 +111,11 @@ class StackDir:
self.name = name
self.meta = meta
self.items = []
+ def __repr__(self):
+ cls = self.__class__
+ return f'<{cls.__module__}.{cls.__name__} object at {hex(id(self))}' \
+ f' name={self.name!r}' \
+ f' items={[(x.name, x.oid.hex()) for x in self.items]!r}>'


class Stack:
@@ -108,6 +124,11 @@ class Stack:
self._repo = repo
self._split_config = split_config

+ def __repr__(self):
+ cls = self.__class__
+ return f'<{cls.__module__}.{cls.__name__} object at {hex(id(self))}' \
+ f' path={self.path()!r}>'
+
def __len__(self):
return len(self._stack)

--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index e78e0df8..4b7ede10 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -5,7 +5,7 @@ from dataclasses import replace as dcreplace
from contextlib import nullcontext
from stat import S_ISDIR
from typing import Optional, Union
-import os, sys, textwrap, time
+import os, re, sys, textwrap, time

from bup import client, compat, git, hashsplit, vfs
from bup.commit import commit_message
@@ -99,7 +99,7 @@ def usage(argspec, width=None):
msg = []
msg.append(textwrap.fill(usage, width=width, subsequent_indent=' '))
msg.append('\n\n')
- msg.append(textwrap.fill(preamble.replace('\n', ' '), width=width))
+ msg.append(textwrap.fill(re.sub(r'\n\s+', r' ', preamble), width=(width - 1)))
msg.append('\n')
for group_name, group_args in groups:
msg.extend(['\n', group_name, '\n'])
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
The hashsplitter config is now expected to include anything that could
cause a save to "meaningfully" differ from another created with a
different config (e.g. is tree splitting enabled, what's the split
"granularity", ...).

This will support a forthcoming "get" option to --rewrite saves when
transferring them; we're going to rely on comparing configs to
categorize each request. e.g. if the configs differ, then the transfer
should probably be a --rewrite.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/save.py | 8 +++-----
lib/bup/hashsplit.py | 24 +++++++++++++++---------
lib/bup/helpers.py | 8 ++++++++
lib/bup/tree.py | 5 ++---
4 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/lib/bup/cmd/save.py b/lib/bup/cmd/save.py
index cb77be05..0d2c4ee6 100644
--- a/lib/bup/cmd/save.py
+++ b/lib/bup/cmd/save.py
@@ -122,7 +122,7 @@ def opts_from_cmdline(o, argv):

return opt

-def save_tree(opt, reader, hlink_db, msr, repo, split_trees, split_cfg):
+def save_tree(opt, reader, hlink_db, msr, repo, split_cfg):
# Metadata is stored in a file named .bupm in each directory. The
# first metadata entry will be the metadata for the current directory.
# The remaining entries will be for each of the other directory
@@ -139,7 +139,7 @@ def save_tree(opt, reader, hlink_db, msr, repo, split_trees, split_cfg):

# Maintain a stack of information representing the current location in

- stack = Stack(repo, split_cfg, split_trees=split_trees)
+ stack = Stack(repo, split_cfg)

prog_count = 0
prog_subcount = 0
@@ -451,7 +451,6 @@ def main(argv):
split_cfg = hashsplit.configuration(repo.config_get)
except ConfigError as ex:
opt_parser.fatal(ex)
- split_trees = repo.config_get(b'bup.split.trees', opttype='bool')
sys.stdout.flush()
out = byte_stream(sys.stdout)

@@ -474,8 +473,7 @@ def main(argv):
with msr, \
hlinkdb.HLinkDB(fsindex.hlink) as hlink_db, \
index.Reader(fsindex.stat) as reader:
- tree = save_tree(opt, reader, hlink_db, msr, repo, split_trees,
- split_cfg)
+ tree = save_tree(opt, reader, hlink_db, msr, repo, split_cfg)
if opt.tree:
out.write(hexlify(tree))
out.write(b'\n')
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index da79b632..8f71c8be 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -3,6 +3,7 @@ import math, os, re

from bup import _helpers
from bup.config import ConfigError
+from bup.helpers import dict_subset


BUP_BLOBBITS = 13
@@ -21,6 +22,8 @@ def _fanbits():

fanbits = _fanbits

+_splitter_args = ('progress', 'keep_boundaries', 'blobbits', 'fanbits')
+
def splitter(files, *, progress=None, keep_boundaries=False, blobbits=None,
fanbits=None):
return HashSplitter(files,
@@ -33,23 +36,26 @@ def splitter(files, *, progress=None, keep_boundaries=False, blobbits=None,
_method_rx = br'legacy:(13|14|15|16|17|18|19|20|21)'

def configuration(config_get):
- """Return a hashsplitter configuration map based on information
- provided by config_get."""
+ """Return a splitting configuration map based on information
+ provided by config_get. The valid entries include the splitter()
+ keyword arguments, and the configuration must include every option
+ that affects the way data will be split. (See also, rewriting via
+ get --rewrite.)
+
+ """
+ cfg = {'trees': config_get(b'bup.split.trees', opttype='bool')}
method = config_get(b'bup.split.files')
if method is None:
- return {}
+ return cfg
m = re.fullmatch(_method_rx, method)
if not m:
raise ConfigError(f'invalid bup.split.files setting {method}')
- blobbits = int(m.group(1))
- return {'blobbits': blobbits}
+ cfg['blobbits'] = int(m.group(1))
+ return cfg

def from_config(files, split_config):
"""Return a hashsplitter for the given split_config."""
- # Currently, the split_config is just a map of the options
- # expected by splitter, so this is a trivial adapter, and
- # any error handling is up to splitter().
- return splitter(files, **split_config)
+ return splitter(files, **dict_subset(split_config, _splitter_args))


total_split = 0
diff --git a/lib/bup/helpers.py b/lib/bup/helpers.py
index 683ed8cf..c81a1dfc 100644
--- a/lib/bup/helpers.py
+++ b/lib/bup/helpers.py
@@ -35,6 +35,14 @@ EXIT_FALSE = 1
EXIT_FAILURE = 2


+def dict_subset(dict, keys):
+ result = {}
+ for k in keys:
+ if k in dict:
+ result[k] = dict[k]
+ return result
+
+
nullctx = nullcontext() # only need one

def nullcontext_if_not(manager):
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index b6a3de51..ba450e2e 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -94,11 +94,10 @@ class StackDir:
self.items = []

class Stack:
- def __init__(self, repo, split_config, *, split_trees=False):
+ def __init__(self, repo, split_config):
self._stack = []
self._repo = repo
self._split_config = split_config
- self._split_trees = split_trees

def __len__(self):
return len(self._stack)
@@ -221,7 +220,7 @@ class Stack:

def _write(self, tree):
items = self._clean(tree)
- if not self._split_trees:
+ if not self._split_config['trees']:
return self._write_tree(tree.meta, items)
items.sort(key=lambda x: x.name)
return self._write_split_tree(tree.meta, items)
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
For a while, Johannes Berg and I have been working on adding support
for rewriting and repairing saves. Both are handled by "bup get".

Rewriting allows you to create a new save with a configuration that
differs from the source save. At the moment, the relevant settings
are bup.split.trees, and bup.split.files, allowing you to split or
unsplit trees and to change the deduplication granularity.

Repairs attempt to fix problems with a save, for example, bup get
--repair can replace missing objects with placeholder files so that
other commands (e.g. restore) will be able to work with the save
again.

Both of these operations are fairly expensive, since they involve (at
a minimum) traversing the entire save, though if you're working with
multiple related saves (e.g. repairing part or all of a branch in a
single invocation), later saves may be less expensive than earlier
ones.

I'd also recommend treating both rewrites and repairs with caution for
now, and double-checking their results, given that they're new,
complex, and (of course) making changes.

bup validate-refs has been added (superseding validate-ref-links) and
is capable of identifying some potential, historical issues with .bupm
files (metadata) which bup get --repair can now fix or mitigate.

These are the last significant changes we'd wanted to make before
finally releasing 0.34. See the commit messages and changes to
Documentation/ and note/ for additional higher-level information. You
may also notice some changes in direction during the series, so you
might want to look at the overall diff to the docs and notes for a
condensed summary, i.e.

git diff origin/main origin/tmp/rewrite-repair ...

Please test this if you have time. It's also available as a temporary
branch here (which will be rebased as any further changes are made):

https://github.com/bup/bup/tree/tmp/review/rewrite-repair
https://codeberg.org/bup/bup/src/branch/tmp/review/rewrite-repair

Thanks
--
Rob Browning
rlb @defaultvalue.org and @debian.org
GPG as of 2011-07-10 E6A9 DA3C C9FD 1FF8 C676 D2C4 C0F0 39E9 ED1B 597A
GPG as of 2002-11-03 14DD 432F AE39 534D B592 F9A0 25C8 D377 8C7E 73A4

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/rewrite.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 581ae5fa..fc96fcd7 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -4,13 +4,15 @@ from contextlib import ExitStack, closing, nullcontext
from itertools import chain
from os.path import join as pj
from stat import S_ISDIR, S_ISLNK, S_ISREG
-import os, sqlite3
+import os, sqlite3, time

from bup import hashsplit, metadata, vfs
from bup.git import get_cat_data, parse_commit
from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK, GIT_MODE_TREE
-from bup.helpers import path_components, should_rx_exclude_path, temp_dir
+from bup.helpers import \
+ hostname, path_components, should_rx_exclude_path, temp_dir
from bup.io import qsql_id
+from bup.pwdgrp import userfullname, username
from bup.tree import Stack


@@ -274,14 +276,12 @@ class Rewriter:
save_oidx = hexlify(save_path[2][1].coid)
ci = parse_commit(get_cat_data(srcrepo.cat(save_oidx), b'commit'))
author = ci.author_name + b' <' + ci.author_mail + b'>'
- committer = ci.committer_name + b' <' + ci.committer_mail + b'>'
+ committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
return (dstrepo.write_commit(tree, parent,
author,
ci.author_sec,
ci.author_offset,
- committer,
- ci.committer_sec,
- ci.committer_offset,
+ committer, time.time(), None,
ci.message),
tree)
finally:
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com, Johannes Berg
From: Johannes Berg <joha...@sipsolutions.net>

If we don't know the size, don't pretend we do know it's 0. Leave it
None and require clients to decide how to handle it. This matters for
'bup rewrite' which would otherwise always fill in "0" for directories
that should have "None" for their size (i.e. not stored), changing
their hashes.

Add a new "public" argument to augment_item_meta() and
ensure_item_has_metadata() that when true, produces metadata intended
for public consumption (i.e. via ftp/ls/web). For now, that just means
that directory sizes will be 0 instead of None.

Signed-off-by: Johannes Berg <joha...@sipsolutions.net>
Reviewed-by: Rob Browning <r...@defaultvalue.org>
[r...@defaultvalue.org: add/use public argument]
[r...@defaultvalue.org: adjust commit message]
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/fuse.py | 5 +++--
lib/bup/cmd/web.py | 6 ++++--
lib/bup/ls.py | 9 ++++++---
lib/bup/vfs.py | 32 ++++++++++++++++++++------------
test/lib/buptest/vfs.py | 2 --
5 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/lib/bup/cmd/fuse.py b/lib/bup/cmd/fuse.py
index 6c5718dd..3cb28c73 100644
--- a/lib/bup/cmd/fuse.py
+++ b/lib/bup/cmd/fuse.py
@@ -59,10 +59,11 @@ class BupFs(fuse.Fuse):
if not item:
return -errno.ENOENT
if self.fake_metadata:
- item = vfs.augment_item_meta(self.repo, item, include_size=True)
+ item = vfs.augment_item_meta(self.repo, item, include_size=True,
+ public=True)
else:
item = vfs.ensure_item_has_metadata(self.repo, item,
- include_size=True)
+ include_size=True, public=True)
meta = item.meta
# FIXME: do we want/need to do anything more with nlink?
st = fuse.Stat(st_mode=meta.mode, st_nlink=1, st_size=meta.size)
diff --git a/lib/bup/cmd/web.py b/lib/bup/cmd/web.py
index 87e7343c..7807a558 100644
--- a/lib/bup/cmd/web.py
+++ b/lib/bup/cmd/web.py
@@ -176,7 +176,8 @@ def _dir_contents(repo, resolution, params, param_info):
yield item_info(b'..', parent_item, parent_item, b'..')
continue
mp = params.get('meta')
- res_item = vfs.ensure_item_has_metadata(repo, item, include_size=mp)
+ res_item = vfs.ensure_item_has_metadata(repo, item, include_size=mp,
+ public=True)
yield item_info(name, item, res_item, include_size=mp)


@@ -272,7 +273,8 @@ class BupRequestHandler(tornado.web.RequestHandler):
"""
try:
file_item = resolved[-1][1]
- file_item = vfs.augment_item_meta(repo, file_item, include_size=True)
+ file_item = vfs.augment_item_meta(repo, file_item,
+ include_size=True, public=True)

# Defer the set_header() calls until after we start
# writing so we can still generate a 500 failure if
diff --git a/lib/bup/ls.py b/lib/bup/ls.py
index 9076bf5a..0fe2412c 100644
--- a/lib/bup/ls.py
+++ b/lib/bup/ls.py
@@ -165,10 +165,12 @@ def within_repo(repo, opt, out, pwd=b''):
continue
if opt.l:
sub_item = vfs.ensure_item_has_metadata(repo, sub_item,
- include_size=True)
+ include_size=True,
+ public=True)
elif want_meta:
sub_item = vfs.augment_item_meta(repo, sub_item,
- include_size=True)
+ include_size=True,
+ public=True)
line = item_line(sub_item, sub_name)
if not opt.long_listing and istty1:
pending.append(line)
@@ -178,7 +180,8 @@ def within_repo(repo, opt, out, pwd=b''):
else:
if opt.long_listing:
leaf_item = vfs.augment_item_meta(repo, leaf_item,
- include_size=True)
+ include_size=True,
+ public=True)
line = item_line(leaf_item, os.path.normpath(path))
if not opt.long_listing and istty1:
pending.append(line)
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index b6a3b587..c44ad9ad 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -357,8 +357,6 @@ def _read_dir_meta(bupm):
if not m:
return default_dir_mode
assert m.mode is not None
- if m.size is None:
- m.size = 0
return m

def _treeish_tree_data(repo, oid):
@@ -425,6 +423,8 @@ def _compute_item_size(repo, item):
if isinstance(item, FakeLink):
return len(item.target)
return len(_readlink(repo, item.oid))
+ if S_ISDIR(mode):
+ return None
return 0

def item_size(repo, item):
@@ -1095,20 +1095,27 @@ def try_resolve(repo, path, parent=None, want_meta=True):
return follow
return res

-def augment_item_meta(repo, item, include_size=False):
- """Ensure item has a Metadata instance for item.meta. If item.meta is
- currently a mode, replace it with a compatible "fake" Metadata
- instance. If include_size is true, ensure item.meta.size is
- correct, computing it if needed. If item.meta is a Metadata
- instance, this call may modify it in place or replace it.
+def augment_item_meta(repo, item, *, include_size=False, public=False):
+ """Ensure item has a Metadata instance for item.meta. If
+ item.meta is currently a mode, replace it with a compatible "fake"
+ Metadata instance. If include_size is true, ensure item.meta.size
+ is correct, computing it if needed. If public is true, produce
+ metadata suitable for "public consumption", e.g. via
+ ls/fuse/web. This, for example, sets dir sizes to 0. If item.meta
+ is a Metadata instance, this call may modify it in place or
+ replace it.

"""
+ def maybe_public(mode, size):
+ if public and S_ISDIR(mode) and size is None:
+ return 0
+ return size
# If we actually had parallelism, we'd need locking...
assert repo
m = item.meta
if isinstance(m, Metadata):
if include_size and m.size is None:
- m.size = _compute_item_size(repo, item)
+ m.size = maybe_public(m.mode, _compute_item_size(repo, item))
return item._replace(meta=m)
return item
# m is mode
@@ -1122,7 +1129,7 @@ def augment_item_meta(repo, item, include_size=False):
meta.symlink_target = target
meta.size = len(target)
elif include_size:
- meta.size = _compute_item_size(repo, item)
+ meta.size = maybe_public(m, _compute_item_size(repo, item))
return item._replace(meta=meta)

def fill_in_metadata_if_dir(repo, item):
@@ -1139,7 +1146,7 @@ def fill_in_metadata_if_dir(repo, item):
item = items[0][1]
return item

-def ensure_item_has_metadata(repo, item, include_size=False):
+def ensure_item_has_metadata(repo, item, *, include_size=False, public=False):
"""If item is a directory, attempt to find and add its metadata. If
the item still doesn't have a Metadata instance for item.meta,
give it one via augment_item_meta(). May be useful for the output
@@ -1148,7 +1155,8 @@ def ensure_item_has_metadata(repo, item, include_size=False):
"""
return augment_item_meta(repo,
fill_in_metadata_if_dir(repo, item),
- include_size=include_size)
+ include_size=include_size,
+ public=public)

def join(repo, ref):
"""Generate a list of the content of all blobs that can be reached
diff --git a/test/lib/buptest/vfs.py b/test/lib/buptest/vfs.py
index 6495fa49..dbd3c609 100644
--- a/test/lib/buptest/vfs.py
+++ b/test/lib/buptest/vfs.py
@@ -21,8 +21,6 @@ def tree_items(repo, oid):
try:
maybe_meta = lambda : Metadata.read(bupm) if bupm else None
m = maybe_meta()
- if m and m.size is None:
- m.size = 0
yield TreeDictValue(name=b'.', oid=oid, meta=m)
tree_ents = vfs.ordered_tree_entries(tree_entries(tree_data), bupm=True)
for name, mangled_name, kind, gitmode, sub_oid in tree_ents:
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Thanks to Johannes Berg for the suggestion.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 48 ++++++---------
lib/bup/rewrite.py | 144 ++++++++++++++++++++++++++-------------------
2 files changed, 102 insertions(+), 90 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 265bd59c..885f5475 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -1,11 +1,11 @@

from binascii import hexlify, unhexlify
from collections import namedtuple
-from contextlib import ExitStack, closing
+from contextlib import nullcontext
from stat import S_ISDIR
-import os, sys, textwrap, sqlite3, time
+import os, sys, textwrap, time

-from bup import client, compat, git, hashsplit, rewrite, vfs
+from bup import client, compat, git, hashsplit, vfs
from bup.commit import commit_message
from bup.compat import argv_bytes
from bup.config import derive_repo_addr
@@ -19,11 +19,11 @@ from bup.helpers import \
note_error,
parse_num,
parse_rx_excludes,
- temp_dir,
tty_width)
from bup.io import path_msg
from bup.pwdgrp import userfullname, username
from bup.repo import LocalRepo, make_repo
+from bup.rewrite import Rewriter


argspec = (
@@ -138,6 +138,7 @@ def parse_args(args):
opt.ignore_missing = False
opt.rewrite = None # None means "didn't specify"
opt.rewrite_db = None
+ opt.rewriter = None # internal, synthetic "option"...
opt.source = opt.remote = None
opt.target_specs = []

@@ -284,11 +285,8 @@ def append_commit(src_loc, parent, src_repo, dest_repo, opt):
root, ref, save = path
assert isinstance(save[1], (vfs.Commit, vfs.FakeLink)), path
assert isinstance(ref[1], vfs.RevList), path
- return rewrite.append_save(path, parent, src_repo, dest_repo,
- opt.dest_split_cfg, opt.exclude_rxs,
- # FIXME: ...
- opt.rewrite_db_conn,
- opt.rewrite_db_mapping)
+ return opt.rewriter.append_save(path, parent, src_repo, dest_repo,
+ opt.exclude_rxs)


def append_commits(src_loc, dest_hash, src_repo, dest_repo, opt):
@@ -324,12 +322,9 @@ def append_commits(src_loc, dest_hash, src_repo, dest_repo, opt):
last_c, tree = dest_hash, None
for commit in commits:
coid = unhexlify(commit)
- last_c, tree = rewrite.append_save(path + (entry_for_coid[coid],),
- last_c, src_repo, dest_repo,
- opt.dest_split_cfg, opt.exclude_rxs,
- # FIXME: ...
- opt.rewrite_db_conn,
- opt.rewrite_db_mapping)
+ last_c, tree = opt.rewriter.append_save(path + (entry_for_coid[coid],),
+ last_c, src_repo, dest_repo,
+ opt.exclude_rxs)
assert tree is not None
return last_c, tree

@@ -753,25 +748,16 @@ def main(argv):
compression_level=opt.compress) as dest_repo:

src_split_cfg = hashsplit.configuration(src_repo.config_get)
- opt.dest_split_cfg = hashsplit.configuration(dest_repo.config_get)
+ dest_split_cfg = hashsplit.configuration(dest_repo.config_get)

- if src_split_cfg != opt.dest_split_cfg and opt.rewrite is None:
+ if src_split_cfg != dest_split_cfg and opt.rewrite is None:
misuse('repository configs differ; specify --rewrite or --no-rewrite')

- ctx = ExitStack()
- if opt.rewrite:
- if not opt.rewrite_db:
- rwdb_tmpdir = ctx.enter_context(temp_dir(prefix='bup-rewrite-'))
- opt.rewrite_db = f'{rwdb_tmpdir}/db'
- rwdb_conn = sqlite3.connect(opt.rewrite_db)
- rwdb_conn.text_factory = bytes
- ctx.enter_context(closing(rwdb_conn))
- opt.rewrite_db_conn = rwdb_conn # FIXME: ...
- with closing(rwdb_conn.cursor()) as rwdb_cur:
- opt.rewrite_db_mapping = \
- rewrite.prep_mapping_table(rwdb_cur, opt.dest_split_cfg)
-
- with ctx:
+ opt.rewriter = \
+ Rewriter(split_cfg=dest_split_cfg, db=opt.rewrite_db) \
+ if opt.rewrite else None
+
+ with opt.rewriter or nullcontext():

# Resolve and validate all sources and destinations,
# implicit or explicit, and do it up-front, so we can
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index e65fa506..581ae5fa 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -1,15 +1,15 @@

from binascii import hexlify
-from contextlib import closing
+from contextlib import ExitStack, closing, nullcontext
from itertools import chain
from os.path import join as pj
from stat import S_ISDIR, S_ISLNK, S_ISREG
-import os
+import os, sqlite3

from bup import hashsplit, metadata, vfs
from bup.git import get_cat_data, parse_commit
from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK, GIT_MODE_TREE
-from bup.helpers import path_components, should_rx_exclude_path
+from bup.helpers import path_components, should_rx_exclude_path, temp_dir
from bup.io import qsql_id
from bup.tree import Stack

@@ -21,7 +21,7 @@ def _fs_path_from_vfs(path):
return fs + b'/'


-def prep_mapping_table(db, split_cfg):
+def _prep_mapping_table(db, split_cfg):
# This currently only needs to track items that may be split,
# depending on the current repo settings (e.g. files and
# directories); it records the result so we can re-use it if we
@@ -38,7 +38,7 @@ def prep_mapping_table(db, split_cfg):
' without rowid')
return table_id

-def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
+def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
"""Return (replacement_item, converted_oid, git_mode) for the
given item if any, *and* if the dstrepo has the item.oid. If not,
converted_oid and mode will be None. The replacement_item will
@@ -77,7 +77,7 @@ def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
return item, dst, None
return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

-def vfs_walk_recursively(srcrepo, dstrepo, path, excludes, db, mapping):
+def _vfs_walk_recursively(srcrepo, dstrepo, path, excludes, db, mapping):
item = path[-1][1]
assert len(path) >= 3
# drop branch/DATE
@@ -94,15 +94,15 @@ def vfs_walk_recursively(srcrepo, dstrepo, path, excludes, db, mapping):
continue
if S_ISDIR(vfs.item_mode(sub_item)):
conv_item, oid, _ = \
- previous_conversion(dstrepo, sub_item, True, db, mapping)
+ _previous_conversion(dstrepo, sub_item, True, db, mapping)
if conv_item is not sub_item:
sub_path = sub_path[:-1] + ((sub_path[-1][0], conv_item),)
if oid is None:
- yield from vfs_walk_recursively(srcrepo, dstrepo, sub_path,
- excludes, db, mapping)
+ yield from _vfs_walk_recursively(srcrepo, dstrepo, sub_path,
+ excludes, db, mapping)
yield sub_path

-def rewrite_link(item, item_mode, name, srcrepo, dstrepo, stack):
+def _rewrite_link(item, item_mode, name, srcrepo, dstrepo, stack):
assert isinstance(name, bytes)
target = vfs.readlink(srcrepo, item)
git_mode, oid = GIT_MODE_SYMLINK, dstrepo.write_symlink(target)
@@ -115,8 +115,8 @@ def rewrite_link(item, item_mode, name, srcrepo, dstrepo, stack):
assert item.meta.size == len(item.meta.symlink_target)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

-def rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack, wdbc,
- mapping):
+def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
+ wdbc, mapping):
# save_path is the vfs path to the save ref, e.g. to branch/DATE
fs_path = _fs_path_from_vfs(path[3:]) # not including /branch/DATE
assert not fs_path.startswith(b'/') # because resolve(parent=...)
@@ -148,7 +148,7 @@ def rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack, wdbc,
# First, things that can't be affected by the rewrite
item_mode = vfs.item_mode(item)
if S_ISLNK(item_mode):
- rewrite_link(item, item_mode, filen, srcrepo, dstrepo, stack)
+ _rewrite_link(item, item_mode, filen, srcrepo, dstrepo, stack)
return
if not S_ISREG(item_mode) and not S_ISDIR(item_mode):
# Everything here (pipes, devices, etc.) should be fully
@@ -160,7 +160,7 @@ def rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack, wdbc,
return

item, oid, git_mode = \
- previous_conversion(dstrepo, item, not filen, wdbc, mapping)
+ _previous_conversion(dstrepo, item, not filen, wdbc, mapping)

if not filen:
# Since there's no filename, this is a subdir -- finish it.
@@ -212,51 +212,77 @@ def rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack, wdbc,
(item.oid, oid, chunked, item_size))
stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)

+class Rewriter:
+ def __init__(self, *, split_cfg, db=None):
+ assert isinstance(db, (bytes, type(None)))
+ self._context = nullcontext()
+ with ExitStack() as ctx:
+ self._split_cfg = split_cfg
+ self._db_path = db
+ if db:
+ self._db_tmpdir = None
+ else:
+ self._db_tmpdir = \
+ ctx.enter_context(temp_dir(prefix='bup-rewrite-'))
+ self._db_path = f'{self._db_tmpdir}/db'
+ self._db_conn = sqlite3.connect(self._db_path)
+ ctx.enter_context(closing(self._db_conn))
+ self._db_conn.text_factory = bytes
+ with closing(self._db_conn.cursor()) as cur:
+ self._mapping = _prep_mapping_table(cur, split_cfg)
+ self._context = ctx.pop_all()

-def append_save(save_path, parent, srcrepo, dstrepo, split_cfg,
- excludes, workdb, mapping):
- # Strict for now
- assert isinstance(parent, (bytes, type(None))), parent
- if parent:
- assert len(parent) == 20, parent
- assert len(save_path) == 3, (len(save_path), save_path)
- assert isinstance(save_path[1][1], vfs.RevList)
- leaf_name, leaf_item = save_path[2]
- if isinstance(leaf_item, vfs.FakeLink):
- # For now, vfs.contents() does not resolve the one FakeLink
- assert leaf_name == b'latest', save_path
- res = srcrepo.resolve(leaf_item.target, parent=save_path[:-1],
- follow=False, want_meta=False)
- leaf_name, leaf_item = res[-1]
- save_path = res
- assert isinstance(leaf_item, vfs.Commit), leaf_item
- # Currently, the workdb must always be ready to commit (see finally below)
- with closing(workdb.cursor()) as dbc:
- try:
- # Maintain a stack of information representing the current
- # location in the archive being constructed.
- stack = Stack(dstrepo, split_cfg)
- for path in vfs_walk_recursively(srcrepo, dstrepo, save_path,
- excludes, dbc, mapping):
- rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg,
- stack, dbc, mapping)
+ def __enter__(self):
+ return self
+ def __exit__(self, exc_type, exc_value, traceback):
+ with self._context:
+ pass

- while len(stack) > 1: # pop all parts above root folder
- stack.pop()
- tree = stack.pop() # and the root to get the tree
+ def append_save(self, save_path, parent, srcrepo, dstrepo, excludes):
+ # Strict for now
+ assert isinstance(parent, (bytes, type(None))), parent
+ if parent:
+ assert len(parent) == 20, parent
+ assert len(save_path) == 3, (len(save_path), save_path)
+ assert isinstance(save_path[1][1], vfs.RevList)
+ leaf_name, leaf_item = save_path[2]
+ if isinstance(leaf_item, vfs.FakeLink):
+ # For now, vfs.contents() does not resolve the one FakeLink
+ assert leaf_name == b'latest', save_path
+ res = srcrepo.resolve(leaf_item.target, parent=save_path[:-1],
+ follow=False, want_meta=False)
+ leaf_name, leaf_item = res[-1]
+ save_path = res
+ assert isinstance(leaf_item, vfs.Commit), leaf_item
+ # Currently, the workdb must always be ready to commit (see finally below)
+ with closing(self._db_conn.cursor()) as dbc:
+ try:
+ # Maintain a stack of information representing the current
+ # location in the archive being constructed.
+ stack = Stack(dstrepo, self._split_cfg)

- save_oidx = hexlify(save_path[2][1].coid)
- ci = parse_commit(get_cat_data(srcrepo.cat(save_oidx), b'commit'))
- author = ci.author_name + b' <' + ci.author_mail + b'>'
- committer = ci.committer_name + b' <' + ci.committer_mail + b'>'
- return (dstrepo.write_commit(tree, parent,
- author,
- ci.author_sec,
- ci.author_offset,
- committer,
- ci.committer_sec,
- ci.committer_offset,
- ci.message),
- tree)
- finally:
- workdb.commit() # the workdb is always ready for commit
+ for path in _vfs_walk_recursively(srcrepo, dstrepo, save_path,
+ excludes, dbc, self._mapping):
+ _rewrite_save_item(save_path, path, srcrepo, dstrepo,
+ self._split_cfg, stack, dbc,
+ self._mapping)
+
+ while len(stack) > 1: # pop all parts above root folder
+ stack.pop()
+ tree = stack.pop() # and the root to get the tree
+
+ save_oidx = hexlify(save_path[2][1].coid)
+ ci = parse_commit(get_cat_data(srcrepo.cat(save_oidx), b'commit'))
+ author = ci.author_name + b' <' + ci.author_mail + b'>'
+ committer = ci.committer_name + b' <' + ci.committer_mail + b'>'
+ return (dstrepo.write_commit(tree, parent,
+ author,
+ ci.author_sec,
+ ci.author_offset,
+ committer,
+ ci.committer_sec,
+ ci.committer_offset,
+ ci.message),
+ tree)
+ finally:
+ self._db_conn.commit() # the workdb is always ready for commit
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Add the excludes to the Spec since they are part of the specification,
and also because they're eventually going to become target-specific.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 29 ++++++++++++++++-------------
1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 4b7ede10..2c286ca2 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -3,6 +3,7 @@ from binascii import hexlify, unhexlify
from collections import namedtuple
from dataclasses import replace as dcreplace
from contextlib import nullcontext
+from re import Pattern
from stat import S_ISDIR
from typing import Optional, Union
import os, re, sys, textwrap, time
@@ -134,6 +135,7 @@ class Spec:
src: bytes
dest: bytes
missing: Optional[MissingConfig] = None
+ excludes: Optional[list[Pattern]] = None

def spec_msg(s):
if not s.dest:
@@ -226,10 +228,11 @@ def parse_args(args):
continue
else:
misuse()
- opt.exclude_rxs = parse_rx_excludes(exclude_opts, misuse)
- if opt.exclude_rxs and not opt.rewrite:
+ excludes = parse_rx_excludes(exclude_opts, misuse)
+ if excludes and not opt.rewrite:
misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
- opt.target_specs = [dcreplace(x, missing=missing) for x in opt.target_specs]
+ opt.target_specs = [dcreplace(x, missing=missing, excludes=excludes)
+ for x in opt.target_specs]
return opt

# FIXME: client error handling (remote exceptions, etc.)
@@ -290,9 +293,9 @@ def transfer_commit(name, hash, parent, src_repo, dest_repo, missing):
return c, tree


-def append_commit(src_loc, parent, src_repo, dest_repo, missing, opt):
+def append_commit(src_loc, parent, src_repo, dest_repo, missing, excludes, opt):
if not opt.rewrite:
- assert isinstance(src_loc, (bytes, Loc))
+ assert isinstance(src_loc, (bytes, Loc)), src_loc
oidx = src_loc if isinstance(src_loc, bytes) else hexlify(src_loc.hash)
return transfer_commit(None, # unused
oidx, parent, src_repo, dest_repo, missing)
@@ -304,18 +307,18 @@ def append_commit(src_loc, parent, src_repo, dest_repo, missing, opt):
root, ref, save = path
assert isinstance(save[1], (vfs.Commit, vfs.FakeLink)), path
assert isinstance(ref[1], vfs.RevList), path
- return opt.rewriter.append_save(path, parent, src_repo, dest_repo,
- opt.exclude_rxs)
+ return opt.rewriter.append_save(path, parent, src_repo, dest_repo, excludes)


-def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, opt):
+def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, excludes,
+ opt):
if not opt.rewrite:
commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
commits.reverse()
last_c, tree = dest_hash, None
for commit in commits:
last_c, tree = append_commit(commit, last_c, src_repo, dest_repo,
- missing, opt)
+ missing, excludes, opt)
assert tree is not None
return last_c, tree

@@ -343,7 +346,7 @@ def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, opt):
coid = unhexlify(commit)
last_c, tree = opt.rewriter.append_save(path + (entry_for_coid[coid],),
last_c, src_repo, dest_repo,
- opt.exclude_rxs)
+ excludes)
assert tree is not None
return last_c, tree

@@ -576,7 +579,7 @@ def handle_append(item, src_repo, dest_repo, opt):
if item.dest.hash:
assert item.dest.type in ('branch', 'commit', 'save'), item.dest
return append_commits(item.src, item.dest.hash, src_repo, dest_repo,
- item.spec.missing, opt)
+ item.spec.missing, item.spec.excludes, opt)


def resolve_pick(spec, src_repo, dest_repo, *, rewrite):
@@ -621,12 +624,12 @@ def handle_pick(item, src_repo, dest_repo, opt):
# if the dest is committish, make it the parent
if item.dest.type in ('branch', 'commit', 'save'):
return append_commit(item.src, item.dest.hash, src_repo, dest_repo,
- item.spec.missing, opt)
+ item.spec.missing, item.spec.excludes, opt)
assert item.dest.path.startswith(b'/.tag/'), item.dest
# no parent; either dest is a non-commit tag and we should clobber
# it, or dest doesn't exist.
return append_commit(item.src, None, src_repo, dest_repo, item.spec.missing,
- opt)
+ item.spec.excludes, opt)


def resolve_new_tag(spec, src_repo, dest_repo):
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/repo/local.py | 5 +++++
lib/bup/repo/remote.py | 6 ++++++
2 files changed, 11 insertions(+)

diff --git a/lib/bup/repo/local.py b/lib/bup/repo/local.py
index f09d350c..35fc732a 100644
--- a/lib/bup/repo/local.py
+++ b/lib/bup/repo/local.py
@@ -53,6 +53,11 @@ class LocalRepo(RepoProtocol):
self._deduplicate_writes = True
self.closed = False

+ def __repr__(self):
+ cls = self.__class__
+ return f'<{cls.__module__}.{cls.__name__} object at {hex(id(self))}' \
+ f' repo_dir={self.repo_dir!r}>'
+
def close(self):
if not self.closed:
self.closed = True
diff --git a/lib/bup/repo/remote.py b/lib/bup/repo/remote.py
index f16e3c51..9545eb23 100644
--- a/lib/bup/repo/remote.py
+++ b/lib/bup/repo/remote.py
@@ -6,6 +6,7 @@ from bup.repo.base import _make_base, RepoProtocol
class RemoteRepo(RepoProtocol):
def __init__(self, address, create=False, compression_level=None,
max_pack_size=None, max_pack_objects=None):
+ self._address = address
self.closed = True # in case Client instantiation fails
self.client = client.Client(address, create=create)
self.closed = False
@@ -23,6 +24,11 @@ class RemoteRepo(RepoProtocol):
self.resolve = self.client.resolve
self._packwriter = None

+ def __repr__(self):
+ cls = self.__class__
+ return f'<{cls.__module__}.{cls.__name__} object at {hex(id(self))}' \
+ f' address={self._address!r}>'
+
def close(self):
if not self.closed:
self.closed = True
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 29 +++++++++++++++++------------
lib/bup/io.py | 6 ++++++
test/int/test_io.py | 20 +++++++++++++++++++-
3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
index 105ff99a..c9912603 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/cmd/rewrite.py
@@ -1,6 +1,7 @@

from binascii import hexlify, unhexlify
from contextlib import closing
+from itertools import chain
from stat import S_ISDIR, S_ISLNK, S_ISREG
import os
import sqlite3
@@ -13,7 +14,7 @@ from bup.helpers import \
valid_save_name, log,
parse_rx_excludes,
should_rx_exclude_path)
-from bup.io import path_msg
+from bup.io import path_msg, qsql_id
from bup.tree import Stack
from bup.repo import make_repo
from bup.config import derive_repo_addr, ConfigError
@@ -29,6 +30,18 @@ exclude-rx= skip paths matching the unanchored regex (may be repeated)
exclude-rx-from= skip --exclude-rx patterns in file (may be repeated)
"""

+def prep_mapping_table(db, split_cfg):
+ settings = [str(x) for x in chain.from_iterable(sorted(split_cfg.items()))]
+ for x in settings: assert '_' not in x
+ table_id = f'bup_rewrite_mapping_to_bits_{"_".join(settings)}'
+ table_id = qsql_id(table_id)
+ db.execute(f'create table if not exists {table_id}'
+ ' (src blob primary key,'
+ ' dst blob not null,'
+ ' mode integer,'
+ ' size integer)'
+ ' without rowid')
+ return table_id

def converted_already(dstrepo, item, vfs_dir, db, mapping):
size = -1 # irrelevant
@@ -187,15 +200,7 @@ def rewrite_branch(srcrepo, src, dstrepo, dst, excludes, workdb, fatal):
commits.reverse()
with closing(workdb.cursor()) as wdbc:
try:
- tablename = 'mapping_to_bits'
- for k, v in split_cfg.items():
- tablename += f'_{k}_{v}'
- workdb.execute(f"create table if not exists {tablename}"
- ' (src blob primary key,'
- ' dst blob not null,'
- ' mode integer,'
- ' size integer)'
- ' without rowid')
+ mapping = prep_mapping_table(wdbc, split_cfg)

# Maintain a stack of information representing the current
# location in the archive being constructed.
@@ -211,9 +216,9 @@ def rewrite_branch(srcrepo, src, dstrepo, dst, excludes, workdb, fatal):
coid=commit)
for fullname, item in vfs_walk_recursively(srcrepo, dstrepo,
citem, excludes,
- wdbc, tablename):
+ wdbc, mapping):
rewrite_item(item, commit_name, fullname, srcrepo, src,
- dstrepo, split_cfg, stack, wdbc, tablename)
+ dstrepo, split_cfg, stack, wdbc, mapping)

while len(stack) > 1: # pop all parts above root folder
stack.pop()
diff --git a/lib/bup/io.py b/lib/bup/io.py
index d4e0116b..4ccdc58b 100644
--- a/lib/bup/io.py
+++ b/lib/bup/io.py
@@ -253,6 +253,12 @@ def path_msg(x):
return enc_shs(fsdecode(x))


+def qsql_id(s):
+ return ''.join(('"', s.replace('"', '""'), '"'))
+def qsql_str(s):
+ return ''.join(("'", s.replace("'", "''"), "'"))
+
+
assert not hasattr(py_mmap.mmap, '__del__')

class mmap(py_mmap.mmap):
diff --git a/test/int/test_io.py b/test/int/test_io.py
index d683aa0c..f9e306b5 100644
--- a/test/int/test_io.py
+++ b/test/int/test_io.py
@@ -1,7 +1,7 @@

from wvpytest import *

-from bup.io import enc_dsq, enc_dsqs, enc_sh, enc_shs
+from bup.io import enc_dsq, enc_dsqs, enc_sh, enc_shs, qsql_id, qsql_str


def _dsq_enc_byte(b):
@@ -101,3 +101,21 @@ def test_enc_shs():
== enc_shs(b'\x80'.decode('ascii', errors='surrogateescape'))
assert r"$'\xb5'" \
== enc_shs(b'\xb5'.decode('utf-8', errors='surrogateescape'))
+
+def test_qsql_id():
+ assert '""""' == qsql_id('"')
+ assert '"x"' == qsql_id('x')
+ assert '"""x"' == qsql_id('"x')
+ assert '"x"""' == qsql_id('x"')
+ assert '"x"""' == qsql_id('x"')
+ assert '"x""y"' == qsql_id('x"y')
+ assert '"x""y""z"' == qsql_id('x"y"z')
+
+def test_qsql_str():
+ assert "''''" == qsql_str("'")
+ assert "'x'" == qsql_str("x")
+ assert "'''x'" == qsql_str("'x")
+ assert "'x'''" == qsql_str("x'")
+ assert "'x'''" == qsql_str("x'")
+ assert "'x''y'" == qsql_str("x'y")
+ assert "'x''y''z'" == qsql_str("x'y'z")
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
We need to track the vfs_mode in order to distinguish a
pipe/device/etc. from an empty file since they'll all have the same
oid.

Insert a row into the db for every item so that we'll know we've
already checked any whose oid doesn't change, i.e. so we don't attempt
to split the same oid more than once.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 88 +++++++++++++++++++++++-------------------
1 file changed, 48 insertions(+), 40 deletions(-)

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
index d2b49137..27bef880 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/cmd/rewrite.py
@@ -36,10 +36,12 @@ def prep_mapping_table(db, split_cfg):
table_id = f'bup_rewrite_mapping_to_bits_{"_".join(settings)}'
table_id = qsql_id(table_id)
db.execute(f'create table if not exists {table_id}'
- ' (src blob primary key,'
+ ' (src blob,'
' dst blob not null,'
- ' mode integer,'
- ' size integer)'
+ ' vfs_mode integer,'
+ ' git_mode integer,'
+ ' size integer,'
+ ' primary key (src, vfs_mode))'
' without rowid')
return table_id

@@ -53,25 +55,20 @@ def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
"""
if isinstance(item.meta, metadata.Metadata):
size = item.meta.size
- mode = item.meta.mode
+ item_mode = item.meta.mode
else:
size = None
- mode = item.meta
-
- # if we know the size, and the oid exists already (small file w/o
- # hashsplit) then simply return it can't do that if it's a
- # directory, since it might exist but in the non-augmented
- # version, so dirs always go through the database lookup
-
- # FIXME: this seems wrong - what if we're splitting in-repo to smaller chunks?
- #if not vfs_dir and size is not None and dstrepo.exists(item.oid):
- # return item.oid, mode
- db.execute(f'select dst, mode, size from {mapping} where src = ?',
- (item.oid,))
+ item_mode = item.meta
+
+ db.execute(f'select dst, vfs_mode, git_mode, size from {mapping}'
+ ' where src = ? and vfs_mode = ?',
+ (item.oid, item_mode))
data = db.fetchone()
if not data:
- return item, None, None
- dst, mode, size = data
+ return item, None, None, None
+ assert db.fetchone() is None
+ dst, vfs_mode, git_mode, size = data
+ assert vfs_mode == item_mode
# augment the size if appropriate
if size is not None and isinstance(item.meta, metadata.Metadata):
if item.meta.size is not None:
@@ -81,9 +78,9 @@ def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
item.meta.size = size
# if we have it in the DB and in the destination repo, return it
if dstrepo.exists(dst):
- return item, dst, mode
+ return item, dst, vfs_mode, git_mode
# this only happens if you reuse a database
- return item, None, None
+ return item, None, None, None

def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
fullname=b''):
@@ -95,7 +92,7 @@ def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
if should_rx_exclude_path(check_name, excludes):
continue
if S_ISDIR(vfs.item_mode(item)):
- item, oid, _ = previous_conversion(dstrepo, item, True, db, mapping)
+ item, oid, _, _ = previous_conversion(dstrepo, item, True, db, mapping)
if oid is None:
yield from vfs_walk_recursively(srcrepo, dstrepo, item,
excludes, db, mapping,
@@ -125,7 +122,8 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
meta = None
stack.push(dir_name, meta)

- item, oid, mode = previous_conversion(dstrepo, item, not filen, wdbc, mapping)
+ item, oid, vfs_mode, git_mode = \
+ previous_conversion(dstrepo, item, not filen, wdbc, mapping)

if not filen:
if len(stack) == 1:
@@ -133,20 +131,23 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
# Since there's no filename, this is a subdir -- finish it.
newtree = stack.pop(override_tree=oid)
if oid is None:
- wdbc.execute(f'insert into {mapping} (src, dst) values (?, ?)',
- (item.oid, newtree))
+ assert vfs_mode is None, item.oid.hex()
+ assert git_mode is None, item.oid.hex()
+ vfs_mode = vfs.item_mode(item)
+ wdbc.execute(f'insert into {mapping}'
+ ' (src, dst, vfs_mode) values (?, ?, ?)',
+ (item.oid, newtree, vfs_mode))
return

- vfs_mode = vfs.item_mode(item)
-
# already converted - oid and mode are known
if oid is not None:
- assert mode is not None, oid
- stack.append_to_current(filen, vfs_mode, mode, oid, item.meta)
+ assert vfs_mode is not None, oid.hex()
+ assert git_mode is not None, oid.hex()
+ stack.append_to_current(filen, vfs_mode, git_mode, oid, item.meta)
return

+ vfs_mode = vfs.item_mode(item)
item_size = None
- size_augmented = False
if S_ISREG(vfs_mode):
item_size = 0
def write_data(data):
@@ -154,7 +155,7 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
item_size += len(data)
return dstrepo.write_data(data)
with vfs.tree_data_reader(srcrepo, item.oid) as f:
- mode, oid = hashsplit.split_to_blob_or_tree(
+ git_mode, oid = hashsplit.split_to_blob_or_tree(
write_data, dstrepo.write_tree,
hashsplit.from_config([f], split_cfg))
if isinstance(item.meta, metadata.Metadata):
@@ -162,20 +163,18 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
# must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
item.meta.size = item_size
- size_augmented = True
else:
assert item.meta.size == item_size
elif S_ISDIR(vfs_mode):
assert False # handled above
elif S_ISLNK(vfs_mode):
target = vfs.readlink(srcrepo, item)
- mode, oid = (GIT_MODE_SYMLINK, dstrepo.write_symlink(target))
+ git_mode, oid = GIT_MODE_SYMLINK, dstrepo.write_symlink(target)
if isinstance(item.meta, metadata.Metadata):
if item.meta.size is None:
# must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
item.meta.size = len(item.meta.symlink_target)
- size_augmented = True
else:
assert item.meta.size == len(item.meta.symlink_target)
item_size = len(target)
@@ -183,13 +182,22 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
# Everything else should be fully described by its metadata,
# so just record an empty blob, so the paths in the tree and
# .bupm will match up.
- mode, oid = (GIT_MODE_FILE, dstrepo.write_data(b''))
-
- if size_augmented or oid != item.oid:
- wdbc.execute(f'insert into {mapping} (src, dst, mode, size)'
- ' values (?, ?, ?, ?)',
- (item.oid, oid, mode, item_size))
- stack.append_to_current(filen, vfs_mode, mode, oid, item.meta)
+ assert item_size is None
+ git_mode, oid = GIT_MODE_FILE, dstrepo.write_data(b'')
+
+ wdbc.execute(f'select src, dst, vfs_mode, size from {mapping}'
+ ' where src = ? and vfs_mode = ?',
+ (item.oid, vfs_mode))
+ row = wdbc.fetchone()
+ assert wdbc.fetchone() is None
+ if row: # reusing previously populated db
+ assert row == (item.oid, oid, vfs_mode, git_mode, item_size)
+ else:
+ wdbc.execute(f'insert into {mapping}'
+ ' (src, dst, vfs_mode, git_mode, size)'
+ ' values (?, ?, ?, ?, ?)',
+ (item.oid, oid, vfs_mode, git_mode, item_size))
+ stack.append_to_current(filen, vfs_mode, git_mode, oid, item.meta)

def rewrite_branch(srcrepo, src, dstrepo, dst, excludes, workdb, fatal):
# Currently, the workdb must always be ready to commit (see finally below)
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
There's no need to track symlinks, special files, etc., since they
don't require splitting, and so it's easy and inexpensive to just
(re)create them when needed.

This will also allow us to simplify the mapping table because we no
longer have the possibility that an oid could map to multiple
items (e.g. an empty file, a fifo, and a block device, since bup
stores all of these as empty blobs distinguished by their modes).

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 44 ++++++++++++++++++++++++++----------------
1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
index 2a647951..b3fb3666 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/cmd/rewrite.py
@@ -104,6 +104,19 @@ def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
else:
yield itemname, item

+def rewrite_link(item, item_mode, name, srcrepo, dstrepo, stack):
+ assert isinstance(name, bytes)
+ target = vfs.readlink(srcrepo, item)
+ git_mode, oid = GIT_MODE_SYMLINK, dstrepo.write_symlink(target)
+ if isinstance(item.meta, metadata.Metadata):
+ if item.meta.size is None:
+ # must not modify vfs results (see vfs docs)
+ item = vfs.copy_item(item)
+ item.meta.size = len(item.meta.symlink_target)
+ else:
+ assert item.meta.size == len(item.meta.symlink_target)
+ stack.append_to_current(name, item_mode, git_mode, oid, item.meta)
+
def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
stack, wdbc, mapping):
dirn, filen = os.path.split(fullname)
@@ -124,6 +137,20 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
meta = None
stack.push(dir_name, meta)

+ # First, things that can't be affected by the rewrite
+ item_mode = vfs.item_mode(item)
+ if S_ISLNK(item_mode):
+ rewrite_link(item, item_mode, filen, srcrepo, dstrepo, stack)
+ return
+ if not S_ISREG(item_mode) and not S_ISDIR(item_mode):
+ # Everything here (pipes, devices, etc.) should be fully
+ # described by its metadata, and so bup just saves an empty
+ # "placeholder" blob in the git tree (so the tree and .bupm
+ # will match up).
+ git_mode, oid = GIT_MODE_FILE, dstrepo.write_data(b'')
+ stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)
+ return
+
item, oid, vfs_mode, git_mode = \
previous_conversion(dstrepo, item, not filen, wdbc, mapping)

@@ -169,23 +196,6 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
assert item.meta.size == item_size
elif S_ISDIR(vfs_mode):
assert False # handled above
- elif S_ISLNK(vfs_mode):
- target = vfs.readlink(srcrepo, item)
- git_mode, oid = GIT_MODE_SYMLINK, dstrepo.write_symlink(target)
- if isinstance(item.meta, metadata.Metadata):
- if item.meta.size is None:
- # must not modify vfs results (see vfs docs)
- item = vfs.copy_item(item)
- item.meta.size = len(item.meta.symlink_target)
- else:
- assert item.meta.size == len(item.meta.symlink_target)
- item_size = len(target)
- else:
- # Everything else should be fully described by its metadata,
- # so just record an empty blob, so the paths in the tree and
- # .bupm will match up.
- assert item_size is None
- git_mode, oid = GIT_MODE_FILE, dstrepo.write_data(b'')

wdbc.execute(f'select src, dst, vfs_mode, size from {mapping}'
' where src = ? and vfs_mode = ?',
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
See the changes to bup-get(1) and the release notes for additional
information.

Remove --rewrite-db for now so that we don't have to consider the
potential for "skew" across invocations, e.g. with respect to the
--repair-id. We can always reconsider it later if we like.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 44 +++-
dev/make-splittable-tree | 31 +++
lib/bup/cmd/get.py | 99 +++++----
lib/bup/commit.py | 28 ++-
lib/bup/helpers.py | 6 +
lib/bup/metadata.py | 3 +
lib/bup/repair.py | 46 +++++
lib/bup/rewrite.py | 322 ++++++++++++++++++++++++------
lib/bup/tree.py | 9 +
lib/bup/vfs.py | 8 +
note/main.md | 15 +-
test/ext/test-get-rewrite-missing | 279 ++++++++++++++++++++++++++
test/ext/test-rewrite | 9 +-
test/ext/test-rm | 64 ++----
test/ext/test_split_trees.py | 21 +-
test/int/test_commit.py | 11 +-
wvtest-bash.sh | 28 +++
17 files changed, 844 insertions(+), 179 deletions(-)
create mode 100755 dev/make-splittable-tree
create mode 100644 lib/bup/repair.py
create mode 100755 test/ext/test-get-rewrite-missing

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index bddd0af1..91edc1ec 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -142,12 +142,6 @@ used to help test before/after results.)
above) than `bup get` itself. Please consider validating the
results carefully for now.

-\--rewrite-db=*path*
-: place the rewrite database at *path*. Re-using an existing
- database (e.g. after an interruption) can allow the rewrite to
- resume without repeating expensive operations. By default, a
- transient database will be placed in TMPDIR and removed on exit.
-
\--exclude-rx=*pattern*
: exclude any path matching *pattern*, which must be a Python regular
expression (http://docs.python.org/library/re.html). The pattern
@@ -187,11 +181,30 @@ used to help test before/after results.)
pack.compression or core.compression, or 1 (fast, loose
compression).

-\--missing <fail|ignore>
+\--repair-id ID
+: set the repair session identifier, defaults to a UUID (v4). This
+ identifier will be included in repairs made during the transfer,
+ i.e. via `--missing replace`. Currently, the identifier must be
+ ASCII and must not include control characters or DEL (i.e. must be
+ comprised of bytes >= 32 and < 127).
+
+\--missing <fail|ignore|replace>
: when missing objects are encountered during a transfer, either
- `fail` (exit with nonzero status, the default) or `ignore` them.
- The latter is currently only supported by `--unnamed`, and is
- potentially *dangerous*.
+ `fail` (exit with nonzero status, the default), `ignore` them
+ (currently only supported by `--unnamed`, and potentially
+ *dangerous*), or `replace` them with placeholders (see
+ REPLACEMENTS below).
+
+# REPLACEMENTS
+
+Saves (commits) with missing objects can be repaired by specifying
+`--missing replace` which will substitute synthesized "repair files"
+for any paths with missing objects. There is currently no support for
+retrieving unaffected parts of split files or trees; the entire file
+or tree is replaced with a repair file.
+
+These repair files contain the `--repair-id` and information about
+the replacement.

# EXAMPLES

@@ -249,7 +262,18 @@ used to help test before/after results.)
$ bup rm archives
$ bup gc
$ git --git-dir "$BUP_DIR" branch -m archives-resplit archives
+ #
+ # Repair a single save with missing objects.
+ $ bup get --missing replace --pick archives/latest fixed
+ #
+ # Check that fixed/latest looks OK, perhaps via trial
+ # restores, joining it, etc. (see CAUTION above).
+
+
+# EXIT STATUS

+An exit status of 3 indicates that repairs were needed and were
+successful, and that no other errors occurred.

# SEE ALSO

diff --git a/dev/make-splittable-tree b/dev/make-splittable-tree
new file mode 100755
index 00000000..08d61068
--- /dev/null
+++ b/dev/make-splittable-tree
@@ -0,0 +1,31 @@
+#!/bin/sh
+"""": # -*-python-*-
+python="$(dirname "$0")/python" || exit $?
+exec "$python" "$0" ${1+"$@"}
+"""
+
+from os import environb as environ, mkdir
+from sys import argv, stderr
+import os.path as path, sys
+
+
+# FIXME: change this to request --depth 1 level 2 etc.?
+
+
+if len(argv) != 2:
+ print('Usage: make-splittable-tree WHERE', file=stderr)
+ sys.exit(2)
+
+where = argv[1]
+
+try:
+ mkdir(where)
+except FileExistsError:
+ print(f'error: {where!r} already exists', file=stderr)
+ sys.exit(2)
+
+for i in range(int(environ.get(b'BUP_SPLITTABLE_COUNT', '10000'))):
+ d = f'{where}/some-random-path-name-to-make-the-tree-bigger-{i}'
+ mkdir(d)
+ with open(path.join(d, 'data'), 'w') as f:
+ pass
diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 73c1c493..945e3ffb 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -4,16 +4,19 @@ from collections import namedtuple
from dataclasses import replace as dcreplace
from re import Pattern
from stat import S_ISDIR
+from textwrap import fill
from typing import Optional, Union
+from uuid import uuid4
import os, re, sys, textwrap, time

from bup import client, compat, git, hashsplit, vfs
from bup.commit import commit_message
-from bup.compat import argv_bytes, dataclass
+from bup.compat import argv_bytes, dataclass, get_argvb
from bup.config import derive_repo_addr
from bup.git import MissingObject, get_cat_data, parse_commit, walk_object
from bup.helpers import \
(EXIT_FAILURE,
+ EXIT_RECOVERED,
EXIT_SUCCESS,
debug1,
hostname,
@@ -22,9 +25,11 @@ from bup.helpers import \
nullctx,
parse_num,
parse_rx_excludes,
+ saved_errors,
tty_width)
from bup.io import path_msg
from bup.pwdgrp import userfullname, username
+from bup.repair import MissingConfig, RepairInfo, valid_repair_id
from bup.repo import LocalRepo, make_repo
from bup.rewrite import Rewriter

@@ -54,11 +59,11 @@ argspec = (
('-c, --print-commits', 'output a commit id for each ref set'),
('--print-tags', 'output an id for each tag'),
('--rewrite', 'rewrite data according to destination repo settings'),
- ('--rewrite-db PATH', 'transient rewrite database (in TMPDIR by default)'),
('--exclude-rx REGEX', 'skip paths matching the unanchored regex (may be repeated)'),
('--exclude-rx-from PATH', 'skip --exclude-rx patterns in PATH (may be repeated)'),
('--bwlimit BWLIMIT', 'maximum bytes/sec to transmit to server'),
- ('--missing <fail|ignore>', 'how to handle missing objects (default: fail)'),
+ ('--missing <fail|ignore|replace>', 'behavior for missing objects (default: fail)'),
+ ('--repair-id ID', 'repair session identifier (default: UUID v4)'),
('-0, -1, -2, -3, -4, -5, -6, -7, -8, -9, --compress LEVEL',
'set compression LEVEL (default: 1)'))),

@@ -123,12 +128,6 @@ def require_n_args_or_die(n, args):
assert len(result[0]) == n
return result

-@dataclass(slots=True, frozen=True)
-class MissingConfig:
- mode: Union['fail', 'ignore']
- def __post_init__(self):
- assert self.mode in ('fail', 'ignore')
-
@dataclass(slots=True, frozen=True)
class Spec:
method: str
@@ -153,16 +152,17 @@ def parse_args(args):
opt.print_commits = opt.print_trees = opt.print_tags = False
opt.bwlimit = None
opt.compress = None
- opt.rewrite_db = None
+ opt.repair_info = None
opt.source = opt.remote = None
opt.target_specs = []

# For now, rewriting is a "global" state, i.e. enabled for all
# specs or none. Since we don't want to create a Rewriter until
# we've finished checking the requests (e.g. are past the
- # resolvers, True is used as an intermediate placeholder).
+ # resolvers), the spec's rewriter will be set to True to indicate
+ # that it needs the real Rewriter once we have it.
rewrite = None # None means "didn't specify"
- missing = MissingConfig('fail')
+ missing = 'fail'
exclude_opts = []
remaining = args[1:] # Skip argv[0]
while remaining:
@@ -173,17 +173,24 @@ def parse_args(args):
elif arg in (b'-v', b'--verbose'):
opt.verbose += 1
remaining = remaining[1:]
+ elif arg == b'--missing':
+ (val,), remaining = require_n_args_or_die(1, remaining)
+ if val not in (b'fail', b'ignore', b'replace'):
+ misuse(f'--missing must be fail, ignore, or replace, not {val!r}')
+ missing = val.decode('ascii')
elif arg == b'--ignore-missing':
- missing = MissingConfig('ignore')
+ missing = 'ignore'
remaining = remaining[1:]
- elif arg == b'--missing':
- (missing,), remaining = require_n_args_or_die(1, remaining)
- if missing not in (b'fail', b'ignore'):
- misuse('--missing argument must be fail or ignore')
- missing = MissingConfig(missing.decode('ascii'))
elif arg == b'--no-ignore-missing':
- missing = MissingConfig('fail')
+ missing = 'fail'
remaining = remaining[1:]
+ elif arg == b'--repair-id':
+ (val,), remaining = require_n_args_or_die(1, remaining)
+ if not val:
+ misuse('empty --repair-id')
+ if not valid_repair_id(val):
+ misuse('--repair-id must be ASCII without control characters or DEL')
+ opt.repair_info = RepairInfo(val, command=get_argvb())
elif arg in (b'--ff', b'--append', b'--pick', b'--force-pick',
b'--new-tag', b'--replace', b'--unnamed'):
(ref,), remaining = require_n_args_or_die(1, remaining)
@@ -208,8 +215,6 @@ def parse_args(args):
rewrite, remaining = True, remaining[1:]
elif arg == b'--no-rewrite':
rewrite, remaining = False, remaining[1:]
- elif arg == b'--rewrite-db':
- (opt.rewrite_db,), remaining = require_n_args_or_die(1, remaining)
elif arg in (b'--exclude-rx', b'--exclude-rx-from'): # handled later
(val,), remaining = require_n_args_or_die(1, remaining)
exclude_opts.append((arg, val))
@@ -232,12 +237,14 @@ def parse_args(args):
continue
else:
misuse()
+ if opt.repair_info is None:
+ opt.repair_info = RepairInfo(str(uuid4()).encode('ascii'),
+ command=get_argvb())
excludes = parse_rx_excludes(exclude_opts, misuse)
if excludes and not rewrite:
misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
- opt.target_specs = [dcreplace(x,
- missing=missing,
- excludes=excludes,
+ missing = MissingConfig(mode=missing, repair_info=opt.repair_info)
+ opt.target_specs = [dcreplace(x, missing=missing, excludes=excludes,
rewriter=rewrite)
for x in opt.target_specs]
return opt
@@ -315,8 +322,8 @@ def append_commit(src_loc, parent, src_repo, dest_repo, missing, rewriter,
root, ref, save = path
assert isinstance(save[1], (vfs.Commit, vfs.FakeLink)), path
assert isinstance(ref[1], vfs.RevList), path
- return rewriter.append_save(path, parent, src_repo, dest_repo, excludes)
-
+ return rewriter.append_save(path, parent, src_repo, dest_repo, missing,
+ excludes)

def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, rewriter,
excludes):
@@ -354,7 +361,7 @@ def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, rewriter,
coid = unhexlify(commit)
last_c, tree = rewriter.append_save(path + (entry_for_coid[coid],),
last_c, src_repo, dest_repo,
- excludes)
+ missing, excludes)
assert tree is not None
return last_c, tree

@@ -787,16 +794,8 @@ def log_item(name, type, opt, tree=None, commit=None, tag=None):
last = '/'
log('%s%s\n' % (path_msg(name), last))

-def main(argv):
- opt = parse_args(argv)
- git.check_repo_or_die()
- if opt.source:
- opt.source = argv_bytes(opt.source)
- if opt.bwlimit:
- client.bwlimit = parse_num(opt.bwlimit)
- if not opt.target_specs:
- misuse('no methods specified')

+def get_everything(opt):
with LocalRepo(repo_dir=opt.source) as src_repo, \
make_repo(derive_repo_addr(remote=opt.remote, die=misuse),
compression_level=opt.compress) as dest_repo:
@@ -807,7 +806,7 @@ def main(argv):
# For now (maybe forever), they're all the same
rewrite = opt.target_specs[0].rewriter
assert all(x.rewriter == rewrite for x in opt.target_specs), \
- [x.rewriter for x in opt.target_specs]
+ opt.target_specs

if src_split_cfg != dest_split_cfg and rewrite is None:
misuse('repository configs differ; specify --rewrite or --no-rewrite')
@@ -819,8 +818,8 @@ def main(argv):
# before creating any database via the Rewriter.
target_items = resolve_targets(opt.target_specs, src_repo, dest_repo)

- with (Rewriter(split_cfg=dest_split_cfg, db=opt.rewrite_db) \
- if rewrite else nullctx) as rewriter:
+ with (Rewriter(split_cfg=dest_split_cfg) if rewrite else nullctx) \
+ as rewriter:

target_items = [(x if not x.spec.rewriter
else x._replace(spec=dcreplace(x.spec, rewriter=rewriter)))
@@ -888,3 +887,25 @@ def main(argv):
log('updated %r (%s)\n' % (ref_name, new_hex))
except (git.GitError, client.ClientError) as ex:
note_error('unable to update ref %r: %s\n' % (ref_name, ex))
+
+
+def main(argv):
+ opt = parse_args(argv)
+ git.check_repo_or_die()
+ if opt.source:
+ opt.source = argv_bytes(opt.source)
+ if opt.bwlimit:
+ client.bwlimit = parse_num(opt.bwlimit)
+ if not opt.target_specs:
+ misuse('no methods specified')
+
+ get_everything(opt)
+
+ if opt.repair_info.repair_count() and not saved_errors:
+ msg = ('Repairs were needed and successful; see above. Additional'
+ ' information may be found in the git log. Search for '
+ ' "Repair-ID:" in "git --git-dir REPO log ..." for the related'
+ ' references.\n')
+ log(f'\n{fill(msg, width=tty_width(), break_on_hyphens=False)}\n')
+ return EXIT_RECOVERED
+ return 0
diff --git a/lib/bup/commit.py b/lib/bup/commit.py
index 3601d31a..41f3b847 100644
--- a/lib/bup/commit.py
+++ b/lib/bup/commit.py
@@ -127,9 +127,25 @@ def create_commit_blob(tree, parent,
return b'\n'.join(l)


-def commit_message(message, command):
- message = message.rstrip()
- return b'\n'.join((message,
- b'',
- b'Bup-Version: %s' % version,
- b'Bup-Argv: %s' % b' '.join(map(enc_sh, command))))
+_trailer_rx = re.compile(br'(?m)^[^\t ]+:.*(?:\r\n|\n)*\Z')
+
+def has_trailers(message):
+ # For now, a trailer key is anything not containing an ascii tab
+ # or space followed by a colon (i.e. ignoring trailer.separators,
+ # and only recognizing tab and space as
+ # "whitespace"). cf. git-interpret-trailers(1).
+ return bool(_trailer_rx.search(message))
+
+
+def commit_message(message, argv, extra_trailers=None):
+ for trailer in extra_trailers or []:
+ assert isinstance(trailer, bytes)
+ for b in trailer: assert b >= 20 and b < 127, trailer
+ parts = [message.rstrip()]
+ if not has_trailers(message):
+ parts.append(b'')
+ parts.extend([b'Bup-Version: %s' % version,
+ b'Bup-Argv: %s' % b' '.join(map(enc_sh, argv))])
+ if extra_trailers:
+ parts.extend(extra_trailers)
+ return b'\n'.join(parts)
diff --git a/lib/bup/helpers.py b/lib/bup/helpers.py
index 9b6123af..beb7dd40 100644
--- a/lib/bup/helpers.py
+++ b/lib/bup/helpers.py
@@ -29,10 +29,16 @@ from bup.options import _tty_width as tty_width
# is asking a question with a yes or no answer. Eventually all
# commands should avoid exiting with 1 for errors.

+# EXIT_RECOVERED indicates something went wrong, but it was possible
+# to recover from the problem; e.g. bup get ... --missing
+# ignore/replace encountered missing objects and was able to handle
+# them as requested without additional errors.
+
EXIT_SUCCESS = 0
EXIT_TRUE = 0
EXIT_FALSE = 1
EXIT_FAILURE = 2
+EXIT_RECOVERED = 3


def dict_subset(dict, keys):
diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index 7a275657..54a0cf44 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -229,6 +229,9 @@ class Metadata:
# Timestamps are (sec, ns), relative to 1970-01-01 00:00:00, ns
# must be non-negative and < 10**9.

+ # Consider bup.rewrite (e.g. _blob_replacement()) when making
+ # changes to the records (particularly the common records).
+
def _add_common(self, path, st):
assert(st.st_uid >= 0)
assert(st.st_gid >= 0)
diff --git a/lib/bup/repair.py b/lib/bup/repair.py
new file mode 100644
index 00000000..bf050c66
--- /dev/null
+++ b/lib/bup/repair.py
@@ -0,0 +1,46 @@
+
+from binascii import hexlify
+from typing import Optional, Union
+
+from bup.compat import dataclass
+from bup.io import enc_sh
+
+
+def valid_repair_id(s):
+ assert isinstance(s, bytes), s
+ for b in s:
+ if b < 32 or b > 126:
+ return False
+ return True
+
+
+class RepairInfo:
+ __slots__ = 'id', 'command', '_others', '_replacements'
+ def __init__(self, id, *, command=None):
+ assert valid_repair_id(id)
+ self.id = id
+ self.command = command
+ self._others = 0
+ self._replacements = []
+ def note_repair(self): self._others += 1
+ def path_replaced(self, path, oid, new_oid):
+ self._replacements.append((path, oid, new_oid))
+ def repair_count(self): return len(self._replacements) + self._others
+ def repair_trailers(self):
+ if not self.repair_count():
+ return []
+ trailers = [b'Bup-Repair-ID: ' + self.id]
+ for path, oid, new_oid in self._replacements:
+ trailers.append(b'Bup-Replaced: %s %s'
+ % (hexlify(new_oid), enc_sh(path)))
+ return trailers
+
+
+@dataclass(slots=True, frozen=True)
+class MissingConfig:
+ mode: Union['fail', 'ignore', 'replace']
+ repair_info: Optional[RepairInfo] = None
+ def __post_init__(self):
+ assert self.mode in ('fail', 'ignore', 'replace')
+ if self.mode == 'replace':
+ assert isinstance(self.repair_info, RepairInfo), self.repair_info
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index fc96fcd7..96318235 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -2,18 +2,32 @@
from binascii import hexlify
from contextlib import ExitStack, closing, nullcontext
from itertools import chain
-from os.path import join as pj
+from os.path import join as joinp
from stat import S_ISDIR, S_ISLNK, S_ISREG
+from typing import Any, Sequence
import os, sqlite3, time

from bup import hashsplit, metadata, vfs
+from bup.commit import commit_message
+from bup.compat import dataclass
from bup.git import get_cat_data, parse_commit
from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK, GIT_MODE_TREE
from bup.helpers import \
- hostname, path_components, should_rx_exclude_path, temp_dir
-from bup.io import qsql_id
+ (hostname,
+ log,
+ path_components,
+ should_rx_exclude_path,
+ temp_dir)
+from bup.io import path_msg, qsql_id
+from bup.metadata import Metadata
from bup.pwdgrp import userfullname, username
+from bup.repair import MissingConfig
from bup.tree import Stack
+from bup.vfs import Item, MissingObject, default_file_mode
+
+
+# Currently only handles replacing entire vfs-level trees if any
+# constituent object is missing, entire files, and symlinks.


def _fs_path_from_vfs(path):
@@ -27,7 +41,9 @@ def _prep_mapping_table(db, split_cfg):
# This currently only needs to track items that may be split,
# depending on the current repo settings (e.g. files and
# directories); it records the result so we can re-use it if we
- # encounter the item again.
+ # encounter the item again. It explicitly does not store any
+ # rewrites (repairs) because the repair id can change across
+ # saves, and because rewrites may change the type (tree to blob).
settings = [str(x) for x in chain.from_iterable(sorted(split_cfg.items()))]
for x in settings: assert '_' not in x
table_id = f'bup_rewrite_mapping_to_bits_{"_".join(settings)}'
@@ -79,36 +95,146 @@ def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
return item, dst, None
return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

-def _vfs_walk_recursively(srcrepo, dstrepo, path, excludes, db, mapping):
+def _path_repaired(path, oid, replacement_oid, missing_oid, repair_info):
+ if repair_info.repair_count() == 0:
+ log(b'repairs needed, repair-id: %s\n' % repair_info.id)
+ fs_path = _fs_path_from_vfs(path)
+ repair_info.path_replaced(fs_path, oid, replacement_oid)
+ ep = path_msg(fs_path)
+ log(f'warning: missing object {missing_oid.hex()} for {ep}\n')
+ log(f'repaired {ep} {oid.hex()} -> {replacement_oid.hex()}\n')
+
+def _blob_replacement(repo, meta, content):
+ # REVIEW: does all this seem reasonable?
+ now = time.time()
+ oid = repo.write_data(content)
+ rm = Metadata()
+ rm.mode = default_file_mode
+ rm.rdev = 0
+ rm.atime = rm.mtime = rm.ctime = now
+ rm.size = len(content)
+ if isinstance(meta, Metadata):
+ rm.uid = meta.uid
+ rm.gid = meta.gid
+ rm.user = meta.user
+ rm.group = meta.group
+ else:
+ rm.uid = rm.gid = 0
+ rm.user = rm.group = b''
+ return Item(oid=oid, meta=rm)
+
+def _replacement_item(repo, item, kind, kind_msg, repair_id, missing_oid):
+ # Currently assumes any trailer manipulations will preserve
+ # trailer ordering so we can have Missing instead of Bup-Missing,
+ # etc., and Missing should always be last.
+ m = [b'This is a replacement for a ', kind_msg, b' that was unreadable\n',
+ b'during a bup repair operation.\n\n',
+ b'Bup-Replacement-Info: ', repair_id, b'\n',
+ b'Replaced: ', kind, b' ', hexlify(item.oid), b'\n',
+ b'Missing: ', hexlify(missing_oid), b'\n']
+ return _blob_replacement(repo, item.meta, b''.join(m))
+
+def _replacement_file_item(repo, item, repair_id, missing_oid):
+ return _replacement_item(repo, item, b'file', b'file',
+ repair_id, missing_oid)
+
+def _replacement_symlink_item(repo, item, repair_id, missing_oid):
+ return _replacement_item(repo, item, b'symlink', b'symbolic link',
+ repair_id, missing_oid)
+
+def _replacement_tree_item(repo, item, repair_id, missing_oid):
+ return _replacement_item(repo, item, b'tree', b'tree',
+ repair_id, missing_oid)
+
+@dataclass(frozen=True, slots=True)
+class IncompleteDir:
+ path: Sequence[Any] # vfs path
+ missing: bytes # MissingObject oid
+
+def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
+ missing):
+ """Yield the paths underneath the given path.
+
+ When unreadable objects are encountered, raise MissingObject if
+ missing.mode is 'fail', otherwise, for missing.mode 'replace',
+ yield an IncompleteDir if the path refers to a missing git tree,
+ or split tree with missing split sub-trees.
+
+ """
+ assert isinstance(missing, MissingConfig), missing
+ assert missing.mode in ('fail', 'replace'), missing
item = path[-1][1]
assert len(path) >= 3
# drop branch/DATE
fs_path_in_save = _fs_path_from_vfs((path[0],) + path[3:])
- for entry in vfs.contents(srcrepo, item):
+
+ if missing.mode == 'fail':
+ entries = vfs.contents(srcrepo, item)
+ else:
+ try:
+ # list(contents()) will return all of a split tree's
+ # entries even if some of the split-tree items (the oids
+ # listed in the split-tree "leaves") are actually
+ # missing. So the list() only ensures that the split tree
+ # itself isn't broken; its contents may be.
+ entries = list(vfs.contents(srcrepo, item))
+ except MissingObject as ex:
+ yield IncompleteDir(path, ex.oid)
+ return
+ for entry in entries:
name, sub_item = entry
sub_path = path + (entry,)
if name in (b'.', b'..'):
continue
- sub_fs_path_in_save = pj(fs_path_in_save, name)
+ sub_fs_path_in_save = joinp(fs_path_in_save, name)
if S_ISDIR(vfs.item_mode(sub_item)):
sub_fs_path_in_save += b'/'
if should_rx_exclude_path(sub_fs_path_in_save, excludes):
continue
- if S_ISDIR(vfs.item_mode(sub_item)):
+ if not S_ISDIR(vfs.item_mode(sub_item)):
+ yield sub_path
+ else:
conv_item, oid, _ = \
_previous_conversion(dstrepo, sub_item, True, db, mapping)
if conv_item is not sub_item:
sub_path = sub_path[:-1] + ((sub_path[-1][0], conv_item),)
- if oid is None:
- yield from _vfs_walk_recursively(srcrepo, dstrepo, sub_path,
- excludes, db, mapping)
- yield sub_path
+ if oid:
+ yield sub_path
+ else:
+ yield from _vfs_walk_dir_recursively(srcrepo, dstrepo, sub_path,
+ excludes, db, mapping,
+ missing)
+ yield path

-def _rewrite_link(item, item_mode, name, srcrepo, dstrepo, stack):
+def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing):
+ assert isinstance(missing, MissingConfig), missing
+ assert missing.mode in ('fail', 'replace'), missing
+ name, item = path[-1]
assert isinstance(name, bytes)
- target = vfs.readlink(srcrepo, item)
+ have_meta = isinstance(item.meta, metadata.Metadata)
+
+ try:
+ target = vfs.readlink(srcrepo, item)
+ except MissingObject as ex:
+ if have_meta and item.symlink_target is not None:
+ missing.repair_info.note_repair()
+ pm = path_msg(_fs_path_from_vfs(path))
+ log(f'warning: symlink data replaced from metadata for {pm}\n')
+ target = item.symlink_target
+ else:
+ if missing.mode == 'fail':
+ raise ex
+ repair_info = missing.repair_info
+ replacement = _replacement_symlink_item(dstrepo, item,
+ repair_info.id, ex.oid)
+ _path_repaired(path, item.oid, replacement.oid, ex.oid, repair_info)
+ assert replacement.meta.mode == default_file_mode
+ stack.append_to_current(name, default_file_mode, default_file_mode,
+ replacement.oid, replacement.meta)
+ return
+
git_mode, oid = GIT_MODE_SYMLINK, dstrepo.write_symlink(target)
- if isinstance(item.meta, metadata.Metadata):
+ if have_meta:
if item.meta.size is None:
# must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
@@ -117,11 +243,41 @@ def _rewrite_link(item, item_mode, name, srcrepo, dstrepo, stack):
assert item.meta.size == len(item.meta.symlink_target)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

+def _remember_rewrite(from_oid, to_oid, chunked, size, wdbc, mapping):
+ assert len(from_oid) == 20, from_oid
+ assert len(to_oid) == 20, to_oid
+ wdbc.execute(f'select src, dst, chunked, size from {mapping} where src = ?',
+ (from_oid,))
+ row = wdbc.fetchone()
+ assert wdbc.fetchone() is None
+ if row:
+ assert row == (from_oid, to_oid, chunked, size)
+ else:
+ wdbc.execute(f'insert into {mapping} (src, dst, chunked, size)'
+ ' values (?, ?, ?, ?)',
+ (from_oid, to_oid, chunked, size))
+
def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
- wdbc, mapping):
+ wdbc, mapping, missing):
+ """Returns either None, or, if a directory was missing, the
+ directory path components.
+
+ """
+ assert isinstance(missing, MissingConfig), missing
+ assert missing.mode in ('fail', 'replace'), missing
+
+ if not isinstance(path, IncompleteDir):
+ incomplete = None
+ else:
+ incomplete = path
+ path = incomplete.path
+
# save_path is the vfs path to the save ref, e.g. to branch/DATE
- fs_path = _fs_path_from_vfs(path[3:]) # not including /branch/DATE
- assert not fs_path.startswith(b'/') # because resolve(parent=...)
+
+ fs_path = _fs_path_from_vfs((path[0],) + path[3:]) # not including /branch/DATE
+ assert fs_path.startswith(b'/'), fs_path
+ fs_path = fs_path[1:] # because resolve(parent=...)
+
dirn, filen = os.path.split(b'/' + fs_path)
assert dirn.startswith(b'/')
dirp = path_components(dirn)
@@ -130,27 +286,54 @@ def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
while list(stack.path()) > [x[0] for x in dirp]:
stack.pop()

- # If switching to a new sub-tree, start a new sub-tree.
- comp_parent = None
- for path_component in dirp[len(stack):]:
- comp_name, comp_path = path_component
- if comp_parent:
- dir_res = vfs.resolve(srcrepo, comp_name, parent=comp_parent)
- else:
- full_comp_path = b'/'.join([x[0] for x in save_path]) + comp_path
- dir_res = vfs.resolve(srcrepo, full_comp_path)
- meta = dir_res[-1][1].meta
- if not isinstance(meta, metadata.Metadata):
- meta = None
- stack.push(comp_name, meta)
- comp_parent = dir_res
+ def push_parents(parents):
+ # FIXME: add missing object support
+ # If switching to a new sub-tree, start a new sub-tree.
+ comp_parent = None
+ for path_component in parents:
+ comp_name, comp_path = path_component
+ if comp_parent:
+ dir_res = vfs.resolve(srcrepo, comp_name, parent=comp_parent)
+ else:
+ full_comp_path = b'/'.join([x[0] for x in save_path]) + comp_path
+ dir_res = vfs.resolve(srcrepo, full_comp_path)
+ meta = dir_res[-1][1].meta
+ if not isinstance(meta, metadata.Metadata):
+ meta = None
+ stack.push(comp_name, meta)
+ comp_parent = dir_res
+
+ if incomplete:
+ assert missing.mode == 'replace', missing
+ # everything except the dir we're replacing
+ push_parents(dirp[:-1][len(stack):])
+ repair_info = missing.repair_info
+ # For now, wholesale replacement (no attempt to handle
+ # partially readable split trees).
+ rep_item = incomplete.path[-1][1]
+ replacement = _replacement_tree_item(dstrepo, rep_item, repair_info.id,
+ incomplete.missing)
+ # Must not remember repairs because the repair-id (and so blob
+ # content) can vary across saves, i.e. get --repair-id is a
+ # contextual argument, and because the type changes from tree
+ # to blob.
+ _path_repaired(path, rep_item.oid, replacement.oid, incomplete.missing,
+ repair_info)
+ assert replacement.meta.mode == default_file_mode, repr(replacement)
+ stack.append_to_current(path[-1][0],
+ replacement.meta.mode, GIT_MODE_FILE,
+ replacement.oid, replacement.meta)
+ return
+
+ push_parents(dirp[len(stack):])

item = path[-1][1]

# First, things that can't be affected by the rewrite
item_mode = vfs.item_mode(item)
if S_ISLNK(item_mode):
- _rewrite_link(item, item_mode, filen, srcrepo, dstrepo, stack)
+ assert filen == path[-1][0]
+ _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing)
return
if not S_ISREG(item_mode) and not S_ISDIR(item_mode):
# Everything here (pipes, devices, etc.) should be fully
@@ -171,7 +354,12 @@ def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
if len(stack) == 1:
return # We're at the top level -- keep the current root dir
newtree = stack.pop(override_tree=oid)
- if oid is None:
+ # Don't remember any trees when we're making destructive
+ # repairs because walk will skip the contents for a tree that
+ # has missing objects when it encounters it a second time (for
+ # say the second of two saves during an --append), which will
+ # omit the logging, repair trailers, etc.
+ if oid is None and missing.mode != 'replace':
wdbc.execute(f'insert into {mapping} (src, dst) values (?, ?)',
(item.oid, newtree))
return
@@ -189,29 +377,40 @@ def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
nonlocal item_size
item_size += len(data)
return dstrepo.write_data(data)
- with vfs.tree_data_reader(srcrepo, item.oid) as f:
- git_mode, oid = hashsplit.split_to_blob_or_tree(
- write_data, dstrepo.write_tree,
- hashsplit.from_config([f], split_cfg))
+
+ try:
+ with vfs.tree_data_reader(srcrepo, item.oid) as f:
+ git_mode, oid = hashsplit.split_to_blob_or_tree(
+ write_data, dstrepo.write_tree,
+ hashsplit.from_config([f], split_cfg))
+ except MissingObject as ex:
+ # For now, wholesale replacement (no attempt to handle
+ # partially readable split files).
+ if missing.mode == 'fail':
+ raise ex
+ repair_info = missing.repair_info
+ replacement = _replacement_file_item(dstrepo, item, repair_info.id,
+ ex.oid)
+ _path_repaired(path, item.oid, replacement.oid, ex.oid, repair_info)
+ # Must not remember repairs because the repair-id (and so blob
+ # content) can vary across saves, i.e. get --rewrite-id is a
+ # contextual argument, and because the type may change from
+ # tree to blob.
+ assert replacement.meta.mode == default_file_mode, repr(replacement)
+ stack.append_to_current(filen, replacement.meta.mode, GIT_MODE_FILE,
+ replacement.oid, replacement.meta)
+ return
+
if isinstance(item.meta, metadata.Metadata):
if item.meta.size is None:
# must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
item.meta.size = item_size
else:
- assert item.meta.size == item_size
+ assert item.meta.size == item_size, (item.meta.size, item_size)
chunked = 1 if S_ISDIR(git_mode) else 0

- wdbc.execute(f'select src, dst, chunked, size from {mapping} where src = ?',
- (item.oid,))
- row = wdbc.fetchone()
- assert wdbc.fetchone() is None
- if row:
- assert row == (item.oid, oid, chunked, item_size)
- else:
- wdbc.execute(f'insert into {mapping} (src, dst, chunked, size)'
- ' values (?, ?, ?, ?)',
- (item.oid, oid, chunked, item_size))
+ _remember_rewrite(item.oid, oid, chunked, item_size, wdbc, mapping)
stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)

class Rewriter:
@@ -240,9 +439,12 @@ class Rewriter:
with self._context:
pass

- def append_save(self, save_path, parent, srcrepo, dstrepo, excludes):
+ def append_save(self, save_path, parent, srcrepo, dstrepo, missing,
+ excludes):
# Strict for now
assert isinstance(parent, (bytes, type(None))), parent
+ assert isinstance(missing, MissingConfig), missing
+ assert missing.mode in ('fail', 'replace'), missing
if parent:
assert len(parent) == 20, parent
assert len(save_path) == 3, (len(save_path), save_path)
@@ -263,11 +465,17 @@ class Rewriter:
# location in the archive being constructed.
stack = Stack(dstrepo, self._split_cfg)

- for path in _vfs_walk_recursively(srcrepo, dstrepo, save_path,
- excludes, dbc, self._mapping):
+ # Relies on the fact that recursion is dfs post-order,
+ # and so if a dir is broken, we'll see that "up
+ # front", and never produce any children.
+
+ for path in _vfs_walk_dir_recursively(srcrepo, dstrepo,
+ save_path, excludes,
+ dbc, self._mapping,
+ missing):
_rewrite_save_item(save_path, path, srcrepo, dstrepo,
self._split_cfg, stack, dbc,
- self._mapping)
+ self._mapping, missing)

while len(stack) > 1: # pop all parts above root folder
stack.pop()
@@ -277,12 +485,14 @@ class Rewriter:
ci = parse_commit(get_cat_data(srcrepo.cat(save_oidx), b'commit'))
author = ci.author_name + b' <' + ci.author_mail + b'>'
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
+ msg = commit_message(ci.message,
+ missing.repair_info.command,
+ missing.repair_info.repair_trailers())
return (dstrepo.write_commit(tree, parent,
author,
- ci.author_sec,
- ci.author_offset,
+ ci.author_sec, ci.author_offset,
committer, time.time(), None,
- ci.message),
+ msg),
tree)
finally:
self._db_conn.commit() # the workdb is always ready for commit
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index 41912323..7d478f5b 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -19,11 +19,19 @@ _empty_metadata = MetadataRO()
class TreeItem:
__slots__ = 'name', 'mode', 'gitmode', 'oid', 'meta'
def __init__(self, name, mode, gitmode, oid, meta):
+ assert isinstance(name, bytes), name
+ assert isinstance(mode, int), mode
+ assert isinstance(gitmode, int), gitmode
+ assert isinstance(oid, bytes), oid
+ if meta is not None:
+ assert isinstance(meta, Metadata), meta
self.name = name
self.mode = mode
self.gitmode = gitmode
self.oid = oid
self.meta = meta or _empty_metadata
+ def __repr__(self):
+ return f'<bup.tree.TreeItem object at 0x{id(self):x} name={self.name!r}>'
def mangled_name(self):
return mangle_name(self.name, self.mode, self.gitmode)

@@ -93,6 +101,7 @@ class StackDir:
self.meta = meta
self.items = []

+
class Stack:
def __init__(self, repo, split_config):
self._stack = []
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 813c5087..3507dcd0 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -919,6 +919,14 @@ def contents(repo, item, names=None, want_meta=True):
raise Exception('unexpected VFS item ' + str(item))

def _resolve_path(repo, path, parent=None, want_meta=True, follow=True):
+ # FIXME: eventually more sophistication than just MissingObject
+ # with an oid, e.g. perhaps the path leading to the missing
+ # object?
+
+ # This arrangement means two repo objects representing the same
+ # physical repo will have duplicate entries in the cache, but we
+ # can't be fooled by any incorrectly matching repository
+ # ids. Shouldn't happen, but...
cache_key = b'res:%d%d%d:%s\0%s' \
% (bool(want_meta), bool(follow), id(repo),
(b'/'.join(x[0] for x in parent) if parent else b''),
diff --git a/note/main.md b/note/main.md
index e26482b8..5218bbd4 100644
--- a/note/main.md
+++ b/note/main.md
@@ -106,12 +106,15 @@ General
e.g. its `bup.split.files` and `bup.split.trees` settings. See
`bup-get`(1) for additional information.

-* `bup get --missing <fail|ignore> ...` can now specify how to handle
- missing objects that are encountered during a transfer. `fail`, the
- default, causes bup to exit with a nonzero status, and `ignore`
- causes bup to skip over them; `ignore` is currently only supported
- by `--unnamed` and is potentially *dangerous*. `--missing ignore` is
- the preferred replacement for the existing `--ignore-missing`.
+* `bup get --missing <fail|ignore|replace> ...` can now specify how to
+ handle missing objects that are encountered during a
+ transfer. `fail`, the default, causes bup to exit with a nonzero
+ status. `ignore` causes bup to skip over them (only supported by
+ `--unnamed` and potentially *dangerous*). `replace` only works with
+ `--rewrite` and replaces paths with missing contents with
+ synthesized "repair files". See bup-get(1) for additional
+ information. `--missing ignore` is the preferred replacement for the
+ existing `--ignore-missing`.

* The default pack compression level can now be configured via either
`pack.compression` or `core.compression`. See `bup-config`(5) for
diff --git a/test/ext/test-get-rewrite-missing b/test/ext/test-get-rewrite-missing
new file mode 100755
index 00000000..9755e466
--- /dev/null
+++ b/test/ext/test-get-rewrite-missing
@@ -0,0 +1,279 @@
+#!/usr/bin/env bash
+. ./wvtest-bup.sh
+. ./test/lib/btl.sh
+
+set -o pipefail
+
+top="$(WVPASS pwd)" || exit $?
+tmpdir="$(WVPASS wvmktempdir)" || exit $?
+
+export BUP_DIR="$tmpdir/bup"
+export GIT_DIR="$tmpdir/bup"
+
+bup() { "$top/bup" "$@"; }
+
+display-file()
+{
+ local name="$1"
+ printf -- "----- \"%q\" content below -----\n" "$name"
+ cat "$name"
+ printf -- "----- \"%q\" content above -----\n" "$name"
+}
+
+# FIXME: consider checking expected compare-trees differences.
+
+
+WVPASS cd "$tmpdir"
+WVPASS bup init
+WVPASS git config bup.split.trees true
+
+# Keep in mind that all blobs have to be unique enough to avoid
+# punching unintended holes elsewhere when perforating.
+
+WVPASS mkdir -p src/a src/missing-dir
+WVPASS echo 1 > src/a/1
+WVPASS echo 2 > src/a/2
+WVPASS echo 3 > src/a/missing-file
+WVPASS echo 1 > src/missing-dir/1
+WVPASS bup random 1m > src/partial-file
+
+# Right now, make-splittable-tree's files are identical (empty).
+WVPASS "$top/dev/make-splittable-tree" src/split-tree
+WVPASS cd src
+split_tree_data_path="$(WVPASS find split-tree -name data | WVPASS sed -n 11p)"
+WVPASS "$top/dev/python" -m uuid > "$split_tree_data_path"
+WVPASS cd ..
+
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+
+WVPASS readarray -t saves < <(bup ls src)
+save_date="${saves[0]}"
+src_oid="$(git rev-parse src)"
+
+WVPASS bup -d dest-repo init
+WVPASS git --git-dir dest-repo config bup.split.trees true
+WVPASS bup -d dest-repo get -s bup --unnamed "git:$src_oid"
+WVPASS bup -d dest-repo join "$src_oid" > /dev/null
+WVPASS rm -rf dest-repo
+
+# All of the oid files must be newline terminated (for use via cat below)
+WVPASS git ls-tree src | WVPASS grep -E $'\tmissing-dir$' | btl-ent-oid > dir-oid
+WVPASS git ls-tree src:a | WVPASS sed -n 4p | btl-ent-oid > file-oid
+WVPASS git ls-tree src | WVPASS grep -E $'\tpartial-file\.bup$' \
+ | btl-ent-oid > partial-file-oid
+# For now, assume it's at least two levels deep, and 7th is "fine"
+WVPASS git ls-tree -r src:partial-file.bup | WVPASS sed -n 7p \
+ | btl-ent-oid > partial-file-hole
+
+WVPASS git ls-tree src:split-tree | WVPASS grep -E $'\t\.bupd.[0-9]+\.bupd$' \
+ | btl-ent-oid > bupd-oid
+WVPASSEQ 41 $(wc -c < bupd-oid)
+
+WVPASS git ls-tree src | WVPASS grep -E $'\tsplit-tree$' \
+ | btl-ent-oid > split-tree-oid
+WVPASS git ls-tree src:split-tree | WVPASS grep -E $'\t\.bupm$' \
+ | btl-ent-oid > split-tree-bupm-oid
+
+WVPASS git ls-tree -r src:split-tree \
+ | WVPASS grep -F "${split_tree_data_path#split-tree}" > split-tree-blob-info
+btl-ent-oid < split-tree-blob-info > split-tree-blob-oid
+WVPASS cut -f 2 < split-tree-blob-info > split-tree-blob-path
+WVPASSEQ 2 $(tr -dc / < split-tree-blob-path | wc -c) # verify expected depth
+
+
+split_tree_blob_path="$(<split-tree-blob-path)"
+split_tree_l1_name="${split_tree_blob_path%%/*}"
+
+split_tree_l2_name="${split_tree_blob_path#*/}"
+split_tree_l2_name="${split_tree_l2_name%%/*}"
+
+#split_tree_l3_name="${split_tree_blob_path%/*}"
+#split_tree_l3_name="${split_tree_l3_name##*/}"
+
+WVPASS git ls-tree src:split-tree \
+ | WVPASS grep -E $'\t'"$split_tree_l1_name"'$' | btl-ent-oid > split-tree-l1-oid
+WVPASS git ls-tree "$(<split-tree-l1-oid)" \
+ | WVPASS grep -E $'\t'"$split_tree_l2_name"'$' | btl-ent-oid > split-tree-l2-oid
+# WVPASS git ls-tree "$(<split-tree-l2-oid)" \
+# | WVPASS grep -E $'\t'"$split_tree_l3_name"'$' | btl-ent-oid > split-tree-l3-oid
+
+echo "dir: $(< dir-oid)"
+echo "file: $(< file-oid)"
+echo "partial-file: $(< partial-file-oid)"
+echo "partial-file-hole: $(< partial-file-hole)"
+echo "split-tree: $(< split-tree-oid)"
+echo "split-tree-blob: $(< split-tree-blob-oid)"
+echo "split-tree-bupm-oid: $(< split-tree-bupm-oid)"
+# e.g. some-random-path-...-8371/some-random-path-...-8403/data
+echo "split-tree-blob-path: $(< split-tree-blob-path)"
+echo "split-tree-l1-name: $split_tree_l1_name"
+echo "split-tree-l1-oid: $(< split-tree-l1-oid)"
+echo "split-tree-l2-name: $split_tree_l2_name"
+echo "split-tree-l2-oid: $(< split-tree-l2-oid)"
+echo "bupd-oid: $(< bupd-oid)"
+
+WVPASS cp -pPR "$BUP_DIR" bup-complete
+
+WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < dir-oid
+WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < file-oid
+WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < partial-file-hole
+
+WVPASS rm -rf dest-repo bup-tmp
+WVPASS cp -pPR bup bup-tmp
+WVPASS bup -d dest-repo init
+WVPASS git --git-dir dest-repo config bup.split.trees true
+# test rejection of --missing ignore (only supported by --unnamed)
+WVFAIL bup -d dest-repo get -s bup --missing ignore --rewrite --append src
+# test --missing fail
+bup -d dest-repo get -s bup --missing fail --rewrite --append src
+rc=$?
+WVPASSEQ 2 "$rc"
+WVPASS rm -rf bup
+WVPASS cp -pPR bup-tmp bup
+
+repair-to-dest()
+{
+ rm -rf dest-repo
+ WVPASS bup -d dest-repo init
+ WVPASS git --git-dir dest-repo config bup.split.trees true
+ bup -d dest-repo get -s bup --missing replace --rewrite --append src 2> repair.log
+ rc=$?
+ display-file repair.log
+ WVPASSEQ 3 "$rc"
+}
+
+set-repair-id()
+{
+ WVPASSEQ 1 "$(grep -cE '^repairs needed, repair-id: ' repair.log)"
+ repair_id="$(WVPASS grep -E '^repairs needed, repair-id: ' repair.log)"
+ repair_id="${repair_id#repairs needed, repair-id: }"
+}
+
+
+WVSTART 'repair-id is reported'
+repair-to-dest
+set-repair-id
+
+oid_rx='[0-9a-fA-F]{40}'
+missing_file="/src/$save_date/a/missing-file"
+missing_dir="/src/$save_date/missing-dir/"
+missing_partial="/src/$save_date/partial-file"
+missing_split="/src/$save_date/split-tree/"
+
+WVPASS git --git-dir dest-repo ls-tree src \
+ | WVPASS grep -E $'\tmissing-dir$' | btl-ent-oid > dir-replacement-oid
+WVPASS git --git-dir dest-repo ls-tree src:a | WVPASS sed -n 4p \
+ | btl-ent-oid > blob-replacement-oid
+WVPASS git --git-dir dest-repo ls-tree src \
+ | WVPASS grep -E $'\tpartial-file$' \
+ | btl-ent-oid > partial-file-replacement-oid
+
+
+WVSTART 'commit trailers include repairs'
+git --git-dir dest-repo show -s --pretty=email src > repair-msg
+display-file repair-msg
+git interpret-trailers --parse < repair-msg > repair-trailers
+display-file repair-trailers
+bup_ver="$(bup version)"
+readarray -t trailers < repair-trailers
+wv-match-rx "${trailers[0]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[1]}" '^Bup-Argv: [^ ]+/bup.* save '
+wv-match-rx "${trailers[2]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[3]}" '^Bup-Argv: [^ ]+/bup.* get .* --rewrite '
+wv-match-rx "${trailers[4]}" "^Bup-Repair-ID: $repair_id$"
+wv-match-rx "${trailers[5]}" "^Bup-Replaced: $(< blob-replacement-oid) $missing_file$"
+wv-match-rx "${trailers[6]}" "^Bup-Replaced: $(< dir-replacement-oid) $missing_dir$"
+wv-match-rx "${trailers[7]}" \
+ "^Bup-Replaced: $(< partial-file-replacement-oid) $missing_partial$"
+WVPASSEQ "" "${trailers[8]}" # end-of-line
+unset trailers
+
+# A missing dir .bupm in a non-split repo is indistinguishable from a
+# git created tree.
+
+WVSTART 'missing blobs are rewritten'
+WVPASS grep -E "^repaired $missing_file $oid_rx -> $oid_rx\$" repair.log
+WVPASS git --git-dir dest-repo show src:a/missing-file > blob-replacement
+display-file blob-replacement
+WVPASS grep -E '^This is a replacement for a file' blob-replacement
+WVPASS grep -E "^Bup-Replacement-Info: $repair_id" blob-replacement
+WVPASS grep -E "^Replaced: file $(< file-oid)" blob-replacement
+WVPASS grep -E "^Missing: $(< file-oid)" blob-replacement
+
+
+WVSTART 'missing trees are rewritten'
+WVPASS grep -E "^repaired $missing_dir $oid_rx -> $oid_rx\$" repair.log
+WVPASS git --git-dir dest-repo show src:missing-dir > tree-replacement
+display-file tree-replacement
+WVPASS grep -E '^This is a replacement for a tree' tree-replacement
+WVPASS grep -E "^Bup-Replacement-Info: $repair_id" tree-replacement
+WVPASS grep -E "^Replaced: tree $(< dir-oid)" tree-replacement
+WVPASS grep -E "^Missing: $(< dir-oid)" tree-replacement
+
+
+WVSTART 'incomplete chunked files are rewritten'
+WVPASS grep -E "^repaired $missing_partial $oid_rx -> $oid_rx\$" repair.log
+WVPASS git --git-dir dest-repo show src:partial-file > partial-file-replacement
+display-file partial-file-replacement
+WVPASS grep -E '^This is a replacement for a file' partial-file-replacement
+WVPASS grep -E "^Bup-Replacement-Info: $repair_id" partial-file-replacement
+WVPASS grep -E "^Replaced: file $(< partial-file-oid)" partial-file-replacement
+WVPASS grep -E "^Missing: $(< partial-file-hole)" partial-file-replacement
+
+# FIXME: still need tests for missing split-tree leaf .bupms, missing
+# split-tree mid-level or leaf tree (which would require three
+# levels). No missing .bupd test since that'd just make the split-tree
+# a normal tree.
+
+# FIXME: test trailers for incomplete split trees as above
+
+WVSTART 'incomplete split tree (missing top-level sub-tree)'
+WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < split-tree-l1-oid
+repair-to-dest
+set-repair-id
+WVPASS grep -E "^repaired $missing_split $oid_rx -> $oid_rx\$" repair.log
+WVPASS git --git-dir dest-repo show src:split-tree > split-tree-replacement
+display-file split-tree-replacement
+WVPASS grep -E '^This is a replacement for a tree' split-tree-replacement
+WVPASS grep -E "^Bup-Replacement-Info: $repair_id" split-tree-replacement
+WVPASS grep -E "^Replaced: tree $(< split-tree-oid)" split-tree-replacement
+WVPASS grep -E "^Missing: $(< split-tree-l1-oid)" split-tree-replacement
+
+
+WVSTART 'incomplete split tree (missing leaf item)'
+WVPASS rm -rf bup
+WVPASS cp -pPR bup-complete bup
+WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < split-tree-l2-oid
+repair-to-dest
+set-repair-id
+WVPASS grep -E "^repaired /src/$save_date/split-tree/$split_tree_l2_name/ $(< split-tree-l2-oid) ->" \
+ repair.log
+WVPASS git --git-dir dest-repo \
+ show "src:split-tree/$split_tree_l1_name/$split_tree_l2_name" \
+ > split-tree-replacement
+display-file split-tree-replacement
+WVPASS grep -E '^This is a replacement for a tree' split-tree-replacement
+WVPASS grep -E "^Bup-Replacement-Info: $repair_id" split-tree-replacement
+WVPASS grep -E "^Replaced: tree $(< split-tree-l2-oid)" split-tree-replacement
+WVPASS grep -E "^Missing: $(< split-tree-l2-oid)" split-tree-replacement
+
+
+WVSTART 'incomplete split tree (missing top-level .bupm)'
+WVPASS rm -rf bup
+WVPASS cp -pPR bup-complete bup
+WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < split-tree-bupm-oid
+repair-to-dest
+set-repair-id
+WVPASS grep -E "^repaired /src/$save_date/split-tree/ $(< split-tree-oid) ->" \
+ repair.log
+WVPASS git --git-dir dest-repo show "src:split-tree" > split-tree-replacement
+display-file split-tree-replacement
+WVPASS grep -E '^This is a replacement for a tree' split-tree-replacement
+WVPASS grep -E "^Bup-Replacement-Info: $repair_id" split-tree-replacement
+WVPASS grep -E "^Replaced: tree $(< split-tree-oid)" split-tree-replacement
+WVPASS grep -E "^Missing: $(< split-tree-bupm-oid)" split-tree-replacement
+
+
+WVPASS cd "$top"
+WVPASS rm -rf "$tmpdir"
diff --git a/test/ext/test-rewrite b/test/ext/test-rewrite
index 47513153..82857942 100755
--- a/test/ext/test-rewrite
+++ b/test/ext/test-rewrite
@@ -48,7 +48,7 @@ compare() {

WVSTART split and rewrite
WVPASS bup split -n split < "$top/test/testfile1"
-WVPASS bup -d "$BUP_DIR2" get --rewrite-db "$tmpdir/db" -s "$BUP_DIR" --append: split test
+WVPASS bup -d "$BUP_DIR2" get -s "$BUP_DIR" --append: split test
WVPASS compare "$BUP_DIR" split "$BUP_DIR2" test

WVSTART make multiple saves
@@ -60,7 +60,7 @@ WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"

WVSTART rewrite to different split
WVPASS bup -d "$BUP_DIR" ls -l save
-WVPASS bup -d "$BUP_DIR4" get --rewrite --rewrite-db "$tmpdir/db" -s "$BUP_DIR" --append save
+WVPASS bup -d "$BUP_DIR4" get --rewrite -s "$BUP_DIR" --append save
WVPASS compare "$BUP_DIR" save "$BUP_DIR4" save

WVSTART "rewrite unchanged (to remote)"
@@ -103,12 +103,12 @@ WVPASS bup -d "$BUP_DIR" ls -l save/latest/test/sampledata/y/testfile1 |
WVPASS grep -- 158664

# now rewrite again - and then the size should be correct even without augmentation
-WVPASS bup -d "$BUP_DIR4" get --rewrite --rewrite-db "$tmpdir/db" -s "$BUP_DIR" --append: save save2
+WVPASS bup -d "$BUP_DIR4" get --rewrite -s "$BUP_DIR" --append: save save2
WVPASS bup+ -d "$BUP_DIR4" ls -l save/latest/test/sampledata/y/testfile1 |
WVPASS grep -- 158664

# and again for the other kind of splitting
-WVPASS bup -d "$BUP_DIR3" get --rewrite --rewrite-db "$tmpdir/db" -s "$BUP_DIR" --append: save save2
+WVPASS bup -d "$BUP_DIR3" get --rewrite -s "$BUP_DIR" --append: save save2
WVPASS bup+ -d "$BUP_DIR3" ls -l save2/latest/test/sampledata/y/testfile1 |
WVPASS grep -- 158664

@@ -146,4 +146,5 @@ GIT_DIR="$BUP_DIR" WVPASS git ls-tree -r save-new^ > "$tmpdir/n"
diff -u "$tmpdir/o" "$tmpdir/n"


+WVPASS cd "$top"
WVPASS rm -rf "$tmpdir"
diff --git a/test/ext/test-rm b/test/ext/test-rm
index 3c20764a..e84589bd 100755
--- a/test/ext/test-rm
+++ b/test/ext/test-rm
@@ -13,28 +13,6 @@ export GIT_DIR="$tmpdir/bup"
bup() { "$top/bup" "$@"; }
compare-trees() { "$top/dev/compare-trees" "$@"; }

-wv_matches_rx()
-{
- local caller_file=${BASH_SOURCE[0]}
- local caller_line=${BASH_LINENO[0]}
- local src="$caller_file:$caller_line"
- if test $# -ne 2; then
- echo "! $src wv_matches_rx requires 2 arguments FAILED" 1>&2
- return
- fi
- local str="$1"
- local rx="$2"
- echo "Matching:" 1>&2 || exit $?
- echo "$str" | sed 's/^\(.*\)/ \1/' 1>&2 || exit $?
- echo "Against:" 1>&2 || exit $?
- echo "$rx" | sed 's/^\(.*\)/ \1/' 1>&2 || exit $?
- if [[ "$str" =~ ^${rx}$ ]]; then
- echo "! $src regex matches ok" 1>&2 || exit $?
- else
- echo "! $src regex doesn't match FAILED" 1>&2 || exit $?
- fi
-}
-

WVPASS bup init
WVPASS cd "$tmpdir"
@@ -50,13 +28,13 @@ WVPASS "$top"/dev/sync-tree bup/ bup-baseline/
WVPASS bup tick # Make sure we always get the timestamp changes below
WVPASS bup rm --unsafe /src
observed="$(compare-trees bup/ bup-baseline/ | LC_ALL=C sort)" || exit $?
-wv_matches_rx "$observed" \
-'\*deleting[ ]+logs/refs/heads/src
+wv-match-rx "$observed" \
+'^\*deleting[ ]+logs/refs/heads/src
\*deleting[ ]+refs/heads/src(
\.d\.\.t\.\.\.[.]*[ ]+\./)?
\.d\.\.t\.\.\.[.]*[ ]+logs/refs/heads/
\.d\.\.t\.\.\.[.]*[ ]+refs/heads/(
->f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?'
+>f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?$'


WVSTART "rm /foo (one of many)"
@@ -72,13 +50,13 @@ WVPASS "$top"/dev/sync-tree bup/ bup-baseline/
WVPASS bup tick # Make sure we always get the timestamp changes below
WVPASS bup rm --unsafe /src
observed="$(compare-trees bup/ bup-baseline/ | LC_ALL=C sort)" || exit $?
-wv_matches_rx "$observed" \
-"\*deleting[ ]+logs/refs/heads/src
+wv-match-rx "$observed" \
+"^\*deleting[ ]+logs/refs/heads/src
\*deleting[ ]+refs/heads/src(
\.d\.\.t\.\.\.[.]*[ ]+\./)?
\.d\.\.t\.\.\.[.]*[ ]+logs/refs/heads/
\.d\.\.t\.\.\.[.]*[ ]+refs/heads/(
->f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?"
+>f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?$"


WVSTART "rm /foo /bar (multiple of many)"
@@ -94,15 +72,15 @@ WVPASS "$top"/dev/sync-tree bup/ bup-baseline/
WVPASS bup tick # Make sure we always get the timestamp changes below
WVPASS bup rm --unsafe /src-2 /src-4
observed="$(compare-trees bup/ bup-baseline/ | LC_ALL=C sort)" || exit $?
-wv_matches_rx "$observed" \
-"\*deleting[ ]+logs/refs/heads/src-2
+wv-match-rx "$observed" \
+"^\*deleting[ ]+logs/refs/heads/src-2
\*deleting[ ]+logs/refs/heads/src-4
\*deleting[ ]+refs/heads/src-2
\*deleting[ ]+refs/heads/src-4(
\.d\.\.t\.\.\.[.]*[ ]+\./)?
\.d\.\.t\.\.\.[.]*[ ]+logs/refs/heads/
\.d\.\.t\.\.\.[.]*[ ]+refs/heads/(
->f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?"
+>f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?$"


WVSTART "rm /foo /bar (all)"
@@ -112,8 +90,8 @@ WVPASS "$top"/dev/sync-tree bup/ bup-baseline/
WVPASS bup tick # Make sure we always get the timestamp changes below
WVPASS bup rm --unsafe /src /src-2 /src-3 /src-4 /src-5
observed="$(compare-trees bup/ bup-baseline/ | LC_ALL=C sort)" || exit $?
-wv_matches_rx "$observed" \
-"\*deleting[ ]+logs/refs/heads/src
+wv-match-rx "$observed" \
+"^\*deleting[ ]+logs/refs/heads/src
\*deleting[ ]+logs/refs/heads/src-2
\*deleting[ ]+logs/refs/heads/src-3
\*deleting[ ]+logs/refs/heads/src-4
@@ -126,7 +104,7 @@ wv_matches_rx "$observed" \
\.d\.\.t\.\.\.[.]*[ ]+\./)?
\.d\.\.t\.\.\.[.]*[ ]+logs/refs/heads/
\.d\.\.t\.\.\.[.]*[ ]+refs/heads/(
->f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?"
+>f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?$"


WVSTART "rm /foo/bar (lone save - equivalent to rm /foo)"
@@ -143,13 +121,13 @@ WVPASS bup tick # Make sure we always get the timestamp changes below
WVFAIL bup rm --unsafe /src/latest
WVPASS bup rm --unsafe /src/"$save1"
observed="$(compare-trees bup/ bup-baseline/ | LC_ALL=C sort)" || exit $?
-wv_matches_rx "$observed" \
-"\*deleting[ ]+logs/refs/heads/src
+wv-match-rx "$observed" \
+"^\*deleting[ ]+logs/refs/heads/src
\*deleting[ ]+refs/heads/src(
\.d\.\.t\.\.\.[.]*[ ]+\./)?
\.d\.\.t\.\.\.[.]*[ ]+logs/refs/heads/
\.d\.\.t\.\.\.[.]*[ ]+refs/heads/(
->f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?"
+>f\+\+\+\+\+\+\+\+\+[ ]+packed-refs)?$"


verify-changes-caused-by-rewriting-save()
@@ -164,15 +142,15 @@ verify-changes-caused-by-rewriting-save()
new_paths="$(WVPASS comm -13 "$tmpdir/before" "$tmpdir/after")" || exit $?
new_idx="$(echo "$new_paths" | WVPASS grep -E '^\./objects/pack/pack-.*\.idx$' | cut -b 3-)" || exit $?
new_pack="$(echo "$new_paths" | WVPASS grep -E '^\./objects/pack/pack-.*\.pack$' | cut -b 3-)" || exit $?
- wv_matches_rx "$(compare-trees "$after/" "$before/")" \
-">fcst\.\.\.[.]*[ ]+logs/refs/heads/src
+ wv-match-rx "$(compare-trees "$after/" "$before/")" \
+"^>fcst\.\.\.[.]*[ ]+logs/refs/heads/src
\.d\.\.t\.\.\.[.]*[ ]+objects/
\.d\.\.t\.\.\.[.]*[ ]+objects/pack/
>fcst\.\.\.[.]*[ ]+objects/pack/bup\.bloom
>f\+\+\+\+\+\+\+[+]*[ ]+$new_idx
>f\+\+\+\+\+\+\+[+]*[ ]+$new_pack
\.d\.\.t\.\.\.[.]*[ ]+refs/heads/
->fc\.t\.\.\.[.]*[ ]+refs/heads/src"
+>fc\.t\.\.\.[.]*[ ]+refs/heads/src$"
WVPASS rm -rf "$tmpdir"
}

@@ -243,10 +221,10 @@ WVPASS "$top"/dev/sync-tree bup-baseline/ bup/
victim="$(WVPASS bup ls src | tail -n 2 | head -n 1)" || exit $?
WVPASS bup rm --unsafe -vv /src/"$victim"
observed="$(compare-trees bup/ bup-baseline/ | LC_ALL=C sort)" || exit $?
-wv_matches_rx "$observed" \
-"\.d\.\.t\.\.\.[.]*[ ]+refs/heads/
+wv-match-rx "$observed" \
+"^\.d\.\.t\.\.\.[.]*[ ]+refs/heads/
>fc\.t\.\.\.[.]*[ ]+refs/heads/src
->fcst\.\.\.[.]*[ ]+logs/refs/heads/src"
+>fcst\.\.\.[.]*[ ]+logs/refs/heads/src$"
observed=$(git rev-list src | wc -l) || exit $?
WVPASSEQ 2 $observed
WVPASSEQ "$(commit-hash-n 1 bup src)" "$(commit-hash-n 1 bup-baseline src)"
diff --git a/test/ext/test_split_trees.py b/test/ext/test_split_trees.py
index 55a4a2ca..56508204 100755
--- a/test/ext/test_split_trees.py
+++ b/test/ext/test_split_trees.py
@@ -1,5 +1,5 @@

-from os import chdir, environb as environ, mkdir
+from os import chdir, environb as environ
from os.path import join as joinp
import pytest

@@ -9,24 +9,17 @@ import bup.path

bup = bup.path.exe()

-@pytest.fixture
-def large_tree(tmpdir):
- # pytest fixtures are cached, so there will be only one large tree
- for i in range(10000):
- d = b'%s/some-random-path-name-to-make-the-tree-bigger-%d' % (tmpdir, i)
- mkdir(d)
- with open(joinp(d, b'data'), 'w') as f:
- print('data', file=f)
- return tmpdir
-
-def test_large_tree(tmpdir, large_tree):
+def test_large_tree(tmpdir):
environ[b'GIT_DIR'] = tmpdir + b'/repo'
environ[b'BUP_DIR'] = tmpdir + b'/repo'
+
+ ex((b'dev/make-splittable-tree', joinp(tmpdir, b'src')))
+
chdir(tmpdir)
ex((bup, b'init'))
ex((b'git', b'config', b'bup.split.trees', b'true'))
- ex((bup, b'index', large_tree))
- ex((bup, b'save', b'-n', b'gc-test', b'--strip', large_tree))
+ ex((bup, b'index', b'src'))
+ ex((bup, b'save', b'-n', b'gc-test', b'--strip', b'src'))

bupd = None
for p in exo((b'git', b'ls-tree', b'gc-test')).out.splitlines():
diff --git a/test/int/test_commit.py b/test/int/test_commit.py
index 42081c9e..25a3ceb0 100644
--- a/test/int/test_commit.py
+++ b/test/int/test_commit.py
@@ -6,7 +6,7 @@ import sys
from wvpytest import *

from bup import git
-from bup.commit import _git_date_str, parse_commit
+from bup.commit import _git_date_str, has_trailers, parse_commit
from bup.helpers import readpipe


@@ -128,3 +128,12 @@ def test_git_date_str():
WVPASSEQ(b'0 +0000', _git_date_str(0, 0))
WVPASSEQ(b'0 -0130', _git_date_str(0, -90 * 60))
WVPASSEQ(b'0 +0130', _git_date_str(0, 90 * 60))
+
+
+def test_has_trailers():
+ assert not has_trailers(b'')
+ assert not has_trailers(b'Summary\n')
+ assert not has_trailers(b'Summary\n\nBody')
+ assert not has_trailers(b'Summary\n\nBody\nNot Valid: trailer\n')
+ assert has_trailers(b'Summary\n\nBody\nValid: trailer\n')
+ assert has_trailers(b'Summary\n\nBody\n\nValid: trailer\n')
diff --git a/wvtest-bash.sh b/wvtest-bash.sh
index 9d5803b7..1823f26e 100644
--- a/wvtest-bash.sh
+++ b/wvtest-bash.sh
@@ -60,3 +60,31 @@ WVPIPE()
return 2
fi
}
+
+wv-match-rx()
+{
+ # Atypically the "expected" rx value is second because it's
+ # expected to often be the longer, quoted literal. e.g.
+ # wv-match-rx "$something" \
+ # "many
+ # regex
+ # lines"
+ local caller_file=${BASH_SOURCE[0]}
+ local caller_line=${BASH_LINENO[0]}
+ local src="$caller_file:$caller_line"
+ if test $# -ne 2; then
+ echo "! $src wv_matches_rx requires 2 arguments FAILED" 1>&2
+ return
+ fi
+ local str="$1"
+ local rx="$2"
+ echo "Matching:" 1>&2 || exit $?
+ echo "$str" | sed 's/^\(.*\)/ \1/' 1>&2 || exit $?
+ echo "Against:" 1>&2 || exit $?
+ echo "$rx" | sed 's/^\(.*\)/ \1/' 1>&2 || exit $?
+ if [[ "$str" =~ $rx ]]; then
+ echo "! $src regex matches ok" 1>&2 || exit $?
+ else
+ echo "! $src regex doesn't match FAILED" 1>&2 || exit $?
+ fi
+}
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/rewrite.py | 2 +-
lib/bup/tree.py | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 233cd739..3fbb6773 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -301,7 +301,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
# If switching to a new sub-tree, finish the current sub-tree, and
# then we'll establish the sub-tree for the new sub-tree via
# extend_stack for the missing components.
- while list(stack.path()) > [x[0] for x in dir_path]:
+ while stack.path() > [x[0] for x in dir_path]:
stack.pop()

def extend_stack(parents):
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index 7424d65d..aedee080 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -112,6 +112,7 @@ class Stack:
return len(self._stack)

def path(self):
+ # Must return a list - callers may compare it via <, >, etc.
return [p.name for p in self._stack]

def push(self, name, meta):
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Now that the previous commit "rewrite: only include regular (S_ISREG)
files and dirs in the db" has eliminated the possibility of two
mapping table entries with the same oid, simplify the db to just rely
on the oid.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 117 ++++++++++++++++++++---------------------
1 file changed, 57 insertions(+), 60 deletions(-)

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
index b3fb3666..9dce11c5 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/cmd/rewrite.py
@@ -8,7 +8,7 @@ import sqlite3

from bup import hashsplit, git, options, repo, metadata, vfs
from bup.compat import argv_bytes
-from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK
+from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK, GIT_MODE_TREE
from bup.helpers import \
(handle_ctrl_c, path_components,
valid_save_name, log,
@@ -33,23 +33,25 @@ exclude-rx-from= skip --exclude-rx patterns in file (may be repeated)
"""

def prep_mapping_table(db, split_cfg):
+ # This currently only needs to track items that may be split,
+ # depending on the current repo settings (e.g. files and
+ # directories); it records the result so we can re-use it if we
+ # encounter the item again.
settings = [str(x) for x in chain.from_iterable(sorted(split_cfg.items()))]
for x in settings: assert '_' not in x
table_id = f'bup_rewrite_mapping_to_bits_{"_".join(settings)}'
table_id = qsql_id(table_id)
db.execute(f'create table if not exists {table_id}'
- ' (src blob,'
+ ' (src blob primary key,'
' dst blob not null,'
- ' vfs_mode integer,'
- ' git_mode integer,'
- ' size integer,'
- ' primary key (src, vfs_mode))'
+ ' chunked integer,' # is this a chunked file
+ ' size integer)' # only for files
' without rowid')
return table_id

def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
- """Return (replacement_item, converted_oid, mode) for the given
- item if any, *and* if the dstrepo has the item.oid. If not,
+ """Return (replacement_item, converted_oid, git_mode) for the
+ given item if any, *and* if the dstrepo has the item.oid. If not,
converted_oid and mode will be None. The replacement_item will
either be item, or an augmented copy of item, (e.g. with a proper
size) that should be used instead of item.
@@ -62,15 +64,15 @@ def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
size = None
item_mode = item.meta

- db.execute(f'select dst, vfs_mode, git_mode, size from {mapping}'
- ' where src = ? and vfs_mode = ?',
- (item.oid, item_mode))
+ db.execute(f'select dst, chunked, size from {mapping} where src = ?',
+ (item.oid,))
data = db.fetchone()
if not data:
- return item, None, None, None
+ return item, None, None
assert db.fetchone() is None
- dst, vfs_mode, git_mode, size = data
- assert vfs_mode == item_mode
+ dst, chunked, size = data
+ if chunked:
+ assert S_ISREG(item_mode)
# augment the size if appropriate
if size is not None and isinstance(item.meta, metadata.Metadata):
if item.meta.size is not None:
@@ -80,9 +82,11 @@ def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
item.meta.size = size
# if we have it in the DB and in the destination repo, return it
if dstrepo.exists(dst):
- return item, dst, vfs_mode, git_mode
+ if chunked is None: # dir, not file
+ return item, dst, None
+ return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE
# this only happens if you reuse a database
- return item, None, None, None
+ return item, None, None

def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
fullname=b''):
@@ -94,7 +98,7 @@ def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
if should_rx_exclude_path(check_name, excludes):
continue
if S_ISDIR(vfs.item_mode(item)):
- item, oid, _, _ = previous_conversion(dstrepo, item, True, db, mapping)
+ item, oid, _ = previous_conversion(dstrepo, item, True, db, mapping)
if oid is None:
yield from vfs_walk_recursively(srcrepo, dstrepo, item,
excludes, db, mapping,
@@ -151,65 +155,58 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)
return

- item, oid, vfs_mode, git_mode = \
+ item, oid, git_mode = \
previous_conversion(dstrepo, item, not filen, wdbc, mapping)

if not filen:
+ # Since there's no filename, this is a subdir -- finish it.
+ assert S_ISDIR(item_mode)
+ assert git_mode is None, item.oid.hex() # for both exists and not
if len(stack) == 1:
return # We're at the top level -- keep the current root dir
- # Since there's no filename, this is a subdir -- finish it.
newtree = stack.pop(override_tree=oid)
if oid is None:
- assert vfs_mode is None, item.oid.hex()
- assert git_mode is None, item.oid.hex()
- vfs_mode = vfs.item_mode(item)
- wdbc.execute(f'insert into {mapping}'
- ' (src, dst, vfs_mode) values (?, ?, ?)',
- (item.oid, newtree, vfs_mode))
+ wdbc.execute(f'insert into {mapping} (src, dst) values (?, ?)',
+ (item.oid, newtree))
return

- # already converted - oid and mode are known
+ assert S_ISREG(item_mode)
if oid is not None:
- assert vfs_mode is not None, oid.hex()
- assert git_mode is not None, oid.hex()
- stack.append_to_current(filen, vfs_mode, git_mode, oid, item.meta)
+ # already converted - oid and mode are known
+ assert git_mode in (GIT_MODE_TREE, GIT_MODE_FILE)
+ stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)
return

- vfs_mode = vfs.item_mode(item)
item_size = None
- if S_ISREG(vfs_mode):
- item_size = 0
- def write_data(data):
- nonlocal item_size
- item_size += len(data)
- return dstrepo.write_data(data)
- with vfs.tree_data_reader(srcrepo, item.oid) as f:
- git_mode, oid = hashsplit.split_to_blob_or_tree(
- write_data, dstrepo.write_tree,
- hashsplit.from_config([f], split_cfg))
- if isinstance(item.meta, metadata.Metadata):
- if item.meta.size is None:
- # must not modify vfs results (see vfs docs)
- item = vfs.copy_item(item)
- item.meta.size = item_size
- else:
- assert item.meta.size == item_size
- elif S_ISDIR(vfs_mode):
- assert False # handled above
-
- wdbc.execute(f'select src, dst, vfs_mode, size from {mapping}'
- ' where src = ? and vfs_mode = ?',
- (item.oid, vfs_mode))
+ item_size = 0
+ def write_data(data):
+ nonlocal item_size
+ item_size += len(data)
+ return dstrepo.write_data(data)
+ with vfs.tree_data_reader(srcrepo, item.oid) as f:
+ git_mode, oid = hashsplit.split_to_blob_or_tree(
+ write_data, dstrepo.write_tree,
+ hashsplit.from_config([f], split_cfg))
+ if isinstance(item.meta, metadata.Metadata):
+ if item.meta.size is None:
+ # must not modify vfs results (see vfs docs)
+ item = vfs.copy_item(item)
+ item.meta.size = item_size
+ else:
+ assert item.meta.size == item_size
+ chunked = 1 if S_ISDIR(git_mode) else 0
+
+ wdbc.execute(f'select src, dst, chunked, size from {mapping} where src = ?',
+ (item.oid,))
row = wdbc.fetchone()
assert wdbc.fetchone() is None
- if row: # reusing previously populated db
- assert row == (item.oid, oid, vfs_mode, git_mode, item_size)
+ if row:
+ assert row == (item.oid, oid, chunked, item_size)
else:
- wdbc.execute(f'insert into {mapping}'
- ' (src, dst, vfs_mode, git_mode, size)'
- ' values (?, ?, ?, ?, ?)',
- (item.oid, oid, vfs_mode, git_mode, item_size))
- stack.append_to_current(filen, vfs_mode, git_mode, oid, item.meta)
+ wdbc.execute(f'insert into {mapping} (src, dst, chunked, size)'
+ ' values (?, ?, ?, ?)',
+ (item.oid, oid, chunked, item_size))
+ stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Use "might" rather than "may" to decrease ambiguity (e.g. with the
RFC-style "may"). Thanks to Stefan Monnier for pointing out the concern.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
DESIGN.md | 37 +++++++++++++++++++------------------
1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/DESIGN.md b/DESIGN.md
index f4cebe18..2a8328c8 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -531,30 +531,31 @@ hardlinks may not be restored.
Repository Taxonomy
-------------------

-The format of the data that may appear in a repository has varied over
-time, both as a result of intentional changes and earlier bugs.
+The format of the data that might appear in a repository has varied
+over time, both as a result of intentional changes and earlier bugs.

- - A tree object may not have bup created metadata (i.e. may not have
- a `.bupm` file). Perhaps because it was created by git or a version
- of bup before metadata support was added. Eventually that might
- also result from repairs, though for the moment, it's not
+ - A tree object might not have bup created metadata (i.e. might not
+ have a `.bupm` file). Perhaps because it was created by git or a
+ version of bup before metadata support was added. Eventually that
+ might also result from repairs, though for the moment, it's not
possible. The abridgement repair (see below) comes close, but ends
up leaving a `.bupm` with empty entries for everything except ".".

- - A `.bupm` file may be abridged, i.e. have missing entries due to a
- bug introduced in 0.25 by 16f9f9829038f25aec80ebfae3c882a66281e145
- ("save-cmd.py: don't crash when a path disappears between index and
- save") and fixed for 0.30.1 by
- 47891d8951a95b8e0d9ca94387107cdf12ca3d3c ("save: add empty metadata
- if reading fails"). Related: `bup-validate-refs --bupm` and `bup
- get --repair`.
+ - A `.bupm` file might be abridged, i.e. have missing entries due to
+ a bug introduced in 0.25 by
+ 16f9f9829038f25aec80ebfae3c882a66281e145 ("save-cmd.py: don't crash
+ when a path disappears between index and save") and fixed for
+ 0.30.1 by 47891d8951a95b8e0d9ca94387107cdf12ca3d3c ("save: add
+ empty metadata if reading fails"). Related: `bup-validate-refs
+ --bupm` and `bup get --repair`.

- - A `.bupm` may have "empty" entries, i.e. a path's entry in a
+ - A `.bupm` might have "empty" entries, i.e. a path's entry in a
`.bupm` might be the encoding of a `Metadata()` object with no
- attributes. This may be because it's a "." entry for a "synthetic"
- directory (created via save strip/graft operations), or it may be
- due to the fix for the abridgement issue described above, and it
- can also occur as the result of repairs (cf. `bup-get`(1)).
+ attributes. This might be because it's a "." entry for a
+ "synthetic" directory (created via save strip/graft operations), or
+ it could be due to the fix for the abridgement issue described
+ above, and it can also occur as the result of repairs
+ (cf. `bup-get`(1)).

- Repositories created before the introduction of split trees won't
of course have split trees, nor will current repositories with
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Move the missing configuration from opt to the specs since, even
though it's currently (and maybe always will be) global, it's more
accurately a property of the requested transfer, and doing so also
avoids needing extra function arguments to carry that information when
we already have the spec.

Store the missing config in a new MissingConfig dataclass that
supports 'fail' and 'ignore' modes. Change the Spec to a dataclass so
that we can more easily allow missing to default to None until we're
ready to fill it in.

Move validation of the missing config to the method-specific
resolvers since they're the domain experts.

All of this prepares for support of repository repairs via "--missing
replace...".

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 8 ++-
lib/bup/cmd/get.py | 114 ++++++++++++++++++++++++-------------
note/main.md | 7 +++
test/ext/test-get-missing | 14 +++--
4 files changed, 93 insertions(+), 50 deletions(-)

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index 44e99914..bddd0af1 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -187,9 +187,11 @@ used to help test before/after results.)
pack.compression or core.compression, or 1 (fast, loose
compression).

-\--ignore-missing
-: ignore missing objects encountered during a transfer. Currently
- only supported by `--unnamed`, and potentially *dangerous*.
+\--missing <fail|ignore>
+: when missing objects are encountered during a transfer, either
+ `fail` (exit with nonzero status, the default) or `ignore` them.
+ The latter is currently only supported by `--unnamed`, and is
+ potentially *dangerous*.

# EXAMPLES

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 885f5475..e78e0df8 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -1,13 +1,15 @@

from binascii import hexlify, unhexlify
from collections import namedtuple
+from dataclasses import replace as dcreplace
from contextlib import nullcontext
from stat import S_ISDIR
+from typing import Optional, Union
import os, sys, textwrap, time

from bup import client, compat, git, hashsplit, vfs
from bup.commit import commit_message
-from bup.compat import argv_bytes
+from bup.compat import argv_bytes, dataclass
from bup.config import derive_repo_addr
from bup.git import MissingObject, get_cat_data, parse_commit, walk_object
from bup.helpers import \
@@ -35,7 +37,8 @@ argspec = (
destination may be specified with -r, and data may be pulled from
a remote repository with the related "bup on HOST get ..."
command. The --exclude-rx and --exclude-rx-from options currently
- only apply to rewrites.""",
+ only apply to rewrites. Currently only --unnammed supports
+ "--missing ignore".""",

('optional arguments:',
(('-h, --help', 'show this help message and exit'),
@@ -54,6 +57,7 @@ argspec = (
('--exclude-rx REGEX', 'skip paths matching the unanchored regex (may be repeated)'),
('--exclude-rx-from PATH', 'skip --exclude-rx patterns in PATH (may be repeated)'),
('--bwlimit BWLIMIT', 'maximum bytes/sec to transmit to server'),
+ ('--missing <fail|ignore>', 'how to handle missing objects (default: fail)'),
('-0, -1, -2, -3, -4, -5, -6, -7, -8, -9, --compress LEVEL',
'set compression LEVEL (default: 1)'))),

@@ -118,7 +122,18 @@ def require_n_args_or_die(n, args):
assert len(result[0]) == n
return result

-Spec = namedtuple('Spec', ('method', 'src', 'dest'))
+@dataclass(slots=True, frozen=True)
+class MissingConfig:
+ mode: Union['fail', 'ignore']
+ def __post_init__(self):
+ assert self.mode in ('fail', 'ignore')
+
+@dataclass(slots=True, frozen=True)
+class Spec:
+ method: str
+ src: bytes
+ dest: bytes
+ missing: Optional[MissingConfig] = None

def spec_msg(s):
if not s.dest:
@@ -135,13 +150,13 @@ def parse_args(args):
opt.print_commits = opt.print_trees = opt.print_tags = False
opt.bwlimit = None
opt.compress = None
- opt.ignore_missing = False
opt.rewrite = None # None means "didn't specify"
opt.rewrite_db = None
opt.rewriter = None # internal, synthetic "option"...
opt.source = opt.remote = None
opt.target_specs = []

+ missing = MissingConfig('fail')
exclude_opts = []
remaining = args[1:] # Skip argv[0]
while remaining:
@@ -153,10 +168,15 @@ def parse_args(args):
opt.verbose += 1
remaining = remaining[1:]
elif arg == b'--ignore-missing':
- opt.ignore_missing = True
+ missing = MissingConfig('ignore')
remaining = remaining[1:]
+ elif arg == b'--missing':
+ (missing,), remaining = require_n_args_or_die(1, remaining)
+ if missing not in (b'fail', b'ignore'):
+ misuse('--missing argument must be fail or ignore')
+ missing = MissingConfig(missing.decode('ascii'))
elif arg == b'--no-ignore-missing':
- opt.ignore_missing = False
+ missing = MissingConfig('fail')
remaining = remaining[1:]
elif arg in (b'--ff', b'--append', b'--pick', b'--force-pick',
b'--new-tag', b'--replace', b'--unnamed'):
@@ -209,16 +229,15 @@ def parse_args(args):
opt.exclude_rxs = parse_rx_excludes(exclude_opts, misuse)
if opt.exclude_rxs and not opt.rewrite:
misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
- for target in opt.target_specs:
- if opt.ignore_missing and target.method != 'unnamed':
- misuse('currently only --unnamed allows --ignore-missing')
+ opt.target_specs = [dcreplace(x, missing=missing) for x in opt.target_specs]
return opt

# FIXME: client error handling (remote exceptions, etc.)

# FIXME: walk_object in in git.py doesn't support opt.verbose. Do we
# need to adjust for that here?
-def get_random_item(name, hash, src_repo, dest_repo, opt):
+def get_random_item(name, hash, src_repo, dest_repo, missing):
+ assert missing.mode in ('fail', 'ignore'), missing
def already_seen(oid):
return dest_repo.exists(unhexlify(oid))
def get_ref(oidx, include_data=False):
@@ -228,7 +247,7 @@ def get_random_item(name, hash, src_repo, dest_repo, opt):
include_data=True, result='item'):
assert isinstance(item, git.WalkItem)
if item.data is False:
- if not opt.ignore_missing:
+ if missing.mode == 'fail':
raise MissingObject(item.oid)
note_error(f'skipping missing source object {item.oid.hex()}\n')
continue
@@ -256,14 +275,14 @@ def get_random_item(name, hash, src_repo, dest_repo, opt):
dest_repo.just_write(item.oid, item.type, item.data)


-def transfer_commit(name, hash, parent, src_repo, dest_repo, opt):
+def transfer_commit(name, hash, parent, src_repo, dest_repo, missing):
now = time.time()
items = parse_commit(get_cat_data(src_repo.cat(hash), b'commit'))
tree = unhexlify(items.tree)
author = b'%s <%s>' % (items.author_name, items.author_mail)
author_time = (items.author_sec, items.author_offset)
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
- get_random_item(name, hexlify(tree), src_repo, dest_repo, opt)
+ get_random_item(name, hexlify(tree), src_repo, dest_repo, missing)
c = dest_repo.write_commit(tree, parent,
author, items.author_sec, items.author_offset,
committer, now, None,
@@ -271,12 +290,12 @@ def transfer_commit(name, hash, parent, src_repo, dest_repo, opt):
return c, tree


-def append_commit(src_loc, parent, src_repo, dest_repo, opt):
+def append_commit(src_loc, parent, src_repo, dest_repo, missing, opt):
if not opt.rewrite:
assert isinstance(src_loc, (bytes, Loc))
oidx = src_loc if isinstance(src_loc, bytes) else hexlify(src_loc.hash)
return transfer_commit(None, # unused
- oidx, parent, src_repo, dest_repo, opt)
+ oidx, parent, src_repo, dest_repo, missing)

# Friendlier checking was done during resolve_*
assert isinstance(src_loc, Loc), src_loc
@@ -289,14 +308,14 @@ def append_commit(src_loc, parent, src_repo, dest_repo, opt):
opt.exclude_rxs)


-def append_commits(src_loc, dest_hash, src_repo, dest_repo, opt):
+def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, opt):
if not opt.rewrite:
commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
commits.reverse()
last_c, tree = dest_hash, None
for commit in commits:
last_c, tree = append_commit(commit, last_c, src_repo, dest_repo,
- opt)
+ missing, opt)
assert tree is not None
return last_c, tree

@@ -418,7 +437,7 @@ def validate_vfs_path(p, spec):
return p


-def resolve_src(spec, src_repo, *, allow=None, ignore_missing=False):
+def resolve_src(spec, src_repo, *, allow=None):
assert allow in (None, 'git')
spec_args = spec_msg(spec)
if spec.src.startswith(b'git:'):
@@ -432,7 +451,7 @@ def resolve_src(spec, src_repo, *, allow=None, ignore_missing=False):
misuse('cannot fetch entire repository for %s' % spec_args)
if src.type == 'tags':
misuse('cannot fetch entire /.tag directory for %s' % spec_args)
- if not (src or ignore_missing):
+ if not (src or spec.missing.mode == 'ignore'):
misuse('cannot find source for %s' % spec_args)
debug1('src: %s\n' % loc_desc(src))
return src
@@ -453,11 +472,11 @@ def resolve_branch_dest(spec, src, src_repo, dest_repo):
if not spec.dest:
# Pick a default dest.
if src.type == 'branch':
- spec = spec._replace(dest=spec.src)
+ spec = dcreplace(spec, dest=spec.src)
elif src.type == 'save':
- spec = spec._replace(dest=get_save_branch(src_repo, spec.src))
+ spec = dcreplace(spec, dest=get_save_branch(src_repo, spec.src))
elif src.path.startswith(b'/.tag/'): # Dest defaults to the same.
- spec = spec._replace(dest=spec.src)
+ spec = dcreplace(spec, dest=spec.src)

spec_args = spec_msg(spec)
if not spec.dest:
@@ -482,6 +501,8 @@ def resolve_branch_dest(spec, src, src_repo, dest_repo):


def resolve_ff(spec, src_repo, dest_repo):
+ if spec.missing.mode == 'ignore':
+ misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if src.type == 'tree':
@@ -501,7 +522,8 @@ def handle_ff(item, src_repo, dest_repo, opt):
dest_oidx = hexlify(item.dest.hash) if item.dest.hash else None
if not dest_oidx or dest_oidx in src_repo.rev_list(src_oidx):
# Can fast forward.
- get_random_item(item.spec.src, src_oidx, src_repo, dest_repo, opt)
+ get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
+ item.spec.missing)
commit_items = parse_commit(get_cat_data(src_repo.cat(src_oidx), b'commit'))
return item.src.hash, unhexlify(commit_items.tree)
misuse('destination is not an ancestor of source for %s'
@@ -511,6 +533,8 @@ def handle_ff(item, src_repo, dest_repo, opt):


def resolve_append(spec, src_repo, dest_repo, *, rewrite):
+ if spec.missing.mode == 'ignore':
+ misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
if src.type not in ('branch', 'save', 'commit', 'tree'):
misuse('source for %s must be a branch, save, commit, or tree, not %s'
@@ -539,7 +563,8 @@ def handle_append(item, src_repo, dest_repo, opt):
src_oidx = hexlify(item.src.hash)
if opt.rewrite:
misuse(f'rewrite cannot yet promote tree to commit for {spec_msg(item.spec)}')
- get_random_item(item.spec.src, src_oidx, src_repo, dest_repo, opt)
+ get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
+ item.spec.missing)
parent = item.dest.hash
msg = commit_message(b'bup get', compat.get_argvb())
userline = b'%s <%s@%s>' % (userfullname(), username(), hostname())
@@ -550,10 +575,13 @@ def handle_append(item, src_repo, dest_repo, opt):
return commit, item.src.hash
if item.dest.hash:
assert item.dest.type in ('branch', 'commit', 'save'), item.dest
- return append_commits(item.src, item.dest.hash, src_repo, dest_repo, opt)
+ return append_commits(item.src, item.dest.hash, src_repo, dest_repo,
+ item.spec.missing, opt)


def resolve_pick(spec, src_repo, dest_repo, *, rewrite):
+ if spec.missing.mode == 'ignore':
+ misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if src.type == 'tree':
@@ -563,9 +591,9 @@ def resolve_pick(spec, src_repo, dest_repo, *, rewrite):
% (spec_args, src.type))
if not spec.dest:
if src.path.startswith(b'/.tag/'):
- spec = spec._replace(dest=spec.src)
+ spec = dcreplace(spec, dest=spec.src)
elif src.type == 'save':
- spec = spec._replace(dest=get_save_branch(src_repo, spec.src))
+ spec = dcreplace(spec, dest=get_save_branch(src_repo, spec.src))
if not spec.dest:
misuse('no destination provided for %s' % spec_args)
if rewrite:
@@ -593,18 +621,21 @@ def handle_pick(item, src_repo, dest_repo, opt):
# if the dest is committish, make it the parent
if item.dest.type in ('branch', 'commit', 'save'):
return append_commit(item.src, item.dest.hash, src_repo, dest_repo,
- opt)
+ item.spec.missing, opt)
assert item.dest.path.startswith(b'/.tag/'), item.dest
# no parent; either dest is a non-commit tag and we should clobber
# it, or dest doesn't exist.
- return append_commit(item.src, None, src_repo, dest_repo, opt)
+ return append_commit(item.src, None, src_repo, dest_repo, item.spec.missing,
+ opt)


def resolve_new_tag(spec, src_repo, dest_repo):
+ if spec.missing.mode == 'ignore':
+ misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if not spec.dest and src.path.startswith(b'/.tag/'):
- spec = spec._replace(dest=src.path)
+ spec = dcreplace(spec, dest=src.path)
if not spec.dest:
misuse('no destination (implicit or explicit) for %s' % spec_args)
dest = find_vfs_item(spec.dest, dest_repo)
@@ -622,16 +653,18 @@ def handle_new_tag(item, src_repo, dest_repo, opt):
assert item.spec.method == 'new-tag'
assert item.dest.path.startswith(b'/.tag/')
get_random_item(item.spec.src, hexlify(item.src.hash),
- src_repo, dest_repo, opt)
+ src_repo, dest_repo, item.spec.missing)
return (item.src.hash,)


def resolve_replace(spec, src_repo, dest_repo):
+ if spec.missing.mode == 'ignore':
+ misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if not spec.dest:
if src.path.startswith(b'/.tag/') or src.type == 'branch':
- spec = spec._replace(dest=spec.src)
+ spec = dcreplace(spec, dest=spec.src)
if not spec.dest:
misuse('no destination provided for %s' % spec_args)
dest = find_vfs_item(spec.dest, dest_repo)
@@ -652,19 +685,20 @@ def handle_replace(item, src_repo, dest_repo, opt):
assert(item.spec.method == 'replace')
if item.dest.path.startswith(b'/.tag/'):
get_random_item(item.spec.src, hexlify(item.src.hash),
- src_repo, dest_repo, opt)
+ src_repo, dest_repo, item.spec.missing)
return (item.src.hash,)
assert(item.dest.type == 'branch' or not item.dest.type)
src_oidx = hexlify(item.src.hash)
- get_random_item(item.spec.src, src_oidx, src_repo, dest_repo, opt)
+ get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
+ item.spec.missing)
commit_items = parse_commit(get_cat_data(src_repo.cat(src_oidx), b'commit'))
return item.src.hash, unhexlify(commit_items.tree)


-def resolve_unnamed(spec, src_repo, dest_repo, *, ignore_missing):
+def resolve_unnamed(spec, src_repo, dest_repo):
if spec.dest:
misuse('destination name given for %s' % spec_msg(spec))
- src = resolve_src(spec, src_repo, allow='git', ignore_missing=ignore_missing)
+ src = resolve_src(spec, src_repo, allow='git')
if src:
return Target(spec=spec, src=src, dest=None)
return None
@@ -672,11 +706,11 @@ def resolve_unnamed(spec, src_repo, dest_repo, *, ignore_missing):

def handle_unnamed(item, src_repo, dest_repo, opt):
get_random_item(item.spec.src, hexlify(item.src.hash),
- src_repo, dest_repo, opt)
+ src_repo, dest_repo, item.spec.missing)
return (None,)


-def resolve_targets(specs, src_repo, dest_repo, *, ignore_missing, rewrite):
+def resolve_targets(specs, src_repo, dest_repo, *, rewrite):
resolved_items = []
common_args = src_repo, dest_repo
for spec in specs:
@@ -694,8 +728,7 @@ def resolve_targets(specs, src_repo, dest_repo, *, ignore_missing, rewrite):
elif spec.method == 'replace':
resolved_items.append(resolve_replace(spec, *common_args))
elif spec.method == 'unnamed':
- tgt = resolve_unnamed(spec, *common_args,
- ignore_missing=ignore_missing)
+ tgt = resolve_unnamed(spec, *common_args)
if tgt:
resolved_items.append(tgt)
else: # Should be impossible -- prevented by the option parser.
@@ -765,7 +798,6 @@ def main(argv):
# broken cases).
target_items = resolve_targets(opt.target_specs,
src_repo, dest_repo,
- ignore_missing=opt.ignore_missing,
rewrite=opt.rewrite)
if opt.rewrite:
for item in target_items:
diff --git a/note/main.md b/note/main.md
index f750891c..e26482b8 100644
--- a/note/main.md
+++ b/note/main.md
@@ -106,6 +106,13 @@ General
e.g. its `bup.split.files` and `bup.split.trees` settings. See
`bup-get`(1) for additional information.

+* `bup get --missing <fail|ignore> ...` can now specify how to handle
+ missing objects that are encountered during a transfer. `fail`, the
+ default, causes bup to exit with a nonzero status, and `ignore`
+ causes bup to skip over them; `ignore` is currently only supported
+ by `--unnamed` and is potentially *dangerous*. `--missing ignore` is
+ the preferred replacement for the existing `--ignore-missing`.
+
* The default pack compression level can now be configured via either
`pack.compression` or `core.compression`. See `bup-config`(5) for
additional information.
diff --git a/test/ext/test-get-missing b/test/ext/test-get-missing
index 9d048b6e..6de4455f 100755
--- a/test/ext/test-get-missing
+++ b/test/ext/test-get-missing
@@ -39,12 +39,14 @@ WVFAIL bup -d dest-repo get -s bup --unnamed "git:$src_oid" 2>&1 | tee get.log
WVPASS grep -E 'raise MissingObject' get.log
WVPASS rm -rf dest-repo

-WVPASS bup -d dest-repo init
-WVFAIL bup -d dest-repo get --ignore-missing -s bup \
- --unnamed "git:$src_oid" 2>&1 \
- | tee get.log
-WVPASSEQ 1 "$(grep -cF "skipping missing source object ${bupm_oid}" get.log)"
-WVPASS rm -rf dest-repo
+for args in '--missing ignore' --ignore-missing; do
+ WVPASS bup -d dest-repo init
+ WVFAIL bup -d dest-repo get $args -s bup --unnamed "git:$src_oid" 2>&1 \
+ | tee get.log
+ WVPASSEQ 1 "$(grep -cF "skipping missing source object ${bupm_oid}" get.log)"
+ WVPASS rm -rf dest-repo
+done
+

WVPASS cd "$top"
WVPASS rm -rf "$tmpdir"
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Add the rewriter to the spec since it's part of the specification, and
also because rewriting is eventually going to become target-specific.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 135 ++++++++++++++++++++++++-------------------
test/ext/test_get.py | 10 +++-
2 files changed, 83 insertions(+), 62 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 2c286ca2..73c1c493 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -2,7 +2,6 @@
from binascii import hexlify, unhexlify
from collections import namedtuple
from dataclasses import replace as dcreplace
-from contextlib import nullcontext
from re import Pattern
from stat import S_ISDIR
from typing import Optional, Union
@@ -20,6 +19,7 @@ from bup.helpers import \
hostname,
log,
note_error,
+ nullctx,
parse_num,
parse_rx_excludes,
tty_width)
@@ -136,6 +136,7 @@ class Spec:
dest: bytes
missing: Optional[MissingConfig] = None
excludes: Optional[list[Pattern]] = None
+ rewriter: Optional[Union[bool, Rewriter]] = None

def spec_msg(s):
if not s.dest:
@@ -152,12 +153,15 @@ def parse_args(args):
opt.print_commits = opt.print_trees = opt.print_tags = False
opt.bwlimit = None
opt.compress = None
- opt.rewrite = None # None means "didn't specify"
opt.rewrite_db = None
- opt.rewriter = None # internal, synthetic "option"...
opt.source = opt.remote = None
opt.target_specs = []

+ # For now, rewriting is a "global" state, i.e. enabled for all
+ # specs or none. Since we don't want to create a Rewriter until
+ # we've finished checking the requests (e.g. are past the
+ # resolvers, True is used as an intermediate placeholder).
+ rewrite = None # None means "didn't specify"
missing = MissingConfig('fail')
exclude_opts = []
remaining = args[1:] # Skip argv[0]
@@ -201,9 +205,9 @@ def parse_args(args):
elif arg == b'--print-tags':
opt.print_tags, remaining = True, remaining[1:]
elif arg == b'--rewrite':
- opt.rewrite, remaining = True, remaining[1:]
+ rewrite, remaining = True, remaining[1:]
elif arg == b'--no-rewrite':
- opt.rewrite, remaining = False, remaining[1:]
+ rewrite, remaining = False, remaining[1:]
elif arg == b'--rewrite-db':
(opt.rewrite_db,), remaining = require_n_args_or_die(1, remaining)
elif arg in (b'--exclude-rx', b'--exclude-rx-from'): # handled later
@@ -229,9 +233,12 @@ def parse_args(args):
else:
misuse()
excludes = parse_rx_excludes(exclude_opts, misuse)
- if excludes and not opt.rewrite:
+ if excludes and not rewrite:
misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
- opt.target_specs = [dcreplace(x, missing=missing, excludes=excludes)
+ opt.target_specs = [dcreplace(x,
+ missing=missing,
+ excludes=excludes,
+ rewriter=rewrite)
for x in opt.target_specs]
return opt

@@ -293,8 +300,9 @@ def transfer_commit(name, hash, parent, src_repo, dest_repo, missing):
return c, tree


-def append_commit(src_loc, parent, src_repo, dest_repo, missing, excludes, opt):
- if not opt.rewrite:
+def append_commit(src_loc, parent, src_repo, dest_repo, missing, rewriter,
+ excludes):
+ if not rewriter:
assert isinstance(src_loc, (bytes, Loc)), src_loc
oidx = src_loc if isinstance(src_loc, bytes) else hexlify(src_loc.hash)
return transfer_commit(None, # unused
@@ -307,18 +315,18 @@ def append_commit(src_loc, parent, src_repo, dest_repo, missing, excludes, opt):
root, ref, save = path
assert isinstance(save[1], (vfs.Commit, vfs.FakeLink)), path
assert isinstance(ref[1], vfs.RevList), path
- return opt.rewriter.append_save(path, parent, src_repo, dest_repo, excludes)
+ return rewriter.append_save(path, parent, src_repo, dest_repo, excludes)


-def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, excludes,
- opt):
- if not opt.rewrite:
+def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, rewriter,
+ excludes):
+ if not rewriter:
commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
commits.reverse()
last_c, tree = dest_hash, None
for commit in commits:
last_c, tree = append_commit(commit, last_c, src_repo, dest_repo,
- missing, excludes, opt)
+ missing, rewriter, excludes)
assert tree is not None
return last_c, tree

@@ -344,9 +352,9 @@ def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, excludes,
last_c, tree = dest_hash, None
for commit in commits:
coid = unhexlify(commit)
- last_c, tree = opt.rewriter.append_save(path + (entry_for_coid[coid],),
- last_c, src_repo, dest_repo,
- excludes)
+ last_c, tree = rewriter.append_save(path + (entry_for_coid[coid],),
+ last_c, src_repo, dest_repo,
+ excludes)
assert tree is not None
return last_c, tree

@@ -504,6 +512,8 @@ def resolve_branch_dest(spec, src, src_repo, dest_repo):


def resolve_ff(spec, src_repo, dest_repo):
+ if spec.rewriter:
+ misuse(f'--{spec.method} cannot rewrite (use --pick)')
if spec.missing.mode == 'ignore':
misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
@@ -518,7 +528,7 @@ def resolve_ff(spec, src_repo, dest_repo):
return Target(spec=spec, src=src, dest=dest)


-def handle_ff(item, src_repo, dest_repo, opt):
+def handle_ff(item, src_repo, dest_repo):
assert item.spec.method == 'ff'
assert item.src.type in ('branch', 'save', 'commit')
src_oidx = hexlify(item.src.hash)
@@ -535,7 +545,7 @@ def handle_ff(item, src_repo, dest_repo, opt):
return None


-def resolve_append(spec, src_repo, dest_repo, *, rewrite):
+def resolve_append(spec, src_repo, dest_repo):
if spec.missing.mode == 'ignore':
misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
@@ -543,7 +553,7 @@ def resolve_append(spec, src_repo, dest_repo, *, rewrite):
misuse('source for %s must be a branch, save, commit, or tree, not %s'
% (spec_msg(spec), src.type))
spec, dest = resolve_branch_dest(spec, src, src_repo, dest_repo)
- if rewrite:
+ if spec.rewriter:
def vpm(path):
return path_msg(b"/".join(x[0] for x in src_path))
if not isinstance(src, Loc):
@@ -558,13 +568,13 @@ def resolve_append(spec, src_repo, dest_repo, *, rewrite):
return Target(spec=spec, src=src, dest=dest)


-def handle_append(item, src_repo, dest_repo, opt):
+def handle_append(item, src_repo, dest_repo):
assert item.spec.method == 'append'
assert item.src.type in ('branch', 'save', 'commit', 'tree')
assert item.dest.type == 'branch' or not item.dest.type
if item.src.type == 'tree':
src_oidx = hexlify(item.src.hash)
- if opt.rewrite:
+ if item.spec.rewriter:
misuse(f'rewrite cannot yet promote tree to commit for {spec_msg(item.spec)}')
get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
item.spec.missing)
@@ -579,10 +589,11 @@ def handle_append(item, src_repo, dest_repo, opt):
if item.dest.hash:
assert item.dest.type in ('branch', 'commit', 'save'), item.dest
return append_commits(item.src, item.dest.hash, src_repo, dest_repo,
- item.spec.missing, item.spec.excludes, opt)
+ item.spec.missing, item.spec.rewriter,
+ item.spec.excludes)


-def resolve_pick(spec, src_repo, dest_repo, *, rewrite):
+def resolve_pick(spec, src_repo, dest_repo):
if spec.missing.mode == 'ignore':
misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
@@ -599,7 +610,7 @@ def resolve_pick(spec, src_repo, dest_repo, *, rewrite):
spec = dcreplace(spec, dest=get_save_branch(src_repo, spec.src))
if not spec.dest:
misuse('no destination provided for %s' % spec_args)
- if rewrite:
+ if spec.rewriter:
if src.type != 'save':
misuse(f'cannot currently --rewrite a {src.type}')
dest = find_vfs_item(spec.dest, dest_repo)
@@ -617,22 +628,25 @@ def resolve_pick(spec, src_repo, dest_repo, *, rewrite):
return Target(spec=spec, src=src, dest=dest)


-def handle_pick(item, src_repo, dest_repo, opt):
+def handle_pick(item, src_repo, dest_repo):
assert item.spec.method in ('pick', 'force-pick')
assert item.src.type in ('save', 'commit')
if item.dest.hash:
# if the dest is committish, make it the parent
if item.dest.type in ('branch', 'commit', 'save'):
return append_commit(item.src, item.dest.hash, src_repo, dest_repo,
- item.spec.missing, item.spec.excludes, opt)
+ item.spec.missing, item.spec.rewriter,
+ item.spec.excludes)
assert item.dest.path.startswith(b'/.tag/'), item.dest
# no parent; either dest is a non-commit tag and we should clobber
# it, or dest doesn't exist.
return append_commit(item.src, None, src_repo, dest_repo, item.spec.missing,
- item.spec.excludes, opt)
+ item.spec.rewriter, item.spec.excludes)


def resolve_new_tag(spec, src_repo, dest_repo):
+ if spec.rewriter:
+ misuse(f'--{spec.method} cannot currently rewrite')
if spec.missing.mode == 'ignore':
misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
@@ -652,7 +666,7 @@ def resolve_new_tag(spec, src_repo, dest_repo):
return Target(spec=spec, src=src, dest=dest)


-def handle_new_tag(item, src_repo, dest_repo, opt):
+def handle_new_tag(item, src_repo, dest_repo):
assert item.spec.method == 'new-tag'
assert item.dest.path.startswith(b'/.tag/')
get_random_item(item.spec.src, hexlify(item.src.hash),
@@ -661,6 +675,8 @@ def handle_new_tag(item, src_repo, dest_repo, opt):


def resolve_replace(spec, src_repo, dest_repo):
+ if spec.rewriter:
+ misuse(f'--{spec.method} cannot currently rewrite')
if spec.missing.mode == 'ignore':
misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
@@ -684,7 +700,7 @@ def resolve_replace(spec, src_repo, dest_repo):
return Target(spec=spec, src=src, dest=dest)


-def handle_replace(item, src_repo, dest_repo, opt):
+def handle_replace(item, src_repo, dest_repo):
assert(item.spec.method == 'replace')
if item.dest.path.startswith(b'/.tag/'):
get_random_item(item.spec.src, hexlify(item.src.hash),
@@ -699,6 +715,8 @@ def handle_replace(item, src_repo, dest_repo, opt):


def resolve_unnamed(spec, src_repo, dest_repo):
+ if spec.rewriter:
+ misuse(f'--{spec.method} cannot currently rewrite')
if spec.dest:
misuse('destination name given for %s' % spec_msg(spec))
src = resolve_src(spec, src_repo, allow='git')
@@ -707,13 +725,13 @@ def resolve_unnamed(spec, src_repo, dest_repo):
return None


-def handle_unnamed(item, src_repo, dest_repo, opt):
+def handle_unnamed(item, src_repo, dest_repo):
get_random_item(item.spec.src, hexlify(item.src.hash),
src_repo, dest_repo, item.spec.missing)
return (None,)


-def resolve_targets(specs, src_repo, dest_repo, *, rewrite):
+def resolve_targets(specs, src_repo, dest_repo):
resolved_items = []
common_args = src_repo, dest_repo
for spec in specs:
@@ -721,11 +739,9 @@ def resolve_targets(specs, src_repo, dest_repo, *, rewrite):
if spec.method == 'ff':
resolved_items.append(resolve_ff(spec, *common_args))
elif spec.method == 'append':
- resolved_items.append(resolve_append(spec, *common_args,
- rewrite=rewrite))
+ resolved_items.append(resolve_append(spec, *common_args))
elif spec.method in ('pick', 'force-pick'):
- resolved_items.append(resolve_pick(spec, *common_args,
- rewrite=rewrite))
+ resolved_items.append(resolve_pick(spec, *common_args))
elif spec.method == 'new-tag':
resolved_items.append(resolve_new_tag(spec, *common_args))
elif spec.method == 'replace':
@@ -778,6 +794,8 @@ def main(argv):
opt.source = argv_bytes(opt.source)
if opt.bwlimit:
client.bwlimit = parse_num(opt.bwlimit)
+ if not opt.target_specs:
+ misuse('no methods specified')

with LocalRepo(repo_dir=opt.source) as src_repo, \
make_repo(derive_repo_addr(remote=opt.remote, die=misuse),
@@ -786,32 +804,27 @@ def main(argv):
src_split_cfg = hashsplit.configuration(src_repo.config_get)
dest_split_cfg = hashsplit.configuration(dest_repo.config_get)

- if src_split_cfg != dest_split_cfg and opt.rewrite is None:
+ # For now (maybe forever), they're all the same
+ rewrite = opt.target_specs[0].rewriter
+ assert all(x.rewriter == rewrite for x in opt.target_specs), \
+ [x.rewriter for x in opt.target_specs]
+
+ if src_split_cfg != dest_split_cfg and rewrite is None:
misuse('repository configs differ; specify --rewrite or --no-rewrite')

- opt.rewriter = \
- Rewriter(split_cfg=dest_split_cfg, db=opt.rewrite_db) \
- if opt.rewrite else None
-
- with opt.rewriter or nullcontext():
-
- # Resolve and validate all sources and destinations,
- # implicit or explicit, and do it up-front, so we can
- # fail before we start writing (for any obviously
- # broken cases).
- target_items = resolve_targets(opt.target_specs,
- src_repo, dest_repo,
- rewrite=opt.rewrite)
- if opt.rewrite:
- for item in target_items:
- if item.spec.method in ('append', 'force-pick', 'pick'):
- continue
- elif item.spec.method == 'ff':
- misuse(f'--ff cannot rewrite (use --pick)')
- elif item.spec.method in ('new-tag', 'replace', 'unnamed'):
- misuse(f'--{item.spec.method} cannot currently rewrite')
- else:
- assert False, f'unexpected method {item.spec.method}'
+ # Resolve and validate all sources and destinations, implicit
+ # or explicit, combinations of methods and modes (rewrite,
+ # missing, etc.) and do it up-front, so we can fail before we
+ # start writing (for any obviously broken cases). Do this
+ # before creating any database via the Rewriter.
+ target_items = resolve_targets(opt.target_specs, src_repo, dest_repo)
+
+ with (Rewriter(split_cfg=dest_split_cfg, db=opt.rewrite_db) \
+ if rewrite else nullctx) as rewriter:
+
+ target_items = [(x if not x.spec.rewriter
+ else x._replace(spec=dcreplace(x.spec, rewriter=rewriter)))
+ for x in target_items]

updated_refs = {} # ref_name -> (original_ref, tip_commit(bin))
no_ref_info = (None, None)
@@ -843,7 +856,7 @@ def main(argv):
cur_ref = cur_ref or dest_hash

handler = handlers[item.spec.method]
- item_result = handler(item, src_repo, dest_repo, opt)
+ item_result = handler(item, src_repo, dest_repo)
if len(item_result) > 1:
new_id, tree = item_result
else:
diff --git a/test/ext/test_get.py b/test/ext/test_get.py
index 27bf3a24..75206972 100644
--- a/test/ext/test_get.py
+++ b/test/ext/test_get.py
@@ -8,7 +8,7 @@ import os, pytest, re, sys

from bup import compat, path
from bup.compat import environ
-from bup.helpers import bquote, unlink
+from bup.helpers import EXIT_FAILURE, bquote, unlink
from bup.io import byte_stream
from buptest import ex, exo
from wvpytest import wvcheck, wvfail, wvmsg, wvpass, wvpasseq, wvpassne, wvstart
@@ -297,6 +297,14 @@ def run_get(disposition, method, what=None, given=None, rewrite=False):
return _run_get(disposition, method, what, rewrite)

def _test_universal(get_disposition, src_info):
+ if get_disposition == 'get':
+ wvstart("can't do nothing")
+ rmrf(b'get-dest')
+ ex((bup_cmd, b'-d', b'get-dest', b'init'))
+ exr = exo((bup_cmd, b'-d', b'get-dest', b'get'),
+ check=False, stderr=PIPE)
+ wvpasseq(EXIT_FAILURE, exr.rc)
+ verify_rx(br'no methods specified', exr.err)
methods = (b'--ff', b'--append', b'--pick', b'--force-pick', b'--new-tag',
b'--replace', b'--unnamed')
for method in methods:
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
index 27bef880..2a647951 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/cmd/rewrite.py
@@ -13,6 +13,8 @@ from bup.helpers import \
(handle_ctrl_c, path_components,
valid_save_name, log,
parse_rx_excludes,
+ qprogress,
+ reprogress,
should_rx_exclude_path)
from bup.io import path_msg, qsql_id
from bup.tree import Stack
@@ -230,12 +232,15 @@ def rewrite_branch(srcrepo, src, dstrepo, dst, excludes, workdb, fatal):
# Maintain a stack of information representing the current
# location in the archive being constructed.
parent = None
+ i, n = 0, len(commits)
for commit, (tree, timestamp) in commits:
+ i += 1
stack = Stack(dstrepo, split_cfg)

commit_name = commit_oid_name[unhexlify(commit)]
- log(b'Rewriting /%s/%s/ (%s)...\n'
- % (src, commit_name, commit[:12]))
+ pm = f'{path_msg(src)}/{path_msg(commit_name)}'
+ orig_oidm = commit[:12].decode("ascii")
+ qprogress(f'{i}/{n} {orig_oidm} {pm}\r')

citem = vfs.Commit(meta=vfs.default_dir_mode, oid=tree,
coid=commit)
@@ -263,6 +268,10 @@ def rewrite_branch(srcrepo, src, dstrepo, dst, excludes, workdb, fatal):
ci.committer_offset,
ci.message)
parent = newref
+ new_oidm = newref.hex()[:12]
+ log(f'{orig_oidm} -> {new_oidm} {pm}\n')
+ reprogress()
+
dstrepo.update_ref(dstref, newref, None)
finally:
workdb.commit() # the workdb is always ready for commit
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
When walking the source tree, yield paths that have the correct/full
metadata for all the items, including the directory itself if the path
is a directory. This allows us to drop the calls to resolve() the
metadata.

When we encounter a directory that's already been converted, just
provide the oid so we can call append_to_current() directly to add it
to the parent.

Thanks to Johannes Berg for investigating and diagnosing the problem
and then devising a similar solution that led directly to this one.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/rewrite.py | 160 +++++++++++++++++++++++++--------------------
lib/bup/tree.py | 1 +
2 files changed, 89 insertions(+), 72 deletions(-)

diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 96318235..233cd739 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -5,19 +5,14 @@ from itertools import chain
from os.path import join as joinp
from stat import S_ISDIR, S_ISLNK, S_ISREG
from typing import Any, Sequence
-import os, sqlite3, time
+import sqlite3, time

from bup import hashsplit, metadata, vfs
from bup.commit import commit_message
from bup.compat import dataclass
from bup.git import get_cat_data, parse_commit
from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK, GIT_MODE_TREE
-from bup.helpers import \
- (hostname,
- log,
- path_components,
- should_rx_exclude_path,
- temp_dir)
+from bup.helpers import hostname, log, should_rx_exclude_path, temp_dir
from bup.io import path_msg, qsql_id
from bup.metadata import Metadata
from bup.pwdgrp import userfullname, username
@@ -152,8 +147,12 @@ class IncompleteDir:
missing: bytes # MissingObject oid

def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
- missing):
- """Yield the paths underneath the given path.
+ missing, *, _replacement_parents=None):
+ """Yield information about the paths underneath the given path.
+
+ Yield (src_path, replacement_dir), where src_path is a vfs_path
+ and replacement_dir is the replacement tree oid for a src_path
+ representing a directory that has already been rewritten.

When unreadable objects are encountered, raise MissingObject if
missing.mode is 'fail', otherwise, for missing.mode 'replace',
@@ -163,6 +162,9 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
"""
assert isinstance(missing, MissingConfig), missing
assert missing.mode in ('fail', 'replace'), missing
+ if _replacement_parents is None:
+ _replacement_parents = tuple([])
+
item = path[-1][1]
assert len(path) >= 3
# drop branch/DATE
@@ -179,32 +181,48 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
# itself isn't broken; its contents may be.
entries = list(vfs.contents(srcrepo, item))
except MissingObject as ex:
- yield IncompleteDir(path, ex.oid)
+ yield IncompleteDir(path, ex.oid), None
return
+
+ path_w_meta = None
for entry in entries:
name, sub_item = entry
- sub_path = path + (entry,)
- if name in (b'.', b'..'):
+ if name == b'.':
+ # contents() promises this
+ assert path_w_meta is None, 'two "." dir entries encountered?!'
+ # Create version of path with its real metadata, not the
+ # contents() placeholder mode for dirs.
+ assert isinstance(entry[1].meta, Metadata), entry
+ dir_name, dir_item = path[-1]
+ path_w_meta = path[:-1] \
+ + ((dir_name, dir_item._replace(meta=entry[1].meta)),)
continue
sub_fs_path_in_save = joinp(fs_path_in_save, name)
if S_ISDIR(vfs.item_mode(sub_item)):
sub_fs_path_in_save += b'/'
if should_rx_exclude_path(sub_fs_path_in_save, excludes):
continue
+ assert path_w_meta is not None, '"." not before children in dir'
+ sub_path = path_w_meta + (entry,)
if not S_ISDIR(vfs.item_mode(sub_item)):
- yield sub_path
+ yield sub_path, None
else:
conv_item, oid, _ = \
_previous_conversion(dstrepo, sub_item, True, db, mapping)
+ assert conv_item.oid == sub_item.oid
if conv_item is not sub_item:
sub_path = sub_path[:-1] + ((sub_path[-1][0], conv_item),)
if oid:
- yield sub_path
+ yield sub_path, oid
else:
+ sub_rpath = _replacement_parents + (conv_item.oid,)
yield from _vfs_walk_dir_recursively(srcrepo, dstrepo, sub_path,
excludes, db, mapping,
- missing)
- yield path
+ missing,
+ _replacement_parents=sub_rpath)
+ assert path_w_meta is not None, f'{path_msg(fs_path_in_save)} has no "."'
+ assert isinstance(path_w_meta[-1][1].meta, Metadata), path_w_meta
+ yield path_w_meta, None

def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing):
assert isinstance(missing, MissingConfig), missing
@@ -257,8 +275,8 @@ def _remember_rewrite(from_oid, to_oid, chunked, size, wdbc, mapping):
' values (?, ?, ?, ?)',
(from_oid, to_oid, chunked, size))

-def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
- wdbc, mapping, missing):
+def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
+ split_cfg, stack, wdbc, mapping, missing):
"""Returns either None, or, if a directory was missing, the
directory path components.

@@ -272,41 +290,28 @@ def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
incomplete = path
path = incomplete.path

- # save_path is the vfs path to the save ref, e.g. to branch/DATE
-
- fs_path = _fs_path_from_vfs((path[0],) + path[3:]) # not including /branch/DATE
- assert fs_path.startswith(b'/'), fs_path
- fs_path = fs_path[1:] # because resolve(parent=...)
-
- dirn, filen = os.path.split(b'/' + fs_path)
- assert dirn.startswith(b'/')
- dirp = path_components(dirn)
+ # save_path is the full vfs save path e.g. branch/DATE.
+ fs_path = path[2:] # drop everything before the save
+ assert isinstance(fs_path[0][1], vfs.Commit), fs_path[0]
+ name, item = path[-1]
+ item_mode = vfs.item_mode(item)
+ is_dir = S_ISDIR(item_mode)
+ dir_path = fs_path if is_dir else fs_path[:-1]

- # If switching to a new sub-tree, finish the current sub-tree.
- while list(stack.path()) > [x[0] for x in dirp]:
+ # If switching to a new sub-tree, finish the current sub-tree, and
+ # then we'll establish the sub-tree for the new sub-tree via
+ # extend_stack for the missing components.
+ while list(stack.path()) > [x[0] for x in dir_path]:
stack.pop()

- def push_parents(parents):
- # FIXME: add missing object support
- # If switching to a new sub-tree, start a new sub-tree.
- comp_parent = None
- for path_component in parents:
- comp_name, comp_path = path_component
- if comp_parent:
- dir_res = vfs.resolve(srcrepo, comp_name, parent=comp_parent)
- else:
- full_comp_path = b'/'.join([x[0] for x in save_path]) + comp_path
- dir_res = vfs.resolve(srcrepo, full_comp_path)
- meta = dir_res[-1][1].meta
- if not isinstance(meta, metadata.Metadata):
- meta = None
- stack.push(comp_name, meta)
- comp_parent = dir_res
-
- if incomplete:
+ def extend_stack(parents):
+ for parent in parents:
+ stack.push(parent[0], parent[1].meta)
+
+ if incomplete: # must be a dir
+ assert replacement_dir is None, replacement_dir
assert missing.mode == 'replace', missing
- # everything except the dir we're replacing
- push_parents(dirp[:-1][len(stack):])
+ extend_stack(dir_path[len(stack):-1])
repair_info = missing.repair_info
# For now, wholesale replacement (no attempt to handle
# partially readable split trees).
@@ -325,14 +330,9 @@ def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
replacement.oid, replacement.meta)
return

- push_parents(dirp[len(stack):])
-
- item = path[-1][1]
-
# First, things that can't be affected by the rewrite
- item_mode = vfs.item_mode(item)
if S_ISLNK(item_mode):
- assert filen == path[-1][0]
+ extend_stack(dir_path[len(stack):])
_rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing)
return
if not S_ISREG(item_mode) and not S_ISDIR(item_mode):
@@ -340,35 +340,50 @@ def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
# described by its metadata, and so bup just saves an empty
# "placeholder" blob in the git tree (so the tree and .bupm
# will match up).
+ extend_stack(dir_path[len(stack):])
git_mode, oid = GIT_MODE_FILE, dstrepo.write_data(b'')
- stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)
+ stack.append_to_current(name, item_mode, git_mode, oid, item.meta)
return

- item, oid, git_mode = \
- _previous_conversion(dstrepo, item, not filen, wdbc, mapping)
-
- if not filen:
- # Since there's no filename, this is a subdir -- finish it.
+ if is_dir: # dirs come after their contents, so finish up
+ assert is_dir, path
assert S_ISDIR(item_mode)
- assert git_mode is None, item.oid.hex() # for both exists and not
+ if replacement_dir is not None:
+ # This is a directory that we've already converted; don't
+ # push/pop it, just add the previously generated tree to
+ # the parent.
+ extend_stack(dir_path[len(stack):-1]) # establish the parent
+ dir_name, dir_item = dir_path[-1]
+ stack.append_to_current(dir_name, GIT_MODE_TREE, GIT_MODE_TREE,
+ replacement_dir, None)
+ return
+ extend_stack(dir_path[len(stack):]) # establish the parent
if len(stack) == 1:
return # We're at the top level -- keep the current root dir
- newtree = stack.pop(override_tree=oid)
+ newtree = stack.pop()
+ assert len(item.oid) == 20, item.oid
+ assert len(newtree) == 20, newtree
# Don't remember any trees when we're making destructive
# repairs because walk will skip the contents for a tree that
# has missing objects when it encounters it a second time (for
# say the second of two saves during an --append), which will
# omit the logging, repair trailers, etc.
- if oid is None and missing.mode != 'replace':
+ if not missing.mode == 'replace':
wdbc.execute(f'insert into {mapping} (src, dst) values (?, ?)',
(item.oid, newtree))
return

+ extend_stack(dir_path[len(stack):])
+
+ item, oid, git_mode = \
+ _previous_conversion(dstrepo, item, is_dir, wdbc, mapping)
+ item_mode = vfs.item_mode(item)
+
assert S_ISREG(item_mode)
if oid is not None:
# already converted - oid and mode are known
assert git_mode in (GIT_MODE_TREE, GIT_MODE_FILE)
- stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)
+ stack.append_to_current(name, item_mode, git_mode, oid, item.meta)
return

item_size = None
@@ -397,7 +412,7 @@ def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
# contextual argument, and because the type may change from
# tree to blob.
assert replacement.meta.mode == default_file_mode, repr(replacement)
- stack.append_to_current(filen, replacement.meta.mode, GIT_MODE_FILE,
+ stack.append_to_current(name, replacement.meta.mode, GIT_MODE_FILE,
replacement.oid, replacement.meta)
return

@@ -411,7 +426,7 @@ def _rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack,
chunked = 1 if S_ISDIR(git_mode) else 0

_remember_rewrite(item.oid, oid, chunked, item_size, wdbc, mapping)
- stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)
+ stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

class Rewriter:
def __init__(self, *, split_cfg, db=None):
@@ -469,11 +484,12 @@ class Rewriter:
# and so if a dir is broken, we'll see that "up
# front", and never produce any children.

- for path in _vfs_walk_dir_recursively(srcrepo, dstrepo,
- save_path, excludes,
- dbc, self._mapping,
- missing):
- _rewrite_save_item(save_path, path, srcrepo, dstrepo,
+ for path, replacement_dir \
+ in _vfs_walk_dir_recursively(srcrepo, dstrepo, save_path,
+ excludes, dbc, self._mapping,
+ missing):
+ _rewrite_save_item(save_path, path, replacement_dir,
+ srcrepo, dstrepo,
self._split_cfg, stack, dbc,
self._mapping, missing)

diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index 7d478f5b..7424d65d 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -115,6 +115,7 @@ class Stack:
return [p.name for p in self._stack]

def push(self, name, meta):
+ assert isinstance(meta, (Metadata, type(None))), meta
self._stack.append(StackDir(name, meta))

def _clean(self, tree):
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 945e3ffb..89af04bb 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -236,7 +236,7 @@ def parse_args(args):
# FIXME
continue
else:
- misuse()
+ misuse(f'unrecognized argument: {path_msg(arg)}')
if opt.repair_info is None:
opt.repair_info = RepairInfo(str(uuid4()).encode('ascii'),
command=get_argvb())
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Thanks to Johannes Berg for the suggestion.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
index 9dce11c5..1a6f7086 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/cmd/rewrite.py
@@ -73,6 +73,9 @@ def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
dst, chunked, size = data
if chunked:
assert S_ISREG(item_mode)
+ if not dstrepo.exists(dst):
+ # only happens if you reuse a database
+ return item, None, None
# augment the size if appropriate
if size is not None and isinstance(item.meta, metadata.Metadata):
if item.meta.size is not None:
@@ -80,13 +83,10 @@ def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
else: # must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
item.meta.size = size
- # if we have it in the DB and in the destination repo, return it
- if dstrepo.exists(dst):
- if chunked is None: # dir, not file
- return item, dst, None
- return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE
- # this only happens if you reuse a database
- return item, None, None
+ # it's in the DB and in the destination repo
+ if chunked is None: # dir, not file
+ return item, dst, None
+ return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
fullname=b''):
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:33 PM (3 days ago) Dec 10
to bup-...@googlegroups.com, Johannes Berg
From: Johannes Berg <joha...@sipsolutions.net>

Add an initial, unfinished bup rewrite command, primarily intended for
now to allow resplitting a branch with respect to the (current)
destination repository configuration (e.g. for bup.split.files and
bup.split.trees).

Signed-off-by: Johannes Berg <joha...@sipsolutions.net>
Reviewed-by: Rob Browning <r...@defaultvalue.org>
[r...@defaultvalue.org: adjust commit message]
[r...@defaultvalue.org: drop vestigial stdout flush]
[r...@defaultvalue.org: switch to log f-strings, adjust a few asserts, etc.]
[r...@defaultvalue.org: adjust test quoting; check args counts; etc.]
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 279 +++++++++++++++++++++++++++++++++++++++++
lib/bup/tree.py | 8 +-
test/ext/test-rewrite | 148 ++++++++++++++++++++++
3 files changed, 433 insertions(+), 2 deletions(-)
create mode 100755 lib/bup/cmd/rewrite.py
create mode 100755 test/ext/test-rewrite

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
new file mode 100755
index 00000000..105ff99a
--- /dev/null
+++ b/lib/bup/cmd/rewrite.py
@@ -0,0 +1,279 @@
+
+from binascii import hexlify, unhexlify
+from contextlib import closing
+from stat import S_ISDIR, S_ISLNK, S_ISREG
+import os
+import sqlite3
+
+from bup import hashsplit, git, options, repo, metadata, vfs
+from bup.compat import argv_bytes
+from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK
+from bup.helpers import \
+ (handle_ctrl_c, path_components,
+ valid_save_name, log,
+ parse_rx_excludes,
+ should_rx_exclude_path)
+from bup.io import path_msg
+from bup.tree import Stack
+from bup.repo import make_repo
+from bup.config import derive_repo_addr, ConfigError
+
+
+optspec = """
+bup rewrite -s srcrepo <branch-name>
+--
+s,source= source repository
+r,remote= remote destination repository
+work-db= work database filename (required, can be deleted after running)
+exclude-rx= skip paths matching the unanchored regex (may be repeated)
+exclude-rx-from= skip --exclude-rx patterns in file (may be repeated)
+"""
+
+
+def converted_already(dstrepo, item, vfs_dir, db, mapping):
+ size = -1 # irrelevant
+ mode = item.meta
+ if isinstance(item.meta, metadata.Metadata):
+ size = item.meta.size
+ mode = item.meta.mode
+ # if we know the size, and the oid exists already (small file w/o
+ # hashsplit) then simply return it; we can't do that if it's a
+ # directory, since it might exist but in the non-augmented
+ # version, so dirs always go through the database lookup
+
+ # FIXME: this seems wrong - what if we're splitting in-repo to smaller chunks?
+ #if not vfs_dir and size is not None and dstrepo.exists(item.oid):
+ # return item.oid, mode
+ db.execute(f'select dst, mode, size from {mapping} where src = ?',
+ (item.oid,))
+ data = db.fetchone()
+ # if it's not found, then we don't know anything
+ if not data:
+ return None, None
+ dst, mode, size = data
+ # augment the size if appropriate
+ if size is not None and isinstance(item.meta, metadata.Metadata):
+ assert item.meta.size is None or item.meta.size == size
+ item.meta.size = size
+ # if we have it in the DB and in the destination repo, return it
+ if dstrepo.exists(dst):
+ return dst, mode
+ # this only happens if you reuse a database
+ return None, None
+
+def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
+ fullname=b''):
+ for name, item in vfs.contents(srcrepo, vfs_item):
+ if name in (b'.', b'..'):
+ continue
+ itemname = fullname + b'/' + name
+ check_name = itemname + (b'/' if S_ISDIR(vfs.item_mode(item)) else b'')
+ if should_rx_exclude_path(check_name, excludes):
+ continue
+ if S_ISDIR(vfs.item_mode(item)):
+ if converted_already(dstrepo, item, True, db, mapping)[0] is None:
+ yield from vfs_walk_recursively(srcrepo, dstrepo, item,
+ excludes, db, mapping,
+ fullname=itemname)
+ # and the dir itself
+ yield itemname + b'/', item
+ else:
+ yield itemname, item
+
+def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
+ stack, wdbc, mapping):
+ dirn, filen = os.path.split(fullname)
+ assert dirn.startswith(b'/')
+ dirp = path_components(dirn)
+
+ # If switching to a new sub-tree, finish the current sub-tree.
+ while list(stack.path()) > [x[0] for x in dirp]:
+ stack.pop()
+
+ # If switching to a new sub-tree, start a new sub-tree.
+ for path_component in dirp[len(stack):]:
+ dir_name, fs_path = path_component
+
+ dir_item = vfs.resolve(srcrepo, src + b'/' + commit_name + b'/' + fs_path)
+ meta = dir_item[-1][1].meta
+ if not isinstance(meta, metadata.Metadata):
+ meta = None
+ stack.push(dir_name, meta)
+
+ oid, mode = converted_already(dstrepo, item, not filen, wdbc, mapping)
+
+ if not filen:
+ if len(stack) == 1:
+ return # We're at the top level -- keep the current root dir
+ # Since there's no filename, this is a subdir -- finish it.
+ newtree = stack.pop(override_tree=oid)
+ if oid is None:
+ wdbc.execute(f'insert into {mapping} (src, dst) values (?, ?)',
+ (item.oid, newtree))
+ return
+
+ vfs_mode = vfs.item_mode(item)
+
+ # already converted - id is known, item.meta was updated if needed
+ # (in converted_already()), and the proper new mode was returned
+ if oid is not None:
+ assert mode is not None, oid
+ stack.append_to_current(filen, vfs_mode, mode, oid, item.meta)
+ return
+
+ item_size = None
+ size_augmented = False
+ if S_ISREG(vfs_mode):
+ item_size = 0
+ def write_data(data):
+ nonlocal item_size
+ item_size += len(data)
+ return dstrepo.write_data(data)
+ with vfs.tree_data_reader(srcrepo, item.oid) as f:
+ mode, oid = hashsplit.split_to_blob_or_tree(
+ write_data, dstrepo.write_tree,
+ hashsplit.from_config([f], split_cfg))
+ if isinstance(item.meta, metadata.Metadata):
+ if item.meta.size is None:
+ item.meta.size = item_size
+ size_augmented = True
+ else:
+ assert item.meta.size == item_size
+ elif S_ISDIR(vfs_mode):
+ assert False # handled above
+ elif S_ISLNK(vfs_mode):
+ target = vfs.readlink(srcrepo, item)
+ mode, oid = (GIT_MODE_SYMLINK, dstrepo.write_symlink(target))
+ if isinstance(item.meta, metadata.Metadata):
+ if item.meta.size is None:
+ item.meta.size = len(item.meta.symlink_target)
+ size_augmented = True
+ else:
+ assert item.meta.size == len(item.meta.symlink_target)
+ item_size = len(target)
+ else:
+ # Everything else should be fully described by its metadata,
+ # so just record an empty blob, so the paths in the tree and
+ # .bupm will match up.
+ mode, oid = (GIT_MODE_FILE, dstrepo.write_data(b''))
+
+ if size_augmented or oid != item.oid:
+ wdbc.execute(f'insert into {mapping} (src, dst, mode, size)'
+ ' values (?, ?, ?, ?)',
+ (item.oid, oid, mode, item_size))
+ stack.append_to_current(filen, vfs_mode, mode, oid, item.meta)
+
+def rewrite_branch(srcrepo, src, dstrepo, dst, excludes, workdb, fatal):
+ # Currently, the workdb must always be ready to commit (see finally below)
+ srcref = b'refs/heads/%s' % src
+ dstref = b'refs/heads/%s' % dst
+ if dstrepo.read_ref(dstref) is not None:
+ fatal(f'branch already exists: {path_msg(dst)}')
+ try:
+ split_cfg = hashsplit.configuration(dstrepo.config_get)
+ except ConfigError as ex:
+ fatal(ex)
+ split_trees = dstrepo.config_get(b'bup.split.trees', opttype='bool')
+
+ vfs_branch = vfs.resolve(srcrepo, src)
+ item = vfs_branch[-1][1]
+ commit_oid_name = {
+ c[1].coid: c[0]
+ for c in vfs.contents(srcrepo, item)
+ if isinstance(c[1], vfs.Commit)
+ }
+ commits = list(srcrepo.rev_list(hexlify(item.oid), parse=vfs.parse_rev,
+ format=b'%T %at'))
+ commits.reverse()
+ with closing(workdb.cursor()) as wdbc:
+ try:
+ tablename = 'mapping_to_bits'
+ for k, v in split_cfg.items():
+ tablename += f'_{k}_{v}'
+ workdb.execute(f"create table if not exists {tablename}"
+ ' (src blob primary key,'
+ ' dst blob not null,'
+ ' mode integer,'
+ ' size integer)'
+ ' without rowid')
+
+ # Maintain a stack of information representing the current
+ # location in the archive being constructed.
+ parent = None
+ for commit, (tree, timestamp) in commits:
+ stack = Stack(dstrepo, split_cfg)
+
+ commit_name = commit_oid_name[unhexlify(commit)]
+ log(b'Rewriting /%s/%s/ (%s)...\n'
+ % (src, commit_name, commit[:12]))
+
+ citem = vfs.Commit(meta=vfs.default_dir_mode, oid=tree,
+ coid=commit)
+ for fullname, item in vfs_walk_recursively(srcrepo, dstrepo,
+ citem, excludes,
+ wdbc, tablename):
+ rewrite_item(item, commit_name, fullname, srcrepo, src,
+ dstrepo, split_cfg, stack, wdbc, tablename)
+
+ while len(stack) > 1: # pop all parts above root folder
+ stack.pop()
+ tree = stack.pop() # and the root to get the tree
+
+ commit_it = srcrepo.cat(commit)
+ next(commit_it)
+ ci = git.parse_commit(b''.join(commit_it))
+ author = ci.author_name + b' <' + ci.author_mail + b'>'
+ committer = ci.committer_name + b' <' + ci.committer_mail + b'>'
+ newref = dstrepo.write_commit(tree, parent,
+ author,
+ ci.author_sec,
+ ci.author_offset,
+ committer,
+ ci.committer_sec,
+ ci.committer_offset,
+ ci.message)
+ parent = newref
+ dstrepo.update_ref(dstref, newref, None)
+ finally:
+ workdb.commit() # the workdb is always ready for commit
+
+def main(argv):
+
+ handle_ctrl_c()
+
+ o = options.Options(optspec)
+ opt, flags, extra = o.parse_bytes(argv[1:])
+
+ if len(extra) != 1:
+ o.fatal('no branch name given')
+
+ exclude_rxs = parse_rx_excludes(flags, o.fatal)
+
+ src = argv_bytes(extra[0])
+ if b':' in src:
+ src, dst = src.split(b':', 1)
+ else:
+ dst = src
+ if not valid_save_name(src):
+ o.fatal(f'invalid branch name: {path_msg(src)}')
+ if not valid_save_name(dst):
+ o.fatal(f'invalid branch name: {path_msg(dst)}')
+
+ if opt.remote:
+ opt.remote = argv_bytes(opt.remote)
+
+ if not opt.work_db:
+ o.fatal('--work-db argument is required')
+
+ workdb_conn = sqlite3.connect(opt.work_db)
+ workdb_conn.text_factory = bytes
+
+ # FIXME: support remote source repos ... probably after we unify
+ # the handling?
+ # Leave db commits to the sub-functions doing the work.
+ with repo.LocalRepo(argv_bytes(opt.source)) as srcrepo, \
+ make_repo(derive_repo_addr(remote=opt.remote, die=o.fatal)) as dstrepo, \
+ closing(workdb_conn):
+ rewrite_branch(srcrepo, src, dstrepo, dst, exclude_rxs, workdb_conn,
+ o.fatal)
+
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index ba450e2e..41912323 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -8,7 +8,7 @@ from bup.hashsplit import \
GIT_MODE_FILE,
split_to_blob_or_tree)
from bup.helpers import add_error
-from bup.metadata import MetadataRO
+from bup.metadata import Metadata, MetadataRO
from bup.io import path_msg
from bup.git import shalist_item_sort_key, mangle_name
from bup._helpers import RecordHashSplitter
@@ -123,7 +123,11 @@ class Stack:

def _write_tree(self, dir_meta, items, add_meta=True):
shalist = []
- if add_meta:
+ # This might be False if doing a 'bup rewrite' where the original is
+ # from an old repo without metadata, or created by 'bup split'.
+ meta_ok = all(isinstance(entry.meta, Metadata)
+ for entry in items if entry.mode != GIT_MODE_TREE)
+ if add_meta and meta_ok:
metalist = [(b'', _empty_metadata if dir_meta is None else dir_meta)]
metalist += [(shalist_item_sort_key((entry.mode, entry.name, None)),
entry.meta)
diff --git a/test/ext/test-rewrite b/test/ext/test-rewrite
new file mode 100755
index 00000000..46fb85eb
--- /dev/null
+++ b/test/ext/test-rewrite
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+. wvtest.sh
+. wvtest-bup.sh
+. dev/lib.sh
+
+set -o pipefail
+
+top="$(WVPASS pwd)" || exit $?
+top="$(WVPASS realpath "$top")" || exit $?
+tmpdir="$(WVPASS wvmktempdir)" || exit $?
+export BUP_DIR="$tmpdir/bup"
+export BUP_DIR2="$tmpdir/bup2"
+export BUP_DIR3="$tmpdir/bup3"
+export BUP_DIR4="$tmpdir/bup4"
+export BUP_DIR5="$tmpdir/bup5"
+
+bup() { "$top/bup" "$@"; }
+
+WVPASS cd "$tmpdir"
+
+WVPASS bup init
+WVPASS bup -d "$BUP_DIR2" init
+WVPASS bup -d "$BUP_DIR3" init
+WVPASS bup -d "$BUP_DIR4" init
+
+WVPASS git config -f "$BUP_DIR4/config" bup.split.trees true
+WVPASS git config -f "$BUP_DIR4/config" bup.split.files legacy:14
+
+extract_all() {
+ WVPASSEQ $# 3
+ local repo="$1" ref="$2" dest="$3"
+ saves=$(bup -d "$repo" ls "$ref" | grep ^2) # good for another ~1000 years
+ for save in $saves ; do
+ bup -d "$repo" restore -q -C "$tmpdir/restore/$dest" "$ref/$save"
+ done
+}
+
+compare() {
+ WVPASSEQ $# 4
+ local src="$1" src_ref="$2" dst="$3" dst_ref="$4"
+ WVPASS extract_all "$src" "$src_ref" orig
+ WVPASS extract_all "$dst" "$dst_ref" new
+ touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
+ WVPASS "$top/dev/compare-trees" \
+ "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
+ WVPASS rm -rf "$tmpdir/restore"
+}
+
+WVSTART split and rewrite
+WVPASS bup split -n split < "$top/test/testfile1"
+WVPASS bup -d "$BUP_DIR2" rewrite --work-db "$tmpdir/db" -s "$BUP_DIR" split:test
+WVPASS compare "$BUP_DIR" split "$BUP_DIR2" test
+
+WVSTART make multiple saves
+WVPASS bup index "$top/test/sampledata"
+WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
+WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
+WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
+WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
+
+WVSTART rewrite to different split
+WVPASS bup -d "$BUP_DIR4" rewrite --work-db "$tmpdir/db" -s "$BUP_DIR" save
+WVPASS compare "$BUP_DIR" save "$BUP_DIR4" save
+
+WVSTART "rewrite unchanged (to remote)"
+WVPASS bup rewrite -r ":$BUP_DIR3" --work-db "$tmpdir/db" -s "$BUP_DIR" save
+WVPASS compare "$BUP_DIR" save "$BUP_DIR3" save
+WVPASSEQ "$(GIT_DIR=$BUP_DIR WVPASS git rev-parse save)" \
+ "$(GIT_DIR=$BUP_DIR3 WVPASS git rev-parse save)"
+
+WVSTART rewrite after size not stored
+# now do a hack to save without saving the size in metadata ...
+WVPASS mkdir -p "$tmpdir/mod"
+cat > "$tmpdir/mod/metadata_encode_no_size.py" << EOF
+from bup import metadata, vfs
+
+_orig_encode_common = metadata.Metadata._encode_common
+def _new_encode_common(self):
+ self.size = None
+ print("encoding common with self.size None")
+ return _orig_encode_common(self)
+metadata.Metadata._encode_common = _new_encode_common
+
+vfs._compute_item_size = lambda repo, item: -1122334455
+EOF
+
+bup+()
+{
+ PYTHONPATH="$tmpdir/mod" bup --import-py-module metadata_encode_no_size "$@"
+}
+
+# force a re-save of the testfile1 to get it w/o size
+WVPASS bup index --fake-invalid "$top/test/sampledata/y/testfile1"
+WVPASS bup+ -d "$BUP_DIR" save -n save --strip-path="$top" \
+ "$top/test/sampledata"
+
+# check that we get the "unknown" size out
+WVPASS bup+ -d "$BUP_DIR" ls -l save/latest/test/sampledata/y/testfile1 |
+ WVPASS grep -- -1122334455
+# and that augmentation worked
+WVPASS bup -d "$BUP_DIR" ls -l save/latest/test/sampledata/y/testfile1 |
+ WVPASS grep -- 158664
+
+# now rewrite again - and then the size should be correct even without augmentation
+WVPASS bup -d "$BUP_DIR4" rewrite --work-db "$tmpdir/db" -s "$BUP_DIR" save:save2
+WVPASS bup+ -d "$BUP_DIR4" ls -l save/latest/test/sampledata/y/testfile1 |
+ WVPASS grep -- 158664
+
+# and again for the other kind of splitting
+WVPASS bup -d "$BUP_DIR3" rewrite --work-db "$tmpdir/db" -s "$BUP_DIR" save:save2
+WVPASS bup+ -d "$BUP_DIR3" ls -l save/latest/test/sampledata/y/testfile1 |
+ WVPASS grep -- 158664
+
+WVSTART rewrite with excluded files
+WVPASS bup -d "$BUP_DIR5" init
+WVPASS bup -d "$BUP_DIR5" rewrite --work-db "$tmpdir/db2" -s "$BUP_DIR4" \
+ --exclude-rx ^/test/sampledata/y/ save
+WVPASS extract_all "$BUP_DIR4" "save" "orig"
+WVPASS extract_all "$BUP_DIR5" "save" "new"
+rm -rf "$tmpdir/restore/orig/"*"/test/sampledata/y/"
+# that rm -rf changed timestamps - just ignore them
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/orig/"*"/test/sampledata/"
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"*"/test/sampledata/"
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
+WVPASS "$top/dev/compare-trees" "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
+WVPASS rm -rf "$tmpdir/restore"
+
+WVSTART "rewrite with excluded files (in repo)"
+WVPASS git config -f "$BUP_DIR/config" bup.split.trees true
+WVPASS git config -f "$BUP_DIR/config" bup.split.files legacy:14
+WVPASS bup -d "$BUP_DIR" rewrite --work-db "$tmpdir/db3" -s "$BUP_DIR" \
+ --exclude-rx ^/test/sampledata/y/ save:save-new
+WVPASS extract_all "$BUP_DIR" "save" "orig"
+WVPASS extract_all "$BUP_DIR" "save-new" "new"
+rm -rf "$tmpdir/restore/orig/"*"/test/sampledata/y/"
+# that rm -rf changed timestamps - just ignore them
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/orig/"*"/test/sampledata/"
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"*"/test/sampledata/"
+touch -r "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
+WVPASS "$top/dev/compare-trees" "$tmpdir/restore/orig/" "$tmpdir/restore/new/"
+WVPASS rm -rf "$tmpdir/restore"
+GIT_DIR="$BUP_DIR" WVPASS git ls-tree -r save^ > "$tmpdir/o"
+GIT_DIR="$BUP_DIR" WVPASS git ls-tree -r save-new^ > "$tmpdir/n"
+# FIXME: analyse the diff properly
+diff -u "$tmpdir/o" "$tmpdir/n"
+
+
+WVPASS rm -rf "$tmpdir"
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
The content of blobs and chunked file trees can't be affected by
excludes, so don't drop their remembered rewrites when the excludes
change. Just drop the entries for trees representing real directories
(including split tree nodes).

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/rewrite.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 94fc0424..5b2a6335 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -38,7 +38,6 @@ def _fs_path_from_vfs(path):
return fs
return fs + b'/'

-
def _prep_mapping_table(db, split_cfg):
# This currently only needs to track items that may be split,
# depending on the current repo settings (e.g. files and
@@ -515,9 +514,10 @@ class Rewriter:
if self._current_excludes != excludes:
# Whenever the excludes change, remembered tree
# rewrites may become incorrect. We could just
- # drop the trees if we had an indicator, but for
- # now just drop everything.
- dbc.execute(f'delete from {self._mapping}')
+ # drop the affected trees if we had an indicator,
+ # but for now just drop them all.
+ dbc.execute(f'delete from {self._mapping}'
+ ' where chunked is NULL')
self._current_excludes = excludes

# Relies on the fact that recursion is dfs post-order,
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
The ignore option only makes sense when not rewriting (i.e. for
traditional "get" transfers), and currently only works for
--unnamed. So instead of acting as if the three --missing modes are
interchangeable, make the distinction clear by handling them
separately. This also prepares for dropping --missing MODE in favor of
a simple --repair option.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 122 ++++++++++++++++++++++++++-------------------
lib/bup/repair.py | 12 ++---
lib/bup/rewrite.py | 80 ++++++++++++-----------------
3 files changed, 108 insertions(+), 106 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index c34cede9..c559277f 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -29,7 +29,7 @@ from bup.helpers import \
tty_width)
from bup.io import path_msg
from bup.pwdgrp import userfullname, username
-from bup.repair import MissingConfig, RepairInfo, valid_repair_id
+from bup.repair import RepairConfig, RepairInfo, valid_repair_id
from bup.repo import LocalRepo, make_repo
from bup.rewrite import Rewriter

@@ -134,9 +134,13 @@ class Spec:
method: str
src: bytes
dest: bytes
- missing: Optional[MissingConfig] = None
+ ignore_missing: bool
+ repair: Optional[RepairConfig] = None
excludes: Optional[list[Pattern]] = None
rewriter: Optional[Union[bool, Rewriter]] = None
+ def __post_init__(self):
+ assert not (self.ignore_missing and self.repair), \
+ (self.ignore_missing, self.repair)

def spec_msg(s):
if not s.dest:
@@ -163,20 +167,24 @@ def parse_args(args):
# resolvers), the spec's rewriter will be set to True to indicate
# that it needs the real Rewriter once we have it.
rewrite = None # None means "didn't specify", False means "said no"
- missing = 'fail'
exclude_opts = []
+ ignore_missing = False
+ repair = False
repair_id = None
def make_spec(method, src, dest):
nonlocal repair_id
+ assert not (ignore_missing and repair), (ignore_missing, repair)
excludes = parse_rx_excludes(exclude_opts, misuse)
if excludes and not rewrite:
misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
- if repair_id is None:
- repair_id = str(uuid4()).encode('ascii')
+ rc = None
+ if rewrite:
+ if repair_id is None:
+ repair_id = str(uuid4()).encode('ascii')
+ rc = RepairConfig(id=repair_id, destructive=repair,
+ info=opt.repair_info)
return Spec(method=method, src=src, dest=dest, excludes=excludes,
- rewriter=rewrite,
- missing=MissingConfig(id=repair_id, mode=missing,
- repair_info=opt.repair_info))
+ rewriter=rewrite, ignore_missing=ignore_missing, repair=rc)

pending_method_context = {} # dict to preserve insertion order
remaining = args[1:] # Skip argv[0]
@@ -191,17 +199,27 @@ def parse_args(args):
elif arg == b'--missing':
pending_method_context[arg] = True
(val,), remaining = require_n_args_or_die(1, remaining)
- if val not in (b'fail', b'ignore', b'replace'):
+ if val == b'fail':
+ ignore_missing = False
+ repair = False
+ elif val == b'ignore':
+ if repair:
+ misuse('--ignore-missing and --repair are incompatible')
+ ignore_missing = True
+ elif val == b'replace':
+ if ignore_missing:
+ misuse('--ignore-missing and --repair are incompatible')
+ repair = True
+ else:
misuse(f'--missing must be fail, ignore, or replace, not {val!r}')
- missing = val.decode('ascii')
elif arg == b'--ignore-missing':
+ if repair:
+ misuse('--ignore-missing and --repair are incompatible')
pending_method_context[arg] = True
- missing = 'ignore'
- remaining = remaining[1:]
+ ignore_missing, remaining = True, remaining[1:]
elif arg == b'--no-ignore-missing':
pending_method_context[arg] = True
- missing = 'fail'
- remaining = remaining[1:]
+ ignore_missing, remaining = False, remaining[1:]
elif arg == b'--repair-id':
pending_method_context[arg] = True
(val,), remaining = require_n_args_or_die(1, remaining)
@@ -212,14 +230,17 @@ def parse_args(args):
repair_id = val
elif arg in (b'--ff', b'--append', b'--pick', b'--force-pick',
b'--new-tag', b'--replace', b'--unnamed'):
+ if ignore_missing and arg != b'--unnamed':
+ misuse('currently only --unnamed allows --missing ignore')
(ref,), remaining = require_n_args_or_die(1, remaining)
opt.target_specs.append(make_spec(method=arg[2:].decode('ascii'),
src=ref, dest=None))
pending_method_context = {}
elif arg in (b'--ff:', b'--append:', b'--pick:', b'--force-pick:',
b'--new-tag:', b'--replace:'):
+ if ignore_missing and arg != b'--unnamed':
+ misuse('currently only --unnamed allows --missing ignore')
(ref, dest), remaining = require_n_args_or_die(2, remaining)
- args_after_last_method = remaining
opt.target_specs.append(make_spec(method=arg[2:-1].decode('ascii'),
src=ref, dest=dest))
pending_method_context = {}
@@ -274,8 +295,7 @@ def parse_args(args):

# FIXME: walk_object in in git.py doesn't support opt.verbose. Do we
# need to adjust for that here?
-def get_random_item(name, hash, src_repo, dest_repo, missing):
- assert missing.mode in ('fail', 'ignore'), missing
+def get_random_item(name, hash, src_repo, dest_repo, ignore_missing):
def already_seen(oid):
return dest_repo.exists(unhexlify(oid))
def get_ref(oidx, include_data=False):
@@ -285,7 +305,7 @@ def get_random_item(name, hash, src_repo, dest_repo, missing):
include_data=True, result='item'):
assert isinstance(item, git.WalkItem)
if item.data is False:
- if missing.mode == 'fail':
+ if not ignore_missing:
raise MissingObject(item.oid)
note_error(f'skipping missing source object {item.oid.hex()}\n')
continue
@@ -313,14 +333,14 @@ def get_random_item(name, hash, src_repo, dest_repo, missing):
dest_repo.just_write(item.oid, item.type, item.data)


-def transfer_commit(name, hash, parent, src_repo, dest_repo, missing):
+def transfer_commit(name, hash, parent, src_repo, dest_repo, ignore_missing):
now = time.time()
items = parse_commit(get_cat_data(src_repo.cat(hash), b'commit'))
tree = unhexlify(items.tree)
author = b'%s <%s>' % (items.author_name, items.author_mail)
author_time = (items.author_sec, items.author_offset)
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
- get_random_item(name, hexlify(tree), src_repo, dest_repo, missing)
+ get_random_item(name, hexlify(tree), src_repo, dest_repo, ignore_missing)
c = dest_repo.write_commit(tree, parent,
author, items.author_sec, items.author_offset,
committer, now, None,
@@ -328,13 +348,14 @@ def transfer_commit(name, hash, parent, src_repo, dest_repo, missing):
return c, tree


-def append_commit(src_loc, parent, src_repo, dest_repo, missing, rewriter,
- excludes):
+def append_commit(src_loc, parent, src_repo, dest_repo, rewriter, excludes,
+ repair_config, ignore_missing):
if not rewriter:
assert isinstance(src_loc, (bytes, Loc)), src_loc
oidx = src_loc if isinstance(src_loc, bytes) else hexlify(src_loc.hash)
return transfer_commit(None, # unused
- oidx, parent, src_repo, dest_repo, missing)
+ oidx, parent, src_repo, dest_repo,
+ ignore_missing)

# Friendlier checking was done during resolve_*
assert isinstance(src_loc, Loc), src_loc
@@ -343,18 +364,19 @@ def append_commit(src_loc, parent, src_repo, dest_repo, missing, rewriter,
root, ref, save = path
assert isinstance(save[1], (vfs.Commit, vfs.FakeLink)), path
assert isinstance(ref[1], vfs.RevList), path
- return rewriter.append_save(path, parent, src_repo, dest_repo, missing,
- excludes)
+ return rewriter.append_save(path, parent, src_repo, dest_repo, excludes,
+ repair_config)

-def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, rewriter,
- excludes):
+def append_commits(src_loc, dest_hash, src_repo, dest_repo, rewriter, excludes,
+ repair_config, ignore_missing):
if not rewriter:
commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
commits.reverse()
last_c, tree = dest_hash, None
for commit in commits:
last_c, tree = append_commit(commit, last_c, src_repo, dest_repo,
- missing, rewriter, excludes)
+ rewriter, excludes, repair_config,
+ ignore_missing)
assert tree is not None
return last_c, tree

@@ -382,7 +404,7 @@ def append_commits(src_loc, dest_hash, src_repo, dest_repo, missing, rewriter,
coid = unhexlify(commit)
last_c, tree = rewriter.append_save(path + (entry_for_coid[coid],),
last_c, src_repo, dest_repo,
- missing, excludes)
+ excludes, repair_config)
assert tree is not None
return last_c, tree

@@ -490,7 +512,7 @@ def resolve_src(spec, src_repo, *, allow=None):
misuse('cannot fetch entire repository for %s' % spec_args)
if src.type == 'tags':
misuse('cannot fetch entire /.tag directory for %s' % spec_args)
- if not (src or spec.missing.mode == 'ignore'):
+ if not (src or spec.ignore_missing):
misuse('cannot find source for %s' % spec_args)
debug1('src: %s\n' % loc_desc(src))
return src
@@ -542,8 +564,7 @@ def resolve_branch_dest(spec, src, src_repo, dest_repo):
def resolve_ff(spec, src_repo, dest_repo):
if spec.rewriter:
misuse(f'--{spec.method} cannot rewrite (use --pick or --append)')
- if spec.missing.mode == 'ignore':
- misuse('currently only --unnamed allows --missing ignore')
+ assert not spec.ignore_missing
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if src.type == 'tree':
@@ -564,7 +585,7 @@ def handle_ff(item, src_repo, dest_repo):
if not dest_oidx or dest_oidx in src_repo.rev_list(src_oidx):
# Can fast forward.
get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
- item.spec.missing)
+ item.spec.ignore_missing)
commit_items = parse_commit(get_cat_data(src_repo.cat(src_oidx), b'commit'))
return item.src.hash, unhexlify(commit_items.tree)
misuse('destination is not an ancestor of source for %s'
@@ -574,8 +595,7 @@ def handle_ff(item, src_repo, dest_repo):


def resolve_append(spec, src_repo, dest_repo):
- if spec.missing.mode == 'ignore':
- misuse('currently only --unnamed allows --missing ignore')
+ assert not spec.ignore_missing
src = resolve_src(spec, src_repo)
if src.type not in ('branch', 'save', 'commit', 'tree'):
misuse('source for %s must be a branch, save, commit, or tree, not %s'
@@ -605,7 +625,7 @@ def handle_append(item, src_repo, dest_repo):
if item.spec.rewriter:
misuse(f'rewrite cannot yet promote tree to commit for {spec_msg(item.spec)}')
get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
- item.spec.missing)
+ item.spec.ignore_missing)
parent = item.dest.hash
msg = commit_message(b'bup get', compat.get_argvb())
userline = b'%s <%s@%s>' % (userfullname(), username(), hostname())
@@ -617,13 +637,12 @@ def handle_append(item, src_repo, dest_repo):
if item.dest.hash:
assert item.dest.type in ('branch', 'commit', 'save'), item.dest
return append_commits(item.src, item.dest.hash, src_repo, dest_repo,
- item.spec.missing, item.spec.rewriter,
- item.spec.excludes)
+ item.spec.rewriter, item.spec.excludes,
+ item.spec.repair, item.spec.ignore_missing)


def resolve_pick(spec, src_repo, dest_repo):
- if spec.missing.mode == 'ignore':
- misuse('currently only --unnamed allows --missing ignore')
+ assert not spec.ignore_missing
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if src.type == 'tree':
@@ -663,20 +682,20 @@ def handle_pick(item, src_repo, dest_repo):
# if the dest is committish, make it the parent
if item.dest.type in ('branch', 'commit', 'save'):
return append_commit(item.src, item.dest.hash, src_repo, dest_repo,
- item.spec.missing, item.spec.rewriter,
- item.spec.excludes)
+ item.spec.rewriter, item.spec.excludes,
+ item.spec.repair, item.spec.ignore_missing)
assert item.dest.path.startswith(b'/.tag/'), item.dest
# no parent; either dest is a non-commit tag and we should clobber
# it, or dest doesn't exist.
- return append_commit(item.src, None, src_repo, dest_repo, item.spec.missing,
- item.spec.rewriter, item.spec.excludes)
+ return append_commit(item.src, None, src_repo, dest_repo,
+ item.spec.rewriter, item.spec.excludes,
+ item.spec.repair, item.spec.ignore_missing)


def resolve_new_tag(spec, src_repo, dest_repo):
+ assert not spec.ignore_missing
if spec.rewriter:
misuse(f'--{spec.method} cannot currently rewrite')
- if spec.missing.mode == 'ignore':
- misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if not spec.dest and src.path.startswith(b'/.tag/'):
@@ -698,15 +717,14 @@ def handle_new_tag(item, src_repo, dest_repo):
assert item.spec.method == 'new-tag'
assert item.dest.path.startswith(b'/.tag/')
get_random_item(item.spec.src, hexlify(item.src.hash),
- src_repo, dest_repo, item.spec.missing)
+ src_repo, dest_repo, item.spec.ignore_missing)
return (item.src.hash,)


def resolve_replace(spec, src_repo, dest_repo):
+ assert not spec.ignore_missing
if spec.rewriter:
misuse(f'--{spec.method} cannot currently rewrite')
- if spec.missing.mode == 'ignore':
- misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if not spec.dest:
@@ -732,12 +750,12 @@ def handle_replace(item, src_repo, dest_repo):
assert(item.spec.method == 'replace')
if item.dest.path.startswith(b'/.tag/'):
get_random_item(item.spec.src, hexlify(item.src.hash),
- src_repo, dest_repo, item.spec.missing)
+ src_repo, dest_repo, item.spec.ignore_missing)
return (item.src.hash,)
assert(item.dest.type == 'branch' or not item.dest.type)
src_oidx = hexlify(item.src.hash)
get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
- item.spec.missing)
+ item.spec.ignore_missing)
commit_items = parse_commit(get_cat_data(src_repo.cat(src_oidx), b'commit'))
return item.src.hash, unhexlify(commit_items.tree)

@@ -755,7 +773,7 @@ def resolve_unnamed(spec, src_repo, dest_repo):

def handle_unnamed(item, src_repo, dest_repo):
get_random_item(item.spec.src, hexlify(item.src.hash),
- src_repo, dest_repo, item.spec.missing)
+ src_repo, dest_repo, item.spec.ignore_missing)
return (None,)


diff --git a/lib/bup/repair.py b/lib/bup/repair.py
index 5ee7294e..e9c840db 100644
--- a/lib/bup/repair.py
+++ b/lib/bup/repair.py
@@ -1,6 +1,6 @@

from binascii import hexlify
-from typing import Optional, Union
+from typing import Optional

from bup.compat import dataclass
from bup.io import enc_sh
@@ -37,12 +37,10 @@ class RepairInfo:


@dataclass(slots=True, frozen=True)
-class MissingConfig:
+class RepairConfig:
id: bytes
- mode: Union['fail', 'ignore', 'replace']
- repair_info: Optional[RepairInfo] = None
+ destructive: bool # Allow repairs that lose data (e.g. replacements)
+ info: Optional[RepairInfo] = None
def __post_init__(self):
assert valid_repair_id(self.id)
- assert self.mode in ('fail', 'ignore', 'replace')
- if self.mode == 'replace':
- assert isinstance(self.repair_info, RepairInfo), self.repair_info
+ assert isinstance(self.info, RepairInfo)
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 5b2a6335..ee9952f6 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -23,7 +23,6 @@ from bup.io import path_msg, qsql_id
from bup.metadata import Metadata
from bup.path import xdg_cache
from bup.pwdgrp import userfullname, username
-from bup.repair import MissingConfig
from bup.tree import Stack
from bup.vfs import Item, MissingObject, default_exec_mode, default_file_mode

@@ -96,12 +95,11 @@ def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
return item, dst, None
return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

-def _path_repaired(path, oid, replacement_oid, missing_oid, repair_id,
- repair_info):
- if repair_info.repair_count() == 0:
- log(b'repairs needed, repair-id: %s\n' % repair_id)
+def _path_repaired(path, oid, replacement_oid, missing_oid, repair_config):
+ if repair_config.info.repair_count() == 0:
+ log(b'repairs needed, repair-id: %s\n' % repair_config.id)
fs_path = _fs_path_from_vfs(path)
- repair_info.path_replaced(fs_path, oid, replacement_oid)
+ repair_config.info.path_replaced(fs_path, oid, replacement_oid)
ep = path_msg(fs_path)
log(f'warning: missing object {missing_oid.hex()} for {ep}\n')
log(f'repaired {ep} {oid.hex()} -> {replacement_oid.hex()}\n')
@@ -154,7 +152,7 @@ class IncompleteDir:
missing: bytes # MissingObject oid

def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
- missing, *, _replacement_parents=None):
+ repair_config, *, _replacement_parents=None):
"""Yield information about the paths underneath the given path.

Yield (src_path, replacement_dir), where src_path is a vfs_path
@@ -162,13 +160,11 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
representing a directory that has already been rewritten.

When unreadable objects are encountered, raise MissingObject if
- missing.mode is 'fail', otherwise, for missing.mode 'replace',
- yield an IncompleteDir if the path refers to a missing git tree,
- or split tree with missing split sub-trees.
+ there is no repair_config, otherwise, yield an IncompleteDir if
+ the path refers to a missing git tree, or split tree with missing
+ split sub-trees.

"""
- assert isinstance(missing, MissingConfig), missing
- assert missing.mode in ('fail', 'replace'), missing
if _replacement_parents is None:
_replacement_parents = tuple([])

@@ -177,7 +173,7 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
# drop branch/DATE
fs_path_in_save = _fs_path_from_vfs((path[0],) + path[3:])

- if missing.mode == 'fail':
+ if not repair_config.destructive:
entries = vfs.contents(srcrepo, item)
else:
try:
@@ -229,15 +225,13 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
sub_rpath = _replacement_parents + (conv_item.oid,)
yield from _vfs_walk_dir_recursively(srcrepo, dstrepo, sub_path,
excludes, db, mapping,
- missing,
+ repair_config,
_replacement_parents=sub_rpath)
assert path_w_meta is not None, f'{path_msg(fs_path_in_save)} has no "."'
assert isinstance(path_w_meta[-1][1].meta, (Metadata, int)), path_w_meta
yield path_w_meta, None

-def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing):
- assert isinstance(missing, MissingConfig), missing
- assert missing.mode in ('fail', 'replace'), missing
+def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repair_config):
name, item = path[-1]
assert isinstance(name, bytes)
have_meta = isinstance(item.meta, metadata.Metadata)
@@ -246,18 +240,17 @@ def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing):
target = vfs.readlink(srcrepo, item)
except MissingObject as ex:
if have_meta and item.symlink_target is not None:
- missing.repair_info.note_repair()
+ repair_config.info.note_repair()
pm = path_msg(_fs_path_from_vfs(path))
log(f'warning: symlink data replaced from metadata for {pm}\n')
target = item.symlink_target
else:
- if missing.mode == 'fail':
+ if not repair_config.destructive:
raise ex
- repair_info = missing.repair_info
- replacement = _replacement_symlink_item(dstrepo, item, missing.id,
- ex.oid)
- _path_repaired(path, item.oid, replacement.oid, ex.oid, missing.id,
- repair_info)
+ replacement = _replacement_symlink_item(dstrepo, item,
+ repair_config.id, ex.oid)
+ _path_repaired(path, item.oid, replacement.oid, ex.oid,
+ repair_config)
assert replacement.meta.mode == default_file_mode
stack.append_to_current(name, default_file_mode, default_file_mode,
replacement.oid, replacement.meta)
@@ -298,14 +291,11 @@ def _maybe_exec_mode(git_mode, meta):
return git_mode

def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
- split_cfg, stack, wdbc, mapping, missing):
+ split_cfg, stack, wdbc, mapping, repair_config):
"""Returns either None, or, if a directory was missing, the
directory path components.

"""
- assert isinstance(missing, MissingConfig), missing
- assert missing.mode in ('fail', 'replace'), missing
-
if not isinstance(path, IncompleteDir):
incomplete = None
else:
@@ -332,20 +322,20 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,

if incomplete: # must be a dir
assert replacement_dir is None, replacement_dir
- assert missing.mode == 'replace', missing
+ assert repair_config, repair_config
extend_stack(dir_path[len(stack):-1])
- repair_info = missing.repair_info
# For now, wholesale replacement (no attempt to handle
# partially readable split trees).
rep_item = incomplete.path[-1][1]
- replacement = _replacement_tree_item(dstrepo, rep_item, missing.id,
+ replacement = _replacement_tree_item(dstrepo, rep_item,
+ repair_config.id,
incomplete.missing)
# Must not remember repairs because the repair-id (and so blob
# content) can vary across saves, i.e. get --rewrite-id is a
# contextual argument, and because the type changes from tree
# to blob.
_path_repaired(path, rep_item.oid, replacement.oid, incomplete.missing,
- missing.id, repair_info)
+ repair_config)
assert replacement.meta.mode == default_file_mode, repr(replacement)
stack.append_to_current(path[-1][0],
replacement.meta.mode, GIT_MODE_FILE,
@@ -355,7 +345,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
# First, things that can't be affected by the rewrite
if S_ISLNK(item_mode):
extend_stack(dir_path[len(stack):])
- _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing)
+ _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repair_config)
return
if not S_ISREG(item_mode) and not S_ISDIR(item_mode):
# Everything here (pipes, devices, etc.) should be fully
@@ -390,7 +380,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
# has missing objects when it encounters it a second time (for
# say the second of two saves during an --append), which will
# omit the logging, repair trailers, etc.
- if not missing.mode == 'replace':
+ if not repair_config.destructive:
wdbc.execute(f'insert into {mapping} (src, dst) values (?, ?)',
(item.oid, newtree))
return
@@ -424,17 +414,15 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
except MissingObject as ex:
# For now, wholesale replacement (no attempt to handle
# partially readable split files).
- if missing.mode == 'fail':
+ if not repair_config.destructive:
raise ex
- repair_info = missing.repair_info
- replacement = _replacement_file_item(dstrepo, item, missing.id,
+ replacement = _replacement_file_item(dstrepo, item, repair_config.id,
ex.oid)
# Must not remember repairs because the repair-id (and so blob
# content) can vary across saves, i.e. get --rewrite-id is a
# contextual argument, and because the type may change from
# tree to blob.
- _path_repaired(path, item.oid, replacement.oid, ex.oid, missing.id,
- repair_info)
+ _path_repaired(path, item.oid, replacement.oid, ex.oid, repair_config)
assert replacement.meta.mode == default_file_mode, repr(replacement)
stack.append_to_current(name, replacement.meta.mode, GIT_MODE_FILE,
replacement.oid, replacement.meta)
@@ -484,12 +472,10 @@ class Rewriter:
with self._context:
pass

- def append_save(self, save_path, parent, srcrepo, dstrepo, missing,
- excludes):
+ def append_save(self, save_path, parent, srcrepo, dstrepo, excludes,
+ repair_config):
# Strict for now
assert isinstance(parent, (bytes, type(None))), parent
- assert isinstance(missing, MissingConfig), missing
- assert missing.mode in ('fail', 'replace'), missing
if parent:
assert len(parent) == 20, parent
assert all(isinstance(x, Pattern) for x in excludes)
@@ -527,11 +513,11 @@ class Rewriter:
for path, replacement_dir \
in _vfs_walk_dir_recursively(srcrepo, dstrepo, save_path,
excludes, dbc, self._mapping,
- missing):
+ repair_config):
_rewrite_save_item(save_path, path, replacement_dir,
srcrepo, dstrepo,
self._split_cfg, stack, dbc,
- self._mapping, missing)
+ self._mapping, repair_config)

while len(stack) > 1: # pop all parts above root folder
stack.pop()
@@ -541,8 +527,8 @@ class Rewriter:
ci = parse_commit(get_cat_data(srcrepo.cat(save_oidx), b'commit'))
author = ci.author_name + b' <' + ci.author_mail + b'>'
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
- trailers = missing.repair_info.repair_trailers(missing.id)
- msg = commit_message(ci.message, missing.repair_info.command,
+ trailers = repair_config.info.repair_trailers(repair_config.id)
+ msg = commit_message(ci.message, repair_config.info.command,
trailers)
return (dstrepo.write_commit(tree, parent,
author,
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/vfs.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index b82db022..e5fd1467 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -417,7 +417,9 @@ def item_mode(item):
m = item.meta
if isinstance(m, Metadata):
return m.mode
- return m
+ elif isinstance(m, int):
+ return m
+ raise TypeError(f'not integer or Metadata {m!r}')

def _read_dir_meta(bupm):
# May be empty because save writes unmodified Metadata() entries
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
e.g. empty_metadata

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/metadata.py | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index f7273593..91d34bca 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -1047,13 +1047,16 @@ def summary_bytes(meta, numeric_ids = False, classification = None,
human_readable = False):
"""Return bytes containing the "ls -l" style listing for meta.
Classification may be "all", "type", or None."""
- user_str = group_str = size_or_dev_str = b'?'
- symlink_target = None
- mode_str = b'?' * 10
+ user_str = group_str = b'?'
mtime_str = b'????-??-?? ??:??'
classification_str = b'?'
- if meta:
- name = meta.path
+ if not meta:
+ name = b''
+ mode_str = b'?' * 10
+ symlink_target = None
+ size_or_dev_str = b'?'
+ else:
+ name = meta.path or b''
mode_str = xstat.mode_str(meta.mode).encode('ascii')
symlink_target = meta.symlink_target
if meta.mtime is not None:
@@ -1082,7 +1085,6 @@ def summary_bytes(meta, numeric_ids = False, classification = None,
xstat.classification_str(meta.mode,
classification == 'all').encode()

- name = name or b''
if classification:
name += classification_str
if symlink_target:
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 3 +++
lib/bup/cmd/get.py | 3 +++
test/ext/test-get-excludes | 34 ++++++++++++++++++++++++++++++++++
3 files changed, 40 insertions(+)
create mode 100755 test/ext/test-get-excludes

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index f4127769..ef260845 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -166,6 +166,9 @@ used to help test before/after results.)
(may be repeated). Ignore completely empty lines. Only supported
when rewriting.

+\--no-excludes
+: forget any previous `--exclude-rx` or `--exclude-rx-from` options.
+
-v, \--verbose
: increase verbosity (can be used more than once). With
`-v`, print the name of every item fetched, with `-vv` add
diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 972a8e0c..62d9e7c3 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -61,6 +61,7 @@ argspec = (
('--rewrite', 'rewrite data according to destination repo settings'),
('--exclude-rx REGEX', 'skip paths matching the unanchored regex (may be repeated)'),
('--exclude-rx-from PATH', 'skip --exclude-rx patterns in PATH (may be repeated)'),
+ ('--no-excludes', 'forget any preceding exclude options'),
('--bwlimit BWLIMIT', 'maximum bytes/sec to transmit to server'),
('--missing <fail|ignore|replace>', 'behavior for missing objects (default: fail)'),
('--repair-id ID', 'repair session identifier (default: UUID v4)'),
@@ -218,6 +219,8 @@ def parse_args(args):
elif arg in (b'--exclude-rx', b'--exclude-rx-from'): # handled later
(val,), remaining = require_n_args_or_die(1, remaining)
exclude_opts.append((arg, val))
+ elif arg == b'--no-excludes':
+ exclude_opts, remaining = [], remaining[1:]
elif arg in (b'-0', b'-1', b'-2', b'-3', b'-4', b'-5', b'-6', b'-7',
b'-8', b'-9'):
opt.compress = int(arg[1:])
diff --git a/test/ext/test-get-excludes b/test/ext/test-get-excludes
new file mode 100755
index 00000000..f592234e
--- /dev/null
+++ b/test/ext/test-get-excludes
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+. ./wvtest-bup.sh || exit $?
+. test/lib/btl.sh || exit $?
+
+set -o pipefail
+
+top="$(WVPASS pwd)" || exit $?
+tmpdir="$(WVPASS wvmktempdir)" || exit $?
+
+export BUP_DIR="$tmpdir/bup"
+export GIT_DIR="$tmpdir/bup"
+
+bup() { "$top/bup" "$@"; }
+
+WVPASS cd "$tmpdir"
+WVPASS bup init
+
+WVPASS mkdir -p src/a
+WVPASS echo 1 > src/a/one
+WVPASS echo 2 > src/a/two
+WVPASS echo 3 > src/a/three
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+
+WVSTART '--rewrite --exclude-rx'
+WVPASS bup get --rewrite --exclude-rx 't.*' --pick: src/latest dst
+WVPASSEQ 'one' "$(bup ls dst/latest/a)"
+
+WVSTART '--rewrite --exclude-rx --no-excludes'
+WVPASS bup get --rewrite --exclude-rx 't.*' --no-excludes --pick: src/latest dst
+WVPASSEQ $'one\nthree\ntwo' "$(bup ls dst/latest/a)"
+
+WVPASS cd "$top"

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
test/ext/test-get-missing | 49 +++++++++++++++++++++++++++++++--------
1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/test/ext/test-get-missing b/test/ext/test-get-missing
index d098d8bf..e89aa0b2 100755
--- a/test/ext/test-get-missing
+++ b/test/ext/test-get-missing
@@ -12,40 +12,69 @@ export GIT_DIR="$tmpdir/bup"

bup() { "$top/bup" "$@"; }

+
WVPASS cd "$tmpdir"
WVPASS bup init

-WVPASS mkdir -p src/a
+WVPASS mkdir -p src/a src/b
WVPASS echo 1 > src/a/1
WVPASS echo 2 > src/a/2
WVPASS echo 3 > src/a/3
+WVPASS echo 1 > src/b/1
+WVPASS echo 2 > src/b/2
+WVPASS echo 3 > src/b/3
WVPASS bup index src
WVPASS bup save --strip -n src src

-src_oid="$(git rev-parse src)"
+a_oid="$(git rev-parse src:a)"
+b_oid="$(git rev-parse src:b)"

WVPASS bup -d dest-repo init
-WVPASS bup -d dest-repo get -s bup --unnamed "git:$src_oid"
-WVPASS bup -d dest-repo join "$src_oid" > /dev/null
+WVPASS bup -d dest-repo get -s bup --unnamed "git:$a_oid"
+WVPASS bup -d dest-repo join "$a_oid" > /dev/null
WVPASS rm -rf dest-repo

bupm_oid="$(WVPIPE git ls-tree src:a | WVPASS head -1 | WVPASS btl-ent-oid)" \
|| exit $?
-echo "$bupm_oid" | WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR"
+bupm2_oid="$(WVPIPE git ls-tree src:b | WVPASS head -1 | WVPASS btl-ent-oid)" \
+ || exit $?
+echo -e "$bupm_oid\n$bupm2_oid" \
+ | WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR"

+
+WVSTART 'get incomplete tree without --ignore-missing'
WVPASS bup -d dest-repo init
-WVFAIL bup -d dest-repo get -s bup --unnamed "git:$src_oid" 2>&1 | tee get.log
-# For now...
-WVPASS grep -E 'raise MissingObject' get.log
+WVFAIL bup -d dest-repo get -s bup --unnamed "git:$a_oid" 2>&1 | tee get.log
+WVPASS grep -F 'raise MissingObject' get.log # For now...
+WVFAIL bup -d dest-repo get -s bup --ignore-missing --no-ignore-missing \
+ --unnamed "git:$a_oid" 2>&1 | tee get.log
+WVPASS grep -F 'raise MissingObject' get.log # For now...
WVPASS rm -rf dest-repo

+
+WVSTART 'get incomplete tree with --ignore-missing'
WVPASS bup -d dest-repo init
-WVFAIL bup -d dest-repo get -s bup --ignore-missing --unnamed "git:$src_oid" 2>&1 \
- | tee get.log
+WVEXPRC 2 eval 'bup -d dest-repo get -s bup --ignore-missing ' \
+ '--unnamed "git:$a_oid" 2>&1 | tee get.log'
WVPASSEQ 1 "$(grep -cF "skipping missing source object ${bupm_oid}" get.log)"
WVPASS rm -rf dest-repo


+WVSTART 'multiple incomplete tree gets with differing ignores'
+WVPASS bup -d dest-repo init
+WVEXPRC 2 eval 'bup -d dest-repo get -s bup' \
+ ' --ignore-missing --unnamed "git:$a_oid"'\
+ ' --no-ignore-missing --unnamed "git:$b_oid"'\
+ ' 2>&1 | tee get.log'
+WVPASSEQ 1 "$(grep -cF "skipping missing source object ${bupm_oid}" get.log)"
+log_msg_line="$(WVPASS grep -nF "skipping missing source object ${bupm_oid}" get.log)"
+log_msg_line="${log_msg_line%%:*}"
+WVPASS grep -F 'raise MissingObject' get.log # For now...
+raise_msg_line="$(WVPASS grep -nF "raise MissingObject" get.log)"
+raise_msg_line="${raise_msg_line%%:*}"
+WVPASS test "$log_msg_line" -lt "$raise_msg_line"
+WVPASS rm -rf dest-repo
+

WVPASS cd "$top"

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 89af04bb..f76465e9 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -520,7 +520,7 @@ def resolve_branch_dest(spec, src, src_repo, dest_repo):

def resolve_ff(spec, src_repo, dest_repo):
if spec.rewriter:
- misuse(f'--{spec.method} cannot rewrite (use --pick)')
+ misuse(f'--{spec.method} cannot rewrite (use --pick or --append)')
if spec.missing.mode == 'ignore':
misuse('currently only --unnamed allows --missing ignore')
src = resolve_src(spec, src_repo)
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Since it would be surprising for bup get to ignore the destination
repository settings (bup.split.trees, etc.) when creating new saves,
add a new --rewrite option and refuse to run when the source and
destination configs differ materially and --rewrite hasn't been
specified. Support --no-rewrite to opt out. Only the append and pick
related methods allow rewriting.

Specifying --rewrite always causes rewriting, even if the source and
destination repository settings are the same. This allows rewriting
within the same repository, and rewriting with respect to the current
defaults for things that have changed but don't have configuration
options, like the splitting of .bupm files.

To do the rewriting, rely on the new rewrite code, now moved to
bup.rewrite, and remove the rewrite command since it's equivalent to
a "bup get --append" to a new branch.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 62 +++++++
lib/bup/cmd/get.py | 199 +++++++++++++++++++----
lib/bup/helpers.py | 14 +-
lib/bup/{cmd => }/rewrite.py | 247 +++++++++++-----------------
note/main.md | 5 +
test/ext/test-rewrite | 25 +--
test/ext/test_get.py | 304 ++++++++++++++++++++++++-----------
7 files changed, 557 insertions(+), 299 deletions(-)
rename lib/bup/{cmd => }/rewrite.py (52%)

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index d1e43c4b..44e99914 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -26,8 +26,12 @@ For example:

bup get -s /source/repo --ff foo
bup get -s /source/repo --ff: foo/latest bar
+ bup get -s /source/repo --pick: foo/2010-10-10-101010 bar
bup get -s /source/repo --pick: foo/2010-10-10-101010 .tag/bar

+The behavior of any given METHOD is determined in part by the *ref*
+and *dest* types, i.e. branch, save, tag, etc.
+
As a special case, if *ref* names the "latest" save symlink, then bup
will act exactly as if the save that "latest" points to had been
specified, rather than the "latest" symlink itself, so `bup get
@@ -127,6 +131,45 @@ used to help test before/after results.)
\--print-tags
: for each updated tag, print the new git id.

+\--rewrite, \--no-rewrite
+: rewrite the data according to the destination repository
+ configuration, e.g. its `bup.split.files` and `bup.split.trees`
+ values. Currently, one of these options must be specified whenever
+ the source and destination repository configurations differ in a
+ relevant way, and so far, this option is only supported for
+ appends and picks. Note that while tested, this option is
+ relatively new and so warrants even more caution (see CAUTION
+ above) than `bup get` itself. Please consider validating the
+ results carefully for now.
+
+\--rewrite-db=*path*
+: place the rewrite database at *path*. Re-using an existing
+ database (e.g. after an interruption) can allow the rewrite to
+ resume without repeating expensive operations. By default, a
+ transient database will be placed in TMPDIR and removed on exit.
+
+\--exclude-rx=*pattern*
+: exclude any path matching *pattern*, which must be a Python regular
+ expression (http://docs.python.org/library/re.html). The pattern
+ will be compared against the full path, without anchoring, so
+ "x/y" will match "ox/yard" or "box/yards". To exclude the
+ contents of /tmp, but not the directory itself, use
+ "^/tmp/.". (may be repeated)
+
+ Examples:
+
+ * '/foo$' - exclude any file named foo
+ * '/foo/$' - exclude any directory named foo
+ * '/foo/.' - exclude the content of any directory named foo
+ * '^/tmp/.' - exclude root-level /tmp's content, but not /tmp itself
+
+ Only supported when rewriting.
+
+\--exclude-rx-from=*filename*
+: read --exclude-rx patterns from *filename*, one pattern per line
+ (may be repeated). Ignore completely empty lines. Only supported
+ when rewriting.
+
-v, \--verbose
: increase verbosity (can be used more than once). With
`-v`, print the name of every item fetched, with `-vv` add
@@ -187,6 +230,25 @@ used to help test before/after results.)
# Append only the /home directory from archives/latest to only-home.
$ bup get -s "$BUP_DIR" --append: archives/latest/home only-home

+ # Resplit (rewrite) the archives branch. Note that, done all at
+ # once, this may require additional space up to the size of the
+ # archives branch. The pick methods can do the rewriting more
+ # selectively or incrementally. (Assume BUP_DIR has no split
+ # settings.)
+ #
+ $ bup config bup.split.trees true
+ $ bup config bup.split.files legacy:16
+ $ bup get --append: archives archives-resplit
+ #
+ # Check that archives-resplit looks OK, perhaps via trial
+ # restores, joining it, etc. (see CAUTION above), and once
+ # satisfied, perhaps...
+ #
+ $ bup rm archives
+ $ bup gc
+ $ git --git-dir "$BUP_DIR" branch -m archives-resplit archives
+
+
# SEE ALSO

`bup-on`(1), `bup-tag`(1), `ssh_config`(5)
diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 7a6c8b64..265bd59c 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -1,10 +1,11 @@

from binascii import hexlify, unhexlify
from collections import namedtuple
+from contextlib import ExitStack, closing
from stat import S_ISDIR
-import os, sys, textwrap, time
+import os, sys, textwrap, sqlite3, time

-from bup import compat, git, client, vfs
+from bup import client, compat, git, hashsplit, rewrite, vfs
from bup.commit import commit_message
from bup.compat import argv_bytes
from bup.config import derive_repo_addr
@@ -17,19 +18,24 @@ from bup.helpers import \
log,
note_error,
parse_num,
+ parse_rx_excludes,
+ temp_dir,
tty_width)
from bup.io import path_msg
from bup.pwdgrp import userfullname, username
from bup.repo import LocalRepo, make_repo

+
argspec = (
"usage: bup get [-s source] [-r remote] (<--ff|--append|...> REF [DEST])...",

- """Transfer data from a source repository to a destination repository
- according to the methods specified (--ff, --ff:, --append, etc.).
- Both repositories default to BUP_DIR. A remote destination may be
- specified with -r, and data may be pulled from a remote repository
- with the related "bup on HOST get ..." command.""",
+ """Transfer data from a source repository to a destination
+ repository according to the methods specified (--ff, --ff:,
+ --append, etc.). Both repositories default to BUP_DIR. A remote
+ destination may be specified with -r, and data may be pulled from
+ a remote repository with the related "bup on HOST get ..."
+ command. The --exclude-rx and --exclude-rx-from options currently
+ only apply to rewrites.""",

('optional arguments:',
(('-h, --help', 'show this help message and exit'),
@@ -43,6 +49,10 @@ argspec = (
('-t --print-trees', 'output a tree id for each ref set'),
('-c, --print-commits', 'output a commit id for each ref set'),
('--print-tags', 'output an id for each tag'),
+ ('--rewrite', 'rewrite data according to destination repo settings'),
+ ('--rewrite-db PATH', 'transient rewrite database (in TMPDIR by default)'),
+ ('--exclude-rx REGEX', 'skip paths matching the unanchored regex (may be repeated)'),
+ ('--exclude-rx-from PATH', 'skip --exclude-rx patterns in PATH (may be repeated)'),
('--bwlimit BWLIMIT', 'maximum bytes/sec to transmit to server'),
('-0, -1, -2, -3, -4, -5, -6, -7, -8, -9, --compress LEVEL',
'set compression LEVEL (default: 1)'))),
@@ -126,9 +136,12 @@ def parse_args(args):
opt.bwlimit = None
opt.compress = None
opt.ignore_missing = False
+ opt.rewrite = None # None means "didn't specify"
+ opt.rewrite_db = None
opt.source = opt.remote = None
opt.target_specs = []

+ exclude_opts = []
remaining = args[1:] # Skip argv[0]
while remaining:
arg = remaining[0]
@@ -164,6 +177,15 @@ def parse_args(args):
opt.print_trees, remaining = True, remaining[1:]
elif arg == b'--print-tags':
opt.print_tags, remaining = True, remaining[1:]
+ elif arg == b'--rewrite':
+ opt.rewrite, remaining = True, remaining[1:]
+ elif arg == b'--no-rewrite':
+ opt.rewrite, remaining = False, remaining[1:]
+ elif arg == b'--rewrite-db':
+ (opt.rewrite_db,), remaining = require_n_args_or_die(1, remaining)
+ elif arg in (b'--exclude-rx', b'--exclude-rx-from'): # handled later
+ (val,), remaining = require_n_args_or_die(1, remaining)
+ exclude_opts.append((arg, val))
elif arg in (b'-0', b'-1', b'-2', b'-3', b'-4', b'-5', b'-6', b'-7',
b'-8', b'-9'):
opt.compress = int(arg[1:])
@@ -183,6 +205,9 @@ def parse_args(args):
continue
else:
misuse()
+ opt.exclude_rxs = parse_rx_excludes(exclude_opts, misuse)
+ if opt.exclude_rxs and not opt.rewrite:
+ misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
for target in opt.target_specs:
if opt.ignore_missing and target.method != 'unnamed':
misuse('currently only --unnamed allows --ignore-missing')
@@ -230,7 +255,7 @@ def get_random_item(name, hash, src_repo, dest_repo, opt):
dest_repo.just_write(item.oid, item.type, item.data)


-def append_commit(name, hash, parent, src_repo, dest_repo, opt):
+def transfer_commit(name, hash, parent, src_repo, dest_repo, opt):
now = time.time()
items = parse_commit(get_cat_data(src_repo.cat(hash), b'commit'))
tree = unhexlify(items.tree)
@@ -245,12 +270,67 @@ def append_commit(name, hash, parent, src_repo, dest_repo, opt):
return c, tree


-def append_commits(commits, src_name, dest_hash, src_repo, dest_repo, opt):
+def append_commit(src_loc, parent, src_repo, dest_repo, opt):
+ if not opt.rewrite:
+ assert isinstance(src_loc, (bytes, Loc))
+ oidx = src_loc if isinstance(src_loc, bytes) else hexlify(src_loc.hash)
+ return transfer_commit(None, # unused
+ oidx, parent, src_repo, dest_repo, opt)
+
+ # Friendlier checking was done during resolve_*
+ assert isinstance(src_loc, Loc), src_loc
+ path = src_loc.vfs_path
+ assert len(path) == 3, path
+ root, ref, save = path
+ assert isinstance(save[1], (vfs.Commit, vfs.FakeLink)), path
+ assert isinstance(ref[1], vfs.RevList), path
+ return rewrite.append_save(path, parent, src_repo, dest_repo,
+ opt.dest_split_cfg, opt.exclude_rxs,
+ # FIXME: ...
+ opt.rewrite_db_conn,
+ opt.rewrite_db_mapping)
+
+
+def append_commits(src_loc, dest_hash, src_repo, dest_repo, opt):
+ if not opt.rewrite:
+ commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
+ commits.reverse()
+ last_c, tree = dest_hash, None
+ for commit in commits:
+ last_c, tree = append_commit(commit, last_c, src_repo, dest_repo,
+ opt)
+ assert tree is not None
+ return last_c, tree
+
+ # Friendlier checking was done during resolve_*
+ assert isinstance(src_loc, Loc), src_loc
+ assert src_loc.type in ('branch', 'commit', 'save'), src_loc
+ path = src_loc.vfs_path
+ assert len(path) == 2, path
+ root, ref = path
+ assert isinstance(ref[1], vfs.RevList), ref[1]
+
+ # We need both the VFS name (YYYY-MM-DD[-N]), and the rev-list
+ # order, so for now, cross-reference rev-list with contents().
+ entry_for_coid = {}
+ for entry in vfs.contents(src_repo, path[1][1]):
+ if entry[0] in (b'.', b'..', b'latest'):
+ continue
+ entry_for_coid[entry[1].coid] = entry
+
+ commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
+ commits.reverse()
+
last_c, tree = dest_hash, None
for commit in commits:
- last_c, tree = append_commit(src_name, commit, last_c,
- src_repo, dest_repo, opt)
- assert(tree is not None)
+ coid = unhexlify(commit)
+ last_c, tree = rewrite.append_save(path + (entry_for_coid[coid],),
+ last_c, src_repo, dest_repo,
+ opt.dest_split_cfg, opt.exclude_rxs,
+ # FIXME: ...
+ opt.rewrite_db_conn,
+ opt.rewrite_db_mapping)
+ assert tree is not None
return last_c, tree


@@ -266,14 +346,15 @@ def find_git_item(ref, repo):
return GitLoc(ref, unhexlify(oidx), typ)


-Loc = namedtuple('Loc', ['type', 'hash', 'path'])
-default_loc = Loc(None, None, None)
+Loc = namedtuple('Loc', ['type', 'hash', 'path', 'vfs_path'])
+default_loc = Loc(None, None, None, None)

def find_vfs_item(name, repo):
res = repo.resolve(name, follow=False, want_meta=False)
leaf_name, leaf_item = res[-1]
if not leaf_item:
return None
+ vfs_path = res
kind = type(leaf_item)
if kind == vfs.Root:
kind = 'root'
@@ -309,11 +390,11 @@ def find_vfs_item(name, repo):
% (path_msg(name), res))
path = b'/'.join(name for name, item in res)
if hasattr(leaf_item, 'coid'):
- result = Loc(type=kind, hash=leaf_item.coid, path=path)
+ result = Loc(type=kind, hash=leaf_item.coid, path=path, vfs_path=vfs_path)
elif hasattr(leaf_item, 'oid'):
- result = Loc(type=kind, hash=leaf_item.oid, path=path)
+ result = Loc(type=kind, hash=leaf_item.oid, path=path, vfs_path=vfs_path)
else:
- result = Loc(type=kind, hash=None, path=path)
+ result = Loc(type=kind, hash=None, path=path, vfs_path=vfs_path)
return result


@@ -434,12 +515,24 @@ def handle_ff(item, src_repo, dest_repo, opt):
return None


-def resolve_append(spec, src_repo, dest_repo):
+def resolve_append(spec, src_repo, dest_repo, *, rewrite):
src = resolve_src(spec, src_repo)
if src.type not in ('branch', 'save', 'commit', 'tree'):
misuse('source for %s must be a branch, save, commit, or tree, not %s'
% (spec_msg(spec), src.type))
spec, dest = resolve_branch_dest(spec, src, src_repo, dest_repo)
+ if rewrite:
+ def vpm(path):
+ return path_msg(b"/".join(x[0] for x in src_path))
+ if not isinstance(src, Loc):
+ misuse(f'cannot currently rewrite git location {src}')
+ src_path = src.vfs_path
+ if len(src_path) != 2:
+ misuse(f'cannot append {vpm(src_path)}')
+ root, src_ref = src_path
+ if not isinstance(src_ref[1], vfs.RevList):
+ misuse(f'cannot append {vpm(src_path)} saves'
+ f' ({path_msg(src_ref[0])} is a {type(src_ref[1])})')
return Target(spec=spec, src=src, dest=dest)


@@ -447,8 +540,10 @@ def handle_append(item, src_repo, dest_repo, opt):
assert item.spec.method == 'append'
assert item.src.type in ('branch', 'save', 'commit', 'tree')
assert item.dest.type == 'branch' or not item.dest.type
- src_oidx = hexlify(item.src.hash)
if item.src.type == 'tree':
+ src_oidx = hexlify(item.src.hash)
+ if opt.rewrite:
+ misuse(f'rewrite cannot yet promote tree to commit for {spec_msg(item.spec)}')
get_random_item(item.spec.src, src_oidx, src_repo, dest_repo, opt)
parent = item.dest.hash
msg = commit_message(b'bup get', compat.get_argvb())
@@ -458,22 +553,19 @@ def handle_append(item, src_repo, dest_repo, opt):
userline, now, None,
userline, now, None, msg)
return commit, item.src.hash
- commits = list(src_repo.rev_list(src_oidx))
- commits.reverse()
if item.dest.hash:
assert item.dest.type in ('branch', 'commit', 'save'), item.dest
- return append_commits(commits, item.spec.src, item.dest.hash,
- src_repo, dest_repo, opt)
+ return append_commits(item.src, item.dest.hash, src_repo, dest_repo, opt)


-def resolve_pick(spec, src_repo, dest_repo):
+def resolve_pick(spec, src_repo, dest_repo, *, rewrite):
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if src.type == 'tree':
misuse('%s is impossible; can only --append a tree' % spec_args)
if src.type not in ('commit', 'save'):
misuse('%s impossible; can only pick a commit or save, not %s'
- % (spec_args, src.type))
+ % (spec_args, src.type))
if not spec.dest:
if src.path.startswith(b'/.tag/'):
spec = spec._replace(dest=spec.src)
@@ -481,6 +573,9 @@ def resolve_pick(spec, src_repo, dest_repo):
spec = spec._replace(dest=get_save_branch(src_repo, spec.src))
if not spec.dest:
misuse('no destination provided for %s' % spec_args)
+ if rewrite:
+ if src.type != 'save':
+ misuse(f'cannot currently --rewrite a {src.type}')
dest = find_vfs_item(spec.dest, dest_repo)
if not dest:
cp = validate_vfs_path(cleanup_vfs_path(spec.dest), spec)
@@ -499,16 +594,15 @@ def resolve_pick(spec, src_repo, dest_repo):
def handle_pick(item, src_repo, dest_repo, opt):
assert item.spec.method in ('pick', 'force-pick')
assert item.src.type in ('save', 'commit')
- src_oidx = hexlify(item.src.hash)
if item.dest.hash:
# if the dest is committish, make it the parent
if item.dest.type in ('branch', 'commit', 'save'):
- return append_commit(item.spec.src, src_oidx, item.dest.hash,
- src_repo, dest_repo, opt)
+ return append_commit(item.src, item.dest.hash, src_repo, dest_repo,
+ opt)
assert item.dest.path.startswith(b'/.tag/'), item.dest
# no parent; either dest is a non-commit tag and we should clobber
# it, or dest doesn't exist.
- return append_commit(item.spec.src, src_oidx, None, src_repo, dest_repo, opt)
+ return append_commit(item.src, None, src_repo, dest_repo, opt)


def resolve_new_tag(spec, src_repo, dest_repo):
@@ -587,7 +681,7 @@ def handle_unnamed(item, src_repo, dest_repo, opt):
return (None,)


-def resolve_targets(specs, src_repo, dest_repo, *, ignore_missing):
+def resolve_targets(specs, src_repo, dest_repo, *, ignore_missing, rewrite):
resolved_items = []
common_args = src_repo, dest_repo
for spec in specs:
@@ -595,9 +689,11 @@ def resolve_targets(specs, src_repo, dest_repo, *, ignore_missing):
if spec.method == 'ff':
resolved_items.append(resolve_ff(spec, *common_args))
elif spec.method == 'append':
- resolved_items.append(resolve_append(spec, *common_args))
+ resolved_items.append(resolve_append(spec, *common_args,
+ rewrite=rewrite))
elif spec.method in ('pick', 'force-pick'):
- resolved_items.append(resolve_pick(spec, *common_args))
+ resolved_items.append(resolve_pick(spec, *common_args,
+ rewrite=rewrite))
elif spec.method == 'new-tag':
resolved_items.append(resolve_new_tag(spec, *common_args))
elif spec.method == 'replace':
@@ -652,16 +748,49 @@ def main(argv):
if opt.bwlimit:
client.bwlimit = parse_num(opt.bwlimit)

- with make_repo(derive_repo_addr(remote=opt.remote, die=misuse),
+ with LocalRepo(repo_dir=opt.source) as src_repo, \
+ make_repo(derive_repo_addr(remote=opt.remote, die=misuse),
compression_level=opt.compress) as dest_repo:
- with LocalRepo(repo_dir=opt.source) as src_repo:
+
+ src_split_cfg = hashsplit.configuration(src_repo.config_get)
+ opt.dest_split_cfg = hashsplit.configuration(dest_repo.config_get)
+
+ if src_split_cfg != opt.dest_split_cfg and opt.rewrite is None:
+ misuse('repository configs differ; specify --rewrite or --no-rewrite')
+
+ ctx = ExitStack()
+ if opt.rewrite:
+ if not opt.rewrite_db:
+ rwdb_tmpdir = ctx.enter_context(temp_dir(prefix='bup-rewrite-'))
+ opt.rewrite_db = f'{rwdb_tmpdir}/db'
+ rwdb_conn = sqlite3.connect(opt.rewrite_db)
+ rwdb_conn.text_factory = bytes
+ ctx.enter_context(closing(rwdb_conn))
+ opt.rewrite_db_conn = rwdb_conn # FIXME: ...
+ with closing(rwdb_conn.cursor()) as rwdb_cur:
+ opt.rewrite_db_mapping = \
+ rewrite.prep_mapping_table(rwdb_cur, opt.dest_split_cfg)
+
+ with ctx:
+
# Resolve and validate all sources and destinations,
# implicit or explicit, and do it up-front, so we can
# fail before we start writing (for any obviously
# broken cases).
target_items = resolve_targets(opt.target_specs,
src_repo, dest_repo,
- ignore_missing=opt.ignore_missing)
+ ignore_missing=opt.ignore_missing,
+ rewrite=opt.rewrite)
+ if opt.rewrite:
+ for item in target_items:
+ if item.spec.method in ('append', 'force-pick', 'pick'):
+ continue
+ elif item.spec.method == 'ff':
+ misuse(f'--ff cannot rewrite (use --pick)')
+ elif item.spec.method in ('new-tag', 'replace', 'unnamed'):
+ misuse(f'--{item.spec.method} cannot currently rewrite')
+ else:
+ assert False, f'unexpected method {item.spec.method}'

updated_refs = {} # ref_name -> (original_ref, tip_commit(bin))
no_ref_info = (None, None)
diff --git a/lib/bup/helpers.py b/lib/bup/helpers.py
index c81a1dfc..9b6123af 100644
--- a/lib/bup/helpers.py
+++ b/lib/bup/helpers.py
@@ -4,7 +4,7 @@ from collections import namedtuple
from contextlib import ExitStack, nullcontext
from ctypes import sizeof, c_void_p
from math import floor
-from os import environ
+from os import environ, fsencode
from random import SystemRandom
from subprocess import PIPE, Popen
from tempfile import mkdtemp
@@ -1012,13 +1012,17 @@ def parse_rx_excludes(options, fatal):
excluded_patterns = []

for flag in options:
- (option, parameter) = flag
- if option == '--exclude-rx':
+ option, parameter = flag
+ if isinstance(option, str):
+ option = fsencode(option)
+ if isinstance(parameter, str):
+ parameter = fsencode(parameter)
+ if option == b'--exclude-rx':
try:
- excluded_patterns.append(re.compile(argv_bytes(parameter)))
+ excluded_patterns.append(re.compile(parameter))
except re.error as ex:
fatal('invalid --exclude-rx pattern (%r): %s' % (parameter, ex))
- elif option == '--exclude-rx-from':
+ elif option == b'--exclude-rx-from':
try:
f = open(resolve_parent(parameter), 'rb')
except IOError as e:
diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/rewrite.py
similarity index 52%
rename from lib/bup/cmd/rewrite.py
rename to lib/bup/rewrite.py
index 1a6f7086..e65fa506 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -1,36 +1,25 @@

-from binascii import hexlify, unhexlify
+from binascii import hexlify
from contextlib import closing
from itertools import chain
+from os.path import join as pj
from stat import S_ISDIR, S_ISLNK, S_ISREG
import os
-import sqlite3

-from bup import hashsplit, git, options, repo, metadata, vfs
-from bup.compat import argv_bytes
+from bup import hashsplit, metadata, vfs
+from bup.git import get_cat_data, parse_commit
from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK, GIT_MODE_TREE
-from bup.helpers import \
- (handle_ctrl_c, path_components,
- valid_save_name, log,
- parse_rx_excludes,
- qprogress,
- reprogress,
- should_rx_exclude_path)
-from bup.io import path_msg, qsql_id
+from bup.helpers import path_components, should_rx_exclude_path
+from bup.io import qsql_id
from bup.tree import Stack
-from bup.repo import make_repo
-from bup.config import derive_repo_addr, ConfigError


-optspec = """
-bup rewrite -s srcrepo <branch-name>
---
-s,source= source repository
-r,remote= remote destination repository
-work-db= work database filename (required, can be deleted after running)
-exclude-rx= skip paths matching the unanchored regex (may be repeated)
-exclude-rx-from= skip --exclude-rx patterns in file (may be repeated)
-"""
+def _fs_path_from_vfs(path):
+ fs = b'/'.join(x[0] for x in path)
+ if not S_ISDIR(vfs.item_mode(path[-1][1])):
+ return fs
+ return fs + b'/'
+

def prep_mapping_table(db, split_cfg):
# This currently only needs to track items that may be split,
@@ -88,25 +77,30 @@ def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
return item, dst, None
return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

-def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
- fullname=b''):
- for name, item in vfs.contents(srcrepo, vfs_item):
+def vfs_walk_recursively(srcrepo, dstrepo, path, excludes, db, mapping):
+ item = path[-1][1]
+ assert len(path) >= 3
+ # drop branch/DATE
+ fs_path_in_save = _fs_path_from_vfs((path[0],) + path[3:])
+ for entry in vfs.contents(srcrepo, item):
+ name, sub_item = entry
+ sub_path = path + (entry,)
if name in (b'.', b'..'):
continue
- itemname = fullname + b'/' + name
- check_name = itemname + (b'/' if S_ISDIR(vfs.item_mode(item)) else b'')
- if should_rx_exclude_path(check_name, excludes):
+ sub_fs_path_in_save = pj(fs_path_in_save, name)
+ if S_ISDIR(vfs.item_mode(sub_item)):
+ sub_fs_path_in_save += b'/'
+ if should_rx_exclude_path(sub_fs_path_in_save, excludes):
continue
- if S_ISDIR(vfs.item_mode(item)):
- item, oid, _ = previous_conversion(dstrepo, item, True, db, mapping)
+ if S_ISDIR(vfs.item_mode(sub_item)):
+ conv_item, oid, _ = \
+ previous_conversion(dstrepo, sub_item, True, db, mapping)
+ if conv_item is not sub_item:
+ sub_path = sub_path[:-1] + ((sub_path[-1][0], conv_item),)
if oid is None:
- yield from vfs_walk_recursively(srcrepo, dstrepo, item,
- excludes, db, mapping,
- fullname=itemname)
- # and the dir itself
- yield itemname + b'/', item
- else:
- yield itemname, item
+ yield from vfs_walk_recursively(srcrepo, dstrepo, sub_path,
+ excludes, db, mapping)
+ yield sub_path

def rewrite_link(item, item_mode, name, srcrepo, dstrepo, stack):
assert isinstance(name, bytes)
@@ -121,9 +115,12 @@ def rewrite_link(item, item_mode, name, srcrepo, dstrepo, stack):
assert item.meta.size == len(item.meta.symlink_target)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

-def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
- stack, wdbc, mapping):
- dirn, filen = os.path.split(fullname)
+def rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg, stack, wdbc,
+ mapping):
+ # save_path is the vfs path to the save ref, e.g. to branch/DATE
+ fs_path = _fs_path_from_vfs(path[3:]) # not including /branch/DATE
+ assert not fs_path.startswith(b'/') # because resolve(parent=...)
+ dirn, filen = os.path.split(b'/' + fs_path)
assert dirn.startswith(b'/')
dirp = path_components(dirn)

@@ -132,14 +129,21 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
stack.pop()

# If switching to a new sub-tree, start a new sub-tree.
+ comp_parent = None
for path_component in dirp[len(stack):]:
- dir_name, fs_path = path_component
-
- dir_item = vfs.resolve(srcrepo, src + b'/' + commit_name + b'/' + fs_path)
- meta = dir_item[-1][1].meta
+ comp_name, comp_path = path_component
+ if comp_parent:
+ dir_res = vfs.resolve(srcrepo, comp_name, parent=comp_parent)
+ else:
+ full_comp_path = b'/'.join([x[0] for x in save_path]) + comp_path
+ dir_res = vfs.resolve(srcrepo, full_comp_path)
+ meta = dir_res[-1][1].meta
if not isinstance(meta, metadata.Metadata):
meta = None
- stack.push(dir_name, meta)
+ stack.push(comp_name, meta)
+ comp_parent = dir_res
+
+ item = path[-1][1]

# First, things that can't be affected by the rewrite
item_mode = vfs.item_mode(item)
@@ -208,118 +212,51 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
(item.oid, oid, chunked, item_size))
stack.append_to_current(filen, item_mode, git_mode, oid, item.meta)

-def rewrite_branch(srcrepo, src, dstrepo, dst, excludes, workdb, fatal):
- # Currently, the workdb must always be ready to commit (see finally below)
- srcref = b'refs/heads/%s' % src
- dstref = b'refs/heads/%s' % dst
- if dstrepo.read_ref(dstref) is not None:
- fatal(f'branch already exists: {path_msg(dst)}')
- try:
- split_cfg = hashsplit.configuration(dstrepo.config_get)
- except ConfigError as ex:
- fatal(ex)
- split_trees = dstrepo.config_get(b'bup.split.trees', opttype='bool')

- vfs_branch = vfs.resolve(srcrepo, src)
- item = vfs_branch[-1][1]
- if not item:
- fatal(f'cannot access {path_msg(src)} in source\n')
- commit_oid_name = {
- c[1].coid: c[0]
- for c in vfs.contents(srcrepo, item)
- if isinstance(c[1], vfs.Commit)
- }
- commits = list(srcrepo.rev_list(hexlify(item.oid), parse=vfs.parse_rev,
- format=b'%T %at'))
- commits.reverse()
- with closing(workdb.cursor()) as wdbc:
+def append_save(save_path, parent, srcrepo, dstrepo, split_cfg,
+ excludes, workdb, mapping):
+ # Strict for now
+ assert isinstance(parent, (bytes, type(None))), parent
+ if parent:
+ assert len(parent) == 20, parent
+ assert len(save_path) == 3, (len(save_path), save_path)
+ assert isinstance(save_path[1][1], vfs.RevList)
+ leaf_name, leaf_item = save_path[2]
+ if isinstance(leaf_item, vfs.FakeLink):
+ # For now, vfs.contents() does not resolve the one FakeLink
+ assert leaf_name == b'latest', save_path
+ res = srcrepo.resolve(leaf_item.target, parent=save_path[:-1],
+ follow=False, want_meta=False)
+ leaf_name, leaf_item = res[-1]
+ save_path = res
+ assert isinstance(leaf_item, vfs.Commit), leaf_item
+ # Currently, the workdb must always be ready to commit (see finally below)
+ with closing(workdb.cursor()) as dbc:
try:
- mapping = prep_mapping_table(wdbc, split_cfg)
-
# Maintain a stack of information representing the current
# location in the archive being constructed.
- parent = None
- i, n = 0, len(commits)
- for commit, (tree, timestamp) in commits:
- i += 1
- stack = Stack(dstrepo, split_cfg)
-
- commit_name = commit_oid_name[unhexlify(commit)]
- pm = f'{path_msg(src)}/{path_msg(commit_name)}'
- orig_oidm = commit[:12].decode("ascii")
- qprogress(f'{i}/{n} {orig_oidm} {pm}\r')
-
- citem = vfs.Commit(meta=vfs.default_dir_mode, oid=tree,
- coid=commit)
- for fullname, item in vfs_walk_recursively(srcrepo, dstrepo,
- citem, excludes,
- wdbc, mapping):
- rewrite_item(item, commit_name, fullname, srcrepo, src,
- dstrepo, split_cfg, stack, wdbc, mapping)
-
- while len(stack) > 1: # pop all parts above root folder
- stack.pop()
- tree = stack.pop() # and the root to get the tree
-
- commit_it = srcrepo.cat(commit)
- next(commit_it)
- ci = git.parse_commit(b''.join(commit_it))
- author = ci.author_name + b' <' + ci.author_mail + b'>'
- committer = ci.committer_name + b' <' + ci.committer_mail + b'>'
- newref = dstrepo.write_commit(tree, parent,
- author,
- ci.author_sec,
- ci.author_offset,
- committer,
- ci.committer_sec,
- ci.committer_offset,
- ci.message)
- parent = newref
- new_oidm = newref.hex()[:12]
- log(f'{orig_oidm} -> {new_oidm} {pm}\n')
- reprogress()
-
- dstrepo.update_ref(dstref, newref, None)
+ stack = Stack(dstrepo, split_cfg)
+ for path in vfs_walk_recursively(srcrepo, dstrepo, save_path,
+ excludes, dbc, mapping):
+ rewrite_save_item(save_path, path, srcrepo, dstrepo, split_cfg,
+ stack, dbc, mapping)
+
+ while len(stack) > 1: # pop all parts above root folder
+ stack.pop()
+ tree = stack.pop() # and the root to get the tree
+
+ save_oidx = hexlify(save_path[2][1].coid)
+ ci = parse_commit(get_cat_data(srcrepo.cat(save_oidx), b'commit'))
+ author = ci.author_name + b' <' + ci.author_mail + b'>'
+ committer = ci.committer_name + b' <' + ci.committer_mail + b'>'
+ return (dstrepo.write_commit(tree, parent,
+ author,
+ ci.author_sec,
+ ci.author_offset,
+ committer,
+ ci.committer_sec,
+ ci.committer_offset,
+ ci.message),
+ tree)
finally:
workdb.commit() # the workdb is always ready for commit
-
-def main(argv):
-
- handle_ctrl_c()
-
- o = options.Options(optspec)
- opt, flags, extra = o.parse_bytes(argv[1:])
-
- if len(extra) != 1:
- o.fatal('no branch name given')
-
- exclude_rxs = parse_rx_excludes(flags, o.fatal)
-
- src = argv_bytes(extra[0])
- if b':' in src:
- src, dst = src.split(b':', 1)
- else:
- dst = src
- if not valid_save_name(src):
- o.fatal(f'invalid branch name: {path_msg(src)}')
- if not valid_save_name(dst):
- o.fatal(f'invalid branch name: {path_msg(dst)}')
-
- if opt.remote:
- opt.remote = argv_bytes(opt.remote)
-
- if not opt.work_db:
- o.fatal('--work-db argument is required')
-
- workdb_conn = sqlite3.connect(opt.work_db)
- workdb_conn.text_factory = bytes
-
- # FIXME: support remote source repos ... probably after we unify
- # the handling?
- # Leave db commits to the sub-functions doing the work.
- with repo.LocalRepo(argv_bytes(opt.source)) as srcrepo, \
- make_repo(derive_repo_addr(remote=opt.remote, die=o.fatal)) as dstrepo, \
- closing(workdb_conn):
- rewrite_branch(srcrepo, src, dstrepo, dst, exclude_rxs, workdb_conn,
- o.fatal)
-
diff --git a/note/main.md b/note/main.md
index 3b7732d0..f750891c 100644
--- a/note/main.md
+++ b/note/main.md
@@ -101,6 +101,11 @@ General
when large directories change (e.g. large active Maildirs). See
`bup-config`(5) for additional information.

+* `bup get` picks and appends can `--rewrite` the data being
+ transferred to respect the destination repository's configuration,
+ e.g. its `bup.split.files` and `bup.split.trees` settings. See
+ `bup-get`(1) for additional information.
+
* The default pack compression level can now be configured via either
`pack.compression` or `core.compression`. See `bup-config`(5) for
additional information.
diff --git a/test/ext/test-rewrite b/test/ext/test-rewrite
index 46fb85eb..47513153 100755
--- a/test/ext/test-rewrite
+++ b/test/ext/test-rewrite
@@ -48,7 +48,7 @@ compare() {

WVSTART split and rewrite
WVPASS bup split -n split < "$top/test/testfile1"
-WVPASS bup -d "$BUP_DIR2" rewrite --work-db "$tmpdir/db" -s "$BUP_DIR" split:test
+WVPASS bup -d "$BUP_DIR2" get --rewrite-db "$tmpdir/db" -s "$BUP_DIR" --append: split test
WVPASS compare "$BUP_DIR" split "$BUP_DIR2" test

WVSTART make multiple saves
@@ -59,14 +59,15 @@ WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"
WVPASS bup save -n save --strip-path="$top" "$top/test/sampledata"

WVSTART rewrite to different split
-WVPASS bup -d "$BUP_DIR4" rewrite --work-db "$tmpdir/db" -s "$BUP_DIR" save
+WVPASS bup -d "$BUP_DIR" ls -l save
+WVPASS bup -d "$BUP_DIR4" get --rewrite --rewrite-db "$tmpdir/db" -s "$BUP_DIR" --append save
WVPASS compare "$BUP_DIR" save "$BUP_DIR4" save

WVSTART "rewrite unchanged (to remote)"
-WVPASS bup rewrite -r ":$BUP_DIR3" --work-db "$tmpdir/db" -s "$BUP_DIR" save
+WVPASS bup get -r ":$BUP_DIR3" -s "$BUP_DIR" --append save
WVPASS compare "$BUP_DIR" save "$BUP_DIR3" save
-WVPASSEQ "$(GIT_DIR=$BUP_DIR WVPASS git rev-parse save)" \
- "$(GIT_DIR=$BUP_DIR3 WVPASS git rev-parse save)"
+WVPASSEQ "$(GIT_DIR="$BUP_DIR" WVPASS git log --pretty=format:%T -n1 save)" \
+ "$(GIT_DIR="$BUP_DIR3" WVPASS git log --pretty=format:%T -n1 save)"

WVSTART rewrite after size not stored
# now do a hack to save without saving the size in metadata ...
@@ -102,19 +103,19 @@ WVPASS bup -d "$BUP_DIR" ls -l save/latest/test/sampledata/y/testfile1 |
WVPASS grep -- 158664

# now rewrite again - and then the size should be correct even without augmentation
-WVPASS bup -d "$BUP_DIR4" rewrite --work-db "$tmpdir/db" -s "$BUP_DIR" save:save2
+WVPASS bup -d "$BUP_DIR4" get --rewrite --rewrite-db "$tmpdir/db" -s "$BUP_DIR" --append: save save2
WVPASS bup+ -d "$BUP_DIR4" ls -l save/latest/test/sampledata/y/testfile1 |
WVPASS grep -- 158664

# and again for the other kind of splitting
-WVPASS bup -d "$BUP_DIR3" rewrite --work-db "$tmpdir/db" -s "$BUP_DIR" save:save2
-WVPASS bup+ -d "$BUP_DIR3" ls -l save/latest/test/sampledata/y/testfile1 |
+WVPASS bup -d "$BUP_DIR3" get --rewrite --rewrite-db "$tmpdir/db" -s "$BUP_DIR" --append: save save2
+WVPASS bup+ -d "$BUP_DIR3" ls -l save2/latest/test/sampledata/y/testfile1 |
WVPASS grep -- 158664

WVSTART rewrite with excluded files
WVPASS bup -d "$BUP_DIR5" init
-WVPASS bup -d "$BUP_DIR5" rewrite --work-db "$tmpdir/db2" -s "$BUP_DIR4" \
- --exclude-rx ^/test/sampledata/y/ save
+WVPASS bup -d "$BUP_DIR5" get --rewrite -s "$BUP_DIR4" \
+ --exclude-rx '^/test/sampledata/y/' --append save
WVPASS extract_all "$BUP_DIR4" "save" "orig"
WVPASS extract_all "$BUP_DIR5" "save" "new"
rm -rf "$tmpdir/restore/orig/"*"/test/sampledata/y/"
@@ -128,8 +129,8 @@ WVPASS rm -rf "$tmpdir/restore"
WVSTART "rewrite with excluded files (in repo)"
WVPASS git config -f "$BUP_DIR/config" bup.split.trees true
WVPASS git config -f "$BUP_DIR/config" bup.split.files legacy:14
-WVPASS bup -d "$BUP_DIR" rewrite --work-db "$tmpdir/db3" -s "$BUP_DIR" \
- --exclude-rx ^/test/sampledata/y/ save:save-new
+WVPASS bup -d "$BUP_DIR" get --rewrite -s "$BUP_DIR" \
+ --exclude-rx '^/test/sampledata/y/' --append: save save-new
WVPASS extract_all "$BUP_DIR" "save" "orig"
WVPASS extract_all "$BUP_DIR" "save-new" "new"
rm -rf "$tmpdir/restore/orig/"*"/test/sampledata/y/"
diff --git a/test/ext/test_get.py b/test/ext/test_get.py
index 7b29e99b..27bf3a24 100644
--- a/test/ext/test_get.py
+++ b/test/ext/test_get.py
@@ -150,18 +150,43 @@ def validate_commit(src_id, dest_id):
rmrf(b'restore-src')
rmrf(b'restore-dest')

-def _validate_save(orig_dir, save_path, commit_id, tree_id):
- global bup_cmd
+def _get_save_coid(save):
+ # FIXME: add/use some kind of ls dereference opt
+ exr = exo((bup_cmd, b'-d', b'get-dest', b'ls', b'-d', b'--commit-hash', save))
+ if exr.rc: return False
+ if not exr.out.endswith(b'/latest\n'):
+ coid = exr.out.split()[0]
+ assert len(coid) == 40, exr.out
+ return coid
+ exr = exo((bup_cmd, b'-d', b'get-dest', b'ls', b'--commit-hash', save))
+ if exr.rc: return False
+ lines = exr.out.splitlines()
+ # Is save branch or branch/latest?
+ if lines[-1].rsplit(maxsplit=2) == b'0' * 40:
+ coid = exr.out.splitlines()[-2].split()[0]
+ assert len(coid) == 40, exr.out
+ return coid
+ assert save.endswith(b'/latest'), save
+ return _get_save_coid(save[:-7])
+
+def _validate_save(orig_dir, save, save_subpath, commit_id, tree_id):
+ assert isinstance(commit_id, (bytes, type(None)))
+ assert isinstance(tree_id, (bytes, type(None)))
+ assert bool(tree_id) == bool(commit_id)
+
# Check parent connectivity, etc.
- ex((b'git', b'-P', b'--git-dir', b'get-dest', b'log', b'-n2', commit_id),
+ save_coid = _get_save_coid(save)
+ if not save_coid: return False
+ ex((b'git', b'-P', b'--git-dir', b'get-dest', b'log', b'-n2', save_coid),
stdin=DEVNULL)
rmrf(b'restore')
exr = verify_rcz((bup_cmd, b'-d', b'get-dest',
- b'restore', b'-C', b'restore', save_path + b'/.'))
+ b'restore', b'-C', b'restore',
+ save + b'/' + save_subpath + b'/.'))
if exr.rc: return False
verify_trees_match(orig_dir + b'/', b'restore/')
if tree_id:
- # FIXME: double check that get-dest is correct
+ wvpasseq(commit_id, save_coid)
exr = verify_rcz((b'git', b'--git-dir', b'get-dest', b'ls-tree', tree_id))
if exr.rc: return False
cat = verify_rcz((b'git', b'--git-dir', b'get-dest',
@@ -179,24 +204,29 @@ def validate_save(dest_name, restore_subpath, commit_id, tree_id, orig_value,
get_commit_id = out[1]
wvpasseq(tree_id, get_tree_id)
wvpasseq(commit_id, get_commit_id)
- _validate_save(orig_value, dest_name + restore_subpath, commit_id, tree_id)
+ _validate_save(orig_value, dest_name, restore_subpath, commit_id, tree_id)

def validate_new_save(dest_name, restore_subpath, commit_id, tree_id, orig_value,
- get_out):
+ get_out, *, rewrite=False):
out = get_out.splitlines()
wvpasseq(2, len(out))
- get_tree_id = out[0]
- get_commit_id = out[1]
- wvpasseq(tree_id, get_tree_id)
- wvpassne(commit_id, get_commit_id)
- _validate_save(orig_value, dest_name + restore_subpath, get_commit_id, tree_id)
-
+ get_tree_id, get_commit_id = out
+ if not rewrite:
+ wvpasseq(tree_id, get_tree_id)
+ wvpassne(commit_id, get_commit_id)
+ _validate_save(orig_value, dest_name, restore_subpath, get_commit_id,
+ tree_id)
+ else:
+ _validate_save(orig_value, dest_name, restore_subpath, get_commit_id,
+ get_tree_id)
+
def validate_tagged_save(tag_name, restore_subpath,
commit_id, tree_id, orig_value, get_out):
out = get_out.splitlines()
wvpasseq(1, len(out))
get_tag_id = out[0]
- wvpasseq(commit_id, get_tag_id)
+ if commit_id:
+ wvpasseq(commit_id, get_tag_id)
# Make sure tmp doesn't already exist.
exr = exo((b'git', b'--git-dir', b'get-dest', b'show-ref', b'tmp-branch-for-tag'),
check=False)
@@ -204,7 +234,7 @@ def validate_tagged_save(tag_name, restore_subpath,

ex((b'git', b'--git-dir', b'get-dest', b'branch', b'tmp-branch-for-tag',
b'refs/tags/' + tag_name))
- _validate_save(orig_value, b'tmp-branch-for-tag/latest' + restore_subpath,
+ _validate_save(orig_value, b'tmp-branch-for-tag/latest', restore_subpath,
commit_id, tree_id)
ex((b'git', b'--git-dir', b'get-dest', b'branch', b'-D', b'tmp-branch-for-tag'))

@@ -218,8 +248,8 @@ def validate_new_tagged_commit(tag_name, commit_id, tree_id, get_out):
ex((b'git', b'-P', b'--git-dir', b'get-dest', b'log', b'-n2', tag_name),
stdin=DEVNULL)

-def _run_get(disposition, method, what):
- print('run_get:', repr((disposition, method, what)), file=sys.stderr)
+def _run_get(disposition, method, what, rewrite=None):
+ assert rewrite in (True, False, type(None))
global bup_cmd

if disposition == 'get':
@@ -244,21 +274,27 @@ def _run_get(disposition, method, what):
method += b':'
src, dest = what
cmd = get_cmd + (method, src, dest)
+ if rewrite:
+ cmd += (b'--rewrite',)
+ elif rewrite == False:
+ cmd += (b'--no-rewrite',)
result = exo(cmd, check=False, stderr=PIPE)
fsck = ex((bup_cmd, b'-d', b'get-dest', b'fsck'), check=False)
wvpasseq(0, fsck.rc)
return result

-def run_get(disposition, method, what=None, given=None):
+def run_get(disposition, method, what=None, given=None, rewrite=False):
global bup_cmd
rmrf(b'get-dest')
ex((bup_cmd, b'-d', b'get-dest', b'init'))
-
+ if rewrite:
+ ex((b'git', b'--git-dir', b'get-dest', b'config', b'bup.split.trees', b'true'))
+ ex((b'git', b'--git-dir', b'get-dest', b'config', b'bup.split.files', b'legacy:16'))
if given:
# FIXME: replace bup-get with independent commands as is feasible
- exr = _run_get(disposition, b'--replace', given)
+ exr = _run_get(disposition, b'--replace', given, False)
assert not exr.rc
- return _run_get(disposition, method, what)
+ return _run_get(disposition, method, what, rewrite)

def _test_universal(get_disposition, src_info):
methods = (b'--ff', b'--append', b'--pick', b'--force-pick', b'--new-tag',
@@ -486,33 +522,41 @@ def _test_append(get_disposition, src_info):
subtree_vfs_path = src_info['subtree-vfs-path']

wvstart(get_disposition + ' --append to root fails')
- for item in (b'.tag/tinyfile', b'src/latest' + tinyfile_path):
- exr = run_get(get_disposition, b'--append', (item, b'/'))
+ for item, rewrite in \
+ product((b'.tag/tinyfile', b'src/latest' + tinyfile_path),
+ (False, True)):
+ exr = run_get(get_disposition, b'--append', (item, b'/'), rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'source for .+ must be a branch, save, commit, or tree',
exr.err)
- for item in (b'.tag/subtree', b'src/latest' + subtree_vfs_path,
- b'.tag/commit-1', b'src/latest', b'src'):
- exr = run_get(get_disposition, b'--append', (item, b'/'))
+ for item, rewrite in \
+ product((b'.tag/subtree', b'src/latest' + subtree_vfs_path,
+ b'.tag/commit-1', b'src/latest', b'src'),
+ (False, True)):
+ exr = run_get(get_disposition, b'--append', (item, b'/'), rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'destination for .+ is a root, not a branch', exr.err)

wvstart(get_disposition + ' --append of not-treeish fails')
- for src in (b'.tag/tinyfile', b'src/latest' + tinyfile_path):
+ for src, rewrite in product((b'.tag/tinyfile', b'src/latest' + tinyfile_path),
+ (False, True)):
for given, item in ((None, (src, b'obj')),
(None, (src, b'.tag/obj')),
((b'.tag/tinyfile', b'.tag/obj'), (src, b'.tag/obj')),
((b'.tag/tree-1', b'.tag/obj'), (src, b'.tag/obj')),
((b'.tag/commit-1', b'.tag/obj'), (src, b'.tag/obj')),
((b'.tag/commit-1', b'obj'), (src, b'obj'))):
- exr = run_get(get_disposition, b'--append', item, given=given)
+ exr = run_get(get_disposition, b'--append', item, given=given,
+ rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'must be a branch, save, commit, or tree', exr.err)

wvstart(get_disposition + ' --append committish failure cases')
save_2 = src_info['save-2']
- for src in (b'.tag/subtree', b'src/latest' + subtree_vfs_path,
- b'.tag/commit-2', b'src/' + save_2, b'src'):
+ for src, rewrite in \
+ product((b'.tag/subtree', b'src/latest' + subtree_vfs_path,
+ b'.tag/commit-2', b'src/' + save_2, b'src'),
+ (False, True)):
for given, item, complaint in \
((None, (src, b'.tag/obj'),
br'destination .+ must be a valid branch name'),
@@ -524,36 +568,61 @@ def _test_append(get_disposition, src_info):
br'destination .+ is a tagged commit, not a branch'),
((b'.tag/commit-2', b'.tag/obj'), (src, b'.tag/obj'),
br'destination .+ is a tagged commit, not a branch')):
- exr = run_get(get_disposition, b'--append', item, given=given)
+ exr = run_get(get_disposition, b'--append', item, given=given,
+ rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(complaint, exr.err)

+ wvstart(get_disposition + ' --append --rewrite SAVE currently unsupported')
+ # If we add support, consider the ancestor case too (see below)
+ for existing in (None,
+ (b'.tag/commit-1', b'obj'),
+ (b'.tag/commit-2', b'obj'),
+ (b'unrelated-branch', b'obj')):
+ exr = run_get(get_disposition, b'--append', (b'src/' + save_2, b'obj'),
+ given=existing, rewrite=True)
+ wvpassne(0, exr.rc)
+ verify_rx(br'cannot append', exr.err)
+
wvstart(get_disposition + ' --append committish')
commit_2_id = src_info['commit-2-id']
tree_2_id = src_info['tree-2-id']
- for item in (b'.tag/commit-2', b'src/' + save_2, b'src'):
+ for item, rewrite in ((b'.tag/commit-2', False),
+ (b'src/' + save_2, False),
+ (b'src', False),
+ (b'src', True)):
for existing in (None, (b'.tag/commit-1', b'obj'),
(b'.tag/commit-2', b'obj'),
(b'unrelated-branch', b'obj')):
exr = run_get(get_disposition, b'--append', (item, b'obj'),
- given=existing)
+ given=existing, rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_new_save(b'obj/latest', getcwd() + b'/src',
- commit_2_id, tree_2_id, b'src-2', exr.out)
+ commit_2_id, tree_2_id, b'src-2', exr.out,
+ rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])
# Append ancestor
save_1 = src_info['save-1']
commit_1_id = src_info['commit-1-id']
tree_1_id = src_info['tree-1-id']
- for item in (b'.tag/commit-1', b'src/' + save_1, b'src-1'):
+ for item, rewrite in ((b'.tag/commit-1', False),
+ (b'src/' + save_1, False),
+ (b'src-1', False),
+ (b'src-1', True)):
exr = run_get(get_disposition, b'--append', (item, b'obj'),
- given=(b'.tag/commit-2', b'obj'))
+ given=(b'.tag/commit-2', b'obj'),
+ rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_new_save(b'obj/latest', getcwd() + b'/src',
- commit_1_id, tree_1_id, b'src-1', exr.out)
+ commit_1_id, tree_1_id, b'src-1', exr.out,
+ rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])

wvstart(get_disposition + ' --append tree')
+ exr = run_get(get_disposition, b'--append', (b'.tag/subtree', b'obj'),
+ given=None, rewrite=True)
+ wvpassne(0, exr.rc)
+ verify_rx(br'cannot append', exr.err)
subtree_path = src_info['subtree-path']
subtree_id = src_info['subtree-id']
for item in (b'.tag/subtree', b'src/latest' + subtree_vfs_path):
@@ -568,12 +637,16 @@ def _test_append(get_disposition, src_info):
verify_only_refs(heads=(b'obj',), tags=[])

wvstart(get_disposition + ' --append, implicit destinations')
-
- for item in (b'src', b'src/latest'):
- exr = run_get(get_disposition, b'--append', item)
+ exr = run_get(get_disposition, b'--append', b'src/latest', rewrite=True)
+ wvpassne(0, exr.rc)
+ verify_rx(br'cannot append', exr.err)
+ for item, rewrite in ((b'src', False),
+ (b'src', True),
+ (b'src/latest', False)):
+ exr = run_get(get_disposition, b'--append', item, rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_new_save(b'src/latest', getcwd() + b'/src', commit_2_id, tree_2_id,
- b'src-2', exr.out)
+ b'src-2', exr.out, rewrite=rewrite)
verify_only_refs(heads=(b'src',), tags=[])

def _test_pick_common(get_disposition, src_info, force=False):
@@ -581,128 +654,175 @@ def _test_pick_common(get_disposition, src_info, force=False):
flavormsg = flavor.decode('ascii')
tinyfile_path = src_info['tinyfile-path']
subtree_vfs_path = src_info['subtree-vfs-path']
-
- wvstart(get_disposition + ' ' + flavormsg + ' to root fails')
- for item in (b'.tag/tinyfile', b'src/latest' + tinyfile_path, b'src'):
- exr = run_get(get_disposition, flavor, (item, b'/'))
+
+ wvstart(f'{get_disposition} {flavormsg} to root fails')
+ for item, rewrite in \
+ product((b'.tag/tinyfile', b'src/latest' + tinyfile_path, b'src'),
+ (False, True)):
+ exr = run_get(get_disposition, flavor, (item, b'/'), rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'can only pick a commit or save', exr.err)
- for item in (b'.tag/commit-1', b'src/latest'):
- exr = run_get(get_disposition, flavor, (item, b'/'))
+ for item, rewrite in ((b'.tag/commit-1', False),
+ (b'src/latest', False),
+ (b'src/latest', True)):
+ exr = run_get(get_disposition, flavor, (item, b'/'), rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'destination is not a tag or branch', exr.err)
- for item in (b'.tag/subtree', b'src/latest' + subtree_vfs_path):
- exr = run_get(get_disposition, flavor, (item, b'/'))
+ for item, rewrite in \
+ product((b'.tag/subtree', b'src/latest' + subtree_vfs_path),
+ (False, True)):
+ exr = run_get(get_disposition, flavor, (item, b'/'), rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'is impossible; can only --append a tree', exr.err)

- wvstart(get_disposition + ' ' + flavormsg + ' of blob or branch fails')
- for item in (b'.tag/tinyfile', b'src/latest' + tinyfile_path, b'src'):
+ wvstart(f'{get_disposition} {flavormsg} of blob or branch fails')
+ for item, rewrite in \
+ product((b'.tag/tinyfile', b'src/latest' + tinyfile_path, b'src'),
+ (False, True)):
for given, get_item in ((None, (item, b'obj')),
(None, (item, b'.tag/obj')),
((b'.tag/tinyfile', b'.tag/obj'), (item, b'.tag/obj')),
((b'.tag/tree-1', b'.tag/obj'), (item, b'.tag/obj')),
((b'.tag/commit-1', b'.tag/obj'), (item, b'.tag/obj')),
((b'.tag/commit-1', b'obj'), (item, b'obj'))):
- exr = run_get(get_disposition, flavor, get_item, given=given)
+ exr = run_get(get_disposition, flavor, get_item, given=given,
+ rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'impossible; can only pick a commit or save', exr.err)

- wvstart(get_disposition + ' ' + flavormsg + ' of tree fails')
- for item in (b'.tag/subtree', b'src/latest' + subtree_vfs_path):
+ wvstart(f'{get_disposition} {flavormsg} of tree fails')
+ for item, rewrite in \
+ product((b'.tag/subtree', b'src/latest' + subtree_vfs_path),
+ (False, True)):
for given, get_item in ((None, (item, b'obj')),
(None, (item, b'.tag/obj')),
((b'.tag/tinyfile', b'.tag/obj'), (item, b'.tag/obj')),
((b'.tag/tree-1', b'.tag/obj'), (item, b'.tag/obj')),
((b'.tag/commit-1', b'.tag/obj'), (item, b'.tag/obj')),
((b'.tag/commit-1', b'obj'), (item, b'obj'))):
- exr = run_get(get_disposition, flavor, get_item, given=given)
+ exr = run_get(get_disposition, flavor, get_item, given=given,
+ rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'impossible; can only --append a tree', exr.err)

+ wvstart(f'{get_disposition} {flavormsg} --rewrite of non-saves fails')
+ # Only --rewrite case currently not rejected more generally above
+ exr = run_get(get_disposition, flavor, (b'/.tag/commit-1', b'/'),
+ rewrite=True)
+ wvpassne(0, exr.rc)
+ verify_rx(br'cannot currently --rewrite a commit', exr.err)
+
save_2 = src_info['save-2']
commit_2_id = src_info['commit-2-id']
tree_2_id = src_info['tree-2-id']
# FIXME: these two wvstart texts?
if force:
- wvstart(get_disposition + ' ' + flavormsg + ' commit/save to existing tag')
- for item in (b'.tag/commit-2', b'src/' + save_2):
+ wvstart(f'{get_disposition} {flavormsg} commit/save to existing tag')
+ for item, rewrite in ((b'.tag/commit-2', False),
+ (b'src/' + save_2, False),
+ (b'src/' + save_2, True)):
for given in ((b'.tag/tinyfile', b'.tag/obj'),
(b'.tag/tree-1', b'.tag/obj'),
(b'.tag/commit-1', b'.tag/obj')):
exr = run_get(get_disposition, flavor, (item, b'.tag/obj'),
- given=given)
+ given=given, rewrite=rewrite)
wvpasseq(0, exr.rc)
- validate_new_tagged_commit(b'obj', commit_2_id, tree_2_id,
- exr.out)
+ if rewrite:
+ validate_tagged_save(b'obj', getcwd() + b'/src', None, None,
+ b'src-2', exr.out)
+ else:
+ validate_new_tagged_commit(b'obj', commit_2_id, tree_2_id,
+ exr.out)
verify_only_refs(heads=[], tags=(b'obj',))
else: # --pick
- wvstart(get_disposition + ' ' + flavormsg
- + ' commit/save to existing tag fails')
- for item in (b'.tag/commit-2', b'src/' + save_2):
+ wvstart(f'{get_disposition} {flavormsg} commit/save to existing tag fails')
+ for item, rewrite in ((b'.tag/commit-2', False),
+ (b'src/' + save_2, False),
+ (b'src/' + save_2, True)):
for given in ((b'.tag/tinyfile', b'.tag/obj'),
(b'.tag/tree-1', b'.tag/obj'),
(b'.tag/commit-1', b'.tag/obj')):
- exr = run_get(get_disposition, flavor, (item, b'.tag/obj'), given=given)
+ exr = run_get(get_disposition, flavor, (item, b'.tag/obj'),
+ given=given, rewrite=rewrite)
wvpassne(0, exr.rc)
verify_rx(br'cannot overwrite existing tag', exr.err)
-
- wvstart(get_disposition + ' ' + flavormsg + ' commit/save to tag')
- for item in (b'.tag/commit-2', b'src/' + save_2):
- exr = run_get(get_disposition, flavor, (item, b'.tag/obj'))
+
+ wvstart(f'{get_disposition} {flavormsg} commit/save to tag')
+ for item, rewrite in ((b'.tag/commit-2', False),
+ (b'src/' + save_2, False),
+ (b'src/' + save_2, True)):
+ exr = run_get(get_disposition, flavor, (item, b'.tag/obj'),
+ rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_clean_repo()
- validate_new_tagged_commit(b'obj', commit_2_id, tree_2_id, exr.out)
+ if rewrite:
+ validate_tagged_save(b'obj', getcwd() + b'/src', None, None,
+ b'src-2', exr.out)
+ else:
+ validate_new_tagged_commit(b'obj', commit_2_id, tree_2_id, exr.out)
verify_only_refs(heads=[], tags=(b'obj',))
-
- wvstart(get_disposition + ' ' + flavormsg + ' commit/save to branch')
- for item in (b'.tag/commit-2', b'src/' + save_2):
+
+ wvstart(f'{get_disposition} {flavormsg} commit/save to branch')
+ for item, rewrite in ((b'.tag/commit-2', False),
+ (b'src/' + save_2, False),
+ (b'src/' + save_2, True)):
for given in (None, (b'.tag/commit-1', b'obj'), (b'.tag/commit-2', b'obj')):
- exr = run_get(get_disposition, flavor, (item, b'obj'), given=given)
+ exr = run_get(get_disposition, flavor, (item, b'obj'), given=given,
+ rewrite=rewrite)
wvpasseq(0, exr.rc)
+ ex((bup_cmd, b'-d', b'get-dest', b'ls', b'--commit-hash', b'obj'))
validate_clean_repo()
validate_new_save(b'obj/latest', getcwd() + b'/src',
- commit_2_id, tree_2_id, b'src-2', exr.out)
+ commit_2_id, tree_2_id, b'src-2', exr.out,
+ rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])

- wvstart(get_disposition + ' ' + flavormsg
- + ' commit/save unrelated commit to branch')
- for item in(b'.tag/commit-2', b'src/' + save_2):
+ wvstart(f'{get_disposition} {flavormsg} commit/save unrelated commit to branch')
+ for item, rewrite in ((b'.tag/commit-2', False),
+ (b'src/' + save_2, False),
+ (b'src/' + save_2, True)):
exr = run_get(get_disposition, flavor, (item, b'obj'),
- given=(b'unrelated-branch', b'obj'))
+ given=(b'unrelated-branch', b'obj'),
+ rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_clean_repo()
validate_new_save(b'obj/latest', getcwd() + b'/src',
- commit_2_id, tree_2_id, b'src-2', exr.out)
+ commit_2_id, tree_2_id, b'src-2', exr.out,
+ rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])

- wvstart(get_disposition + ' ' + flavormsg + ' commit/save ancestor to branch')
+ wvstart(f'{get_disposition} {flavormsg} commit/save ancestor to branch')
save_1 = src_info['save-1']
commit_1_id = src_info['commit-1-id']
tree_1_id = src_info['tree-1-id']
- for item in (b'.tag/commit-1', b'src/' + save_1):
+ for item, rewrite in ((b'.tag/commit-1', False),
+ (b'src/' + save_1, False),
+ (b'src/' + save_1, True)):
exr = run_get(get_disposition, flavor, (item, b'obj'),
- given=(b'.tag/commit-2', b'obj'))
+ given=(b'.tag/commit-2', b'obj'),
+ rewrite=rewrite)
wvpasseq(0, exr.rc)
validate_clean_repo()
validate_new_save(b'obj/latest', getcwd() + b'/src',
- commit_1_id, tree_1_id, b'src-1', exr.out)
+ commit_1_id, tree_1_id, b'src-1', exr.out,
+ rewrite=rewrite)
verify_only_refs(heads=(b'obj',), tags=[])

-
- wvstart(get_disposition + ' ' + flavormsg + ', implicit destinations')
+ wvstart(f'{get_disposition} {flavormsg} implicit destinations')
exr = run_get(get_disposition, flavor, b'.tag/commit-2')
wvpasseq(0, exr.rc)
validate_clean_repo()
validate_new_tagged_commit(b'commit-2', commit_2_id, tree_2_id, exr.out)
verify_only_refs(heads=[], tags=(b'commit-2',))

- exr = run_get(get_disposition, flavor, b'src/latest')
- wvpasseq(0, exr.rc)
- validate_clean_repo()
- validate_new_save(b'src/latest', getcwd() + b'/src',
- commit_2_id, tree_2_id, b'src-2', exr.out)
- verify_only_refs(heads=(b'src',), tags=[])
+ for rewrite in False, True:
+ exr = run_get(get_disposition, flavor, b'src/latest', rewrite=rewrite)
+ wvpasseq(0, exr.rc)
+ validate_clean_repo()
+ validate_new_save(b'src/latest', getcwd() + b'/src',
+ commit_2_id, tree_2_id, b'src-2', exr.out,
+ rewrite=rewrite)
+ verify_only_refs(heads=(b'src',), tags=[])

def _test_pick_force(get_disposition, src_info):
_test_pick_common(get_disposition, src_info, force=True)
@@ -887,7 +1007,7 @@ def create_get_src():
mkdir(b'src/x')
mkdir(b'src/x/y')
ex((bup_cmd + b' -d get-src random 1k > src/1'), shell=True)
- ex((bup_cmd + b' -d get-src random 1k > src/x/2'), shell=True)
+ ex((bup_cmd + b' -d get-src random 1m > src/x/2'), shell=True)
ex((bup_cmd, b'-d', b'get-src', b'index', b'src'))
exr = exo((bup_cmd, b'-d', b'get-src', b'save', b'-tcn', b'src', b'src'))
out = exr.out.splitlines()
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Treat --bupm without --links much like links, noticing/reporting any
missing objects encountered incidentally (without requiring a
PackIdxList for the more thorough evaluation) instead of crashing on a
raised MissingObject.

To allow this, simplify/generalize find_live_objects by dropping
count_missing, and just rely on for_item for everything. Allow
for_item to return True to suppress the normal MissingObject raise for
missing objects.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-validate-refs.1.md | 7 ++--
lib/bup/cmd/validate_refs.py | 49 ++++++++++++++++++----------
lib/bup/gc.py | 38 +++++++++++----------
3 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/Documentation/bup-validate-refs.1.md b/Documentation/bup-validate-refs.1.md
index 4756f165..6386104c 100644
--- a/Documentation/bup-validate-refs.1.md
+++ b/Documentation/bup-validate-refs.1.md
@@ -8,7 +8,7 @@ bup-validate-refs - check integrity of repository refs

# SYNOPSIS

-bup validate-refs [--links] [--bupm] [*ref*...]
+bup validate-refs [\--links] [\--bupm] [*ref*...]

# DESCRIPTION

@@ -39,8 +39,9 @@ has encountered before.
\--bupm
: check bupm (metadata storage) files. Currently checks for missing
path entries, which could have been caused by `bup` versions since
- 0.25 and before 0.30.1. See REPAIRS in `bup-get`(1) for additional
- information.
+ 0.25 and before 0.30.1. May notice missing objects, but may not
+ notice all of them without `--links`. See REPAIRS in `bup-get`(1)
+ for additional information.

\--links
: check for commits or trees that refer to missing objects. This
diff --git a/lib/bup/cmd/validate_refs.py b/lib/bup/cmd/validate_refs.py
index 6fc2a955..03dd2214 100644
--- a/lib/bup/cmd/validate_refs.py
+++ b/lib/bup/cmd/validate_refs.py
@@ -6,7 +6,7 @@ from stat import S_ISDIR
from bup import git, options, vfs
from bup.compat import argv_bytes
from bup.gc import count_objects, find_live_objects
-from bup.git import BUP_CHUNKED, demangle_name, tree_iter
+from bup.git import BUP_CHUNKED, MissingObject, demangle_name, tree_iter
from bup.helpers import EXIT_FAILURE, EXIT_FALSE, EXIT_TRUE, log
from bup.metadata import Metadata
from bup.io import walk_path_msg, path_msg
@@ -64,7 +64,7 @@ def main(argv):
verbosity = opt.verbose

if (opt.links, opt.bupm) == (False, False):
- o.fatal(f'no validation requested')
+ o.fatal('no validation requested')
if (opt.links, opt.bupm) == (None, None):
opt.links = opt.bupm = True

@@ -77,12 +77,27 @@ def main(argv):

bad_bupm = 0
abridged_bupm = 0
+ found_missing = 0
+ def notice_missing(ref_name, item_path):
+ nonlocal found_missing
+ found_missing += 1
+ item = item_path[-1]
+ imsg = walk_path_msg(ref_name, item_path)
+ log(f'missing {item.oid.hex()} {imsg}\n')

- def validate_if_bupm(ref_name, item_path):
- nonlocal bad_bupm, abridged_bupm
+ def for_item(ref_name, item_path):
+ # Always notice missing objects; without --links won't be
+ # comprehensive.
item = item_path[-1]
+ if item.data is False:
+ notice_missing(ref_name, item_path)
+ return True
+ if not opt.bupm:
+ return True
+
+ nonlocal bad_bupm, abridged_bupm
if item.name != b'.bupm':
- return
+ return True
bupm_n = 0
with tree_data_reader(repo, item.oid) as bupm:
try:
@@ -91,6 +106,8 @@ def main(argv):
bupm_n += 1
except EOFError:
pass
+ except MissingObject:
+ return True # bupm sub-item, will be handled by later for_item
except Exception:
pm = walk_path_msg(ref_name, item_path)
raise Exception(f'Unable to parse .bupm at {pm}')
@@ -99,7 +116,7 @@ def main(argv):
assert info[0], info
exp_n = expected_bup_entry_count_for_tree(b''.join(info[3]))
if bupm_n == exp_n:
- return
+ return True
elif bupm_n > exp_n:
bad_bupm += 1
log(f'error: tree with extra bupm entries ({bupm_n} > {exp_n})'
@@ -108,8 +125,8 @@ def main(argv):
abridged_bupm += 1
imsg = walk_path_msg(ref_name, item_path)
log(f'abridged-bupm {imsg}\n')
+ return True

- found_missing = 0
# Wanted all refs, or at least some specified weren't missing
if not extra or (extra and ref_info):
existing_count = count_objects(git.repo(b'objects/pack'), verbosity)
@@ -120,17 +137,13 @@ def main(argv):
if opt.links:
idxl = git.PackIdxList(git.repo(b'objects/pack'))
maybe_close_idxl.enter_context(idxl)
- found = find_live_objects(existing_count, cat_pipe,
- refs=ref_info,
- count_missing=opt.links,
- idx_list=idxl,
- for_item=opt.bupm and validate_if_bupm,
- verbosity=verbosity)
- if opt.links:
- live_objects, live_trees, found_missing = found
- else:
- live_objects, live_trees = found
- live_objects.close()
+ live_objs, live_trees = \
+ find_live_objects(existing_count, cat_pipe,
+ refs=ref_info,
+ idx_list=idxl,
+ for_item=for_item,
+ verbosity=verbosity)
+ live_objs.close()
if bad_bupm:
return EXIT_FAILURE
elif (ref_missing + found_missing + abridged_bupm):
diff --git a/lib/bup/gc.py b/lib/bup/gc.py
index 9bb7ff25..cd8e53d6 100644
--- a/lib/bup/gc.py
+++ b/lib/bup/gc.py
@@ -69,14 +69,22 @@ def count_objects(dir, verbosity):

def report_missing(ref_name, item_path):
item = item_path[-1]
+ if item.data is not False:
+ return True
imsg = walk_path_msg(ref_name, item_path)
note_error(f'missing {item.oid.hex()} {imsg}\n')
+ return True


def find_live_objects(existing_count, cat_pipe, refs=None, *,
- count_missing=False, idx_list=None, for_item=None,
- verbosity=0):
- if count_missing: assert idx_list, (count_missing, idx_list)
+ idx_list=None, for_item=None, verbosity=0):
+ # Currently, for_item(ref_name, item_path) is called for all
+ # items, even missing items, and item.data will always be False
+ # for missing items as per walk_object. When a missing object is
+ # encountered and for_item has not been provided, or if for_item
+ # has been provided and does not return True, then a MissingObject
+ # will be raised. If idx_list is provided, then existence checks
+ # will be broad.
pack_dir = git.repo(b'objects/pack')
ffd, bloom_filename = tempfile.mkstemp(b'.bloom', b'tmp-gc-', pack_dir)
os.close(ffd)
@@ -91,7 +99,6 @@ def find_live_objects(existing_count, cat_pipe, refs=None, *,
stop_at = lambda x: unhexlify(x) in live_trees
oid_exists = (lambda oid: idx_list.exists(oid)) if idx_list else None
approx_live_count = 0
- missing = 0
scan_refs = refs if refs else list(git.list_refs())
ref_n = len(scan_refs)
def progress_msg():
@@ -106,14 +113,13 @@ def find_live_objects(existing_count, cat_pipe, refs=None, *,
for item in item_path:
assert isinstance(item, git.WalkItem)
item = item_path[-1]
- if item.data is False:
- if count_missing:
- report_missing(ref_name, item_path)
- missing += 1
- else:
- raise MissingObject(item.oid)
+ handled_missing = None
if for_item:
- for_item(ref_name, item_path)
+ handled_missing = for_item(ref_name, item_path)
+ assert handled_missing in (True, None), handled_missing
+ if (not handled_missing) and item.data is False:
+ raise MissingObject(item.oid)
+
# FIXME: batch ids
if item.type != b'blob':
if verbosity and not item.oid in live_trees:
@@ -127,10 +133,7 @@ def find_live_objects(existing_count, cat_pipe, refs=None, *,
live_blobs.add(item.oid)
log(progress_msg() + '\n')
maybe_close_bloom.pop_all()
- if count_missing:
- return live_blobs, live_trees, missing
- else:
- return live_blobs, live_trees
+ return live_blobs, live_trees

_pack_stem_rx = re.compile(br'pack-[0-9a-fA-F]{40}')

@@ -253,12 +256,13 @@ def bup_gc(threshold=10, compression=1, verbosity=0, ignore_missing=False):
else:
try:
with ExitStack() as maybe_close_idxl:
- idxl = None
+ for_item, idxl = None, None
if ignore_missing:
idxl = git.PackIdxList(git.repo(b'objects/pack'))
maybe_close_idxl.enter_context(idxl)
+ for_item = report_missing
found = find_live_objects(existing_count, cat_pipe,
- count_missing=ignore_missing,
+ for_item=for_item,
idx_list=idxl,
verbosity=verbosity)
live_objects, live_trees = found[:2]
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Now that we have "contextual" arguments, disallow those that don't
precede a fetch request, e.g.

bup get --pick src --rewrite

rather than

bup get --rewrite --pick src

Thanks to Johannes Berg for raising the concern.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 15 +++++++++++++++
test/ext/test-get-excludes | 4 ++++
2 files changed, 19 insertions(+)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 6890f6ab..c34cede9 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -178,6 +178,7 @@ def parse_args(args):
missing=MissingConfig(id=repair_id, mode=missing,
repair_info=opt.repair_info))

+ pending_method_context = {} # dict to preserve insertion order
remaining = args[1:] # Skip argv[0]
while remaining:
arg = remaining[0]
@@ -188,17 +189,21 @@ def parse_args(args):
opt.verbose += 1
remaining = remaining[1:]
elif arg == b'--missing':
+ pending_method_context[arg] = True
(val,), remaining = require_n_args_or_die(1, remaining)
if val not in (b'fail', b'ignore', b'replace'):
misuse(f'--missing must be fail, ignore, or replace, not {val!r}')
missing = val.decode('ascii')
elif arg == b'--ignore-missing':
+ pending_method_context[arg] = True
missing = 'ignore'
remaining = remaining[1:]
elif arg == b'--no-ignore-missing':
+ pending_method_context[arg] = True
missing = 'fail'
remaining = remaining[1:]
elif arg == b'--repair-id':
+ pending_method_context[arg] = True
(val,), remaining = require_n_args_or_die(1, remaining)
if not val:
misuse('empty --repair-id')
@@ -210,11 +215,14 @@ def parse_args(args):
(ref,), remaining = require_n_args_or_die(1, remaining)
opt.target_specs.append(make_spec(method=arg[2:].decode('ascii'),
src=ref, dest=None))
+ pending_method_context = {}
elif arg in (b'--ff:', b'--append:', b'--pick:', b'--force-pick:',
b'--new-tag:', b'--replace:'):
(ref, dest), remaining = require_n_args_or_die(2, remaining)
+ args_after_last_method = remaining
opt.target_specs.append(make_spec(method=arg[2:-1].decode('ascii'),
src=ref, dest=dest))
+ pending_method_context = {}
elif arg in (b'-s', b'--source'):
(opt.source,), remaining = require_n_args_or_die(1, remaining)
elif arg in (b'-r', b'--remote'):
@@ -226,13 +234,17 @@ def parse_args(args):
elif arg == b'--print-tags':
opt.print_tags, remaining = True, remaining[1:]
elif arg == b'--rewrite':
+ pending_method_context[arg] = True
rewrite, remaining = True, remaining[1:]
elif arg == b'--no-rewrite':
+ pending_method_context[arg] = True
rewrite, remaining = False, remaining[1:]
elif arg in (b'--exclude-rx', b'--exclude-rx-from'): # handled later
+ pending_method_context[arg] = True
(val,), remaining = require_n_args_or_die(1, remaining)
exclude_opts.append((arg, val))
elif arg == b'--no-excludes':
+ pending_method_context[arg] = True
exclude_opts, remaining = [], remaining[1:]
elif arg in (b'-0', b'-1', b'-2', b'-3', b'-4', b'-5', b'-6', b'-7',
b'-8', b'-9'):
@@ -253,6 +265,9 @@ def parse_args(args):
continue
else:
misuse(f'unrecognized argument: {path_msg(arg)}')
+ if pending_method_context:
+ ctx_msg = ' '. join(path_msg(x) for x in pending_method_context.keys())
+ misuse(f'trailing arguments with no effect: {ctx_msg}')
return opt

# FIXME: client error handling (remote exceptions, etc.)
diff --git a/test/ext/test-get-excludes b/test/ext/test-get-excludes
index 44a867cf..362247b4 100755
--- a/test/ext/test-get-excludes
+++ b/test/ext/test-get-excludes
@@ -39,5 +39,9 @@ WVPASSEQ $'one' "$(bup ls dst-1/latest/a)"
WVPASSEQ $'one\nthree\ntwo' "$(bup ls dst-2/latest/a)"
WVPASSEQ $'three\ntwo' "$(bup ls dst-3/latest/a)"

+WVSTART 'ignored contextual arguments are disallowed'
+WVEXPRC 2 eval 'bup get --rewrite --append x --exclude-rx y --ignore-missing 2> >(tee err.log)'
+WVPASS grep -E '^error: trailing arguments with no effect: --exclude-rx --ignore-missing' err.log

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 45 ++++++++++++++++++------------------
lib/bup/repair.py | 31 +++++++++++--------------
lib/bup/rewrite.py | 57 ++++++++++++++++++++++------------------------
3 files changed, 62 insertions(+), 71 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index cd7dd9fc..93b171c5 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -29,7 +29,7 @@ from bup.helpers import \
tty_width)
from bup.io import path_msg
from bup.pwdgrp import userfullname, username
-from bup.repair import RepairConfig, RepairInfo, valid_repair_id
+from bup.repair import Repairs, valid_repair_id
from bup.repo import LocalRepo, make_repo
from bup.rewrite import Rewriter

@@ -136,12 +136,12 @@ class Spec:
src: bytes
dest: bytes
ignore_missing: bool
- repair: Optional[RepairConfig] = None
+ repairs: Optional[Repairs] = None
excludes: Optional[list[Pattern]] = None
rewriter: Optional[Union[bool, Rewriter]] = None
def __post_init__(self):
- assert not (self.ignore_missing and self.repair), \
- (self.ignore_missing, self.repair)
+ assert not (self.ignore_missing and self.repairs), \
+ (self.ignore_missing, self.repairs)

def spec_msg(s):
if not s.dest:
@@ -158,15 +158,13 @@ def parse_args(args):
opt.print_commits = opt.print_trees = opt.print_tags = False
opt.bwlimit = None
opt.compress = None
- opt.repair_info = RepairInfo(command=get_argvb())
opt.source = opt.remote = None
opt.target_specs = []

- # For now, rewriting is a "global" state, i.e. enabled for all
- # specs or none. Since we don't want to create a Rewriter until
- # we've finished checking the requests (e.g. are past the
- # resolvers), the spec's rewriter will be set to True to indicate
- # that it needs the real Rewriter once we have it.
+ # Since we don't want to create a Rewriter until we've finished
+ # checking the requests (e.g. are past the resolvers), the spec's
+ # rewriter will be set to True to indicate that it needs the real
+ # Rewriter once we have it.
exclude_opts = []
ignore_missing = False
# rewrite and repair track each arg's "state" and repair implies rewrite
@@ -181,17 +179,16 @@ def parse_args(args):
if excludes and not (rewrite or repair):
misuse('--exclude-rx or --exclude-rx-from requires --rewrite or --repair')
rc = None
- if (rewrite or repair):
+ if rewrite or repair:
if repair_id is None:
repair_id = str(uuid4()).encode('ascii')
- rc = RepairConfig(id=repair_id, destructive=repair,
- info=opt.repair_info)
+ rc = Repairs(repair_id, repair, get_argvb())
if rewrite: rw = True
elif repair: rw = True
elif rewrite in (False, True): rw = rewrite
else: rw = repair
return Spec(method=method, src=src, dest=dest, excludes=excludes,
- rewriter=rw, ignore_missing=ignore_missing, repair=rc)
+ rewriter=rw, ignore_missing=ignore_missing, repairs=rc)

pending_method_context = {} # dict to preserve insertion order
remaining = args[1:] # Skip argv[0]
@@ -350,7 +347,7 @@ def transfer_commit(name, hash, parent, src_repo, dest_repo, ignore_missing):


def append_commit(src_loc, parent, src_repo, dest_repo, rewriter, excludes,
- repair_config, ignore_missing):
+ repairs, ignore_missing):
if not rewriter:
assert isinstance(src_loc, (bytes, Loc)), src_loc
oidx = src_loc if isinstance(src_loc, bytes) else hexlify(src_loc.hash)
@@ -366,17 +363,17 @@ def append_commit(src_loc, parent, src_repo, dest_repo, rewriter, excludes,
assert isinstance(save[1], (vfs.Commit, vfs.FakeLink)), path
assert isinstance(ref[1], vfs.RevList), path
return rewriter.append_save(path, parent, src_repo, dest_repo, excludes,
- repair_config)
+ repairs)

def append_commits(src_loc, dest_hash, src_repo, dest_repo, rewriter, excludes,
- repair_config, ignore_missing):
+ repairs, ignore_missing):
if not rewriter:
commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
commits.reverse()
last_c, tree = dest_hash, None
for commit in commits:
last_c, tree = append_commit(commit, last_c, src_repo, dest_repo,
- rewriter, excludes, repair_config,
+ rewriter, excludes, repairs,
ignore_missing)
assert tree is not None
return last_c, tree
@@ -405,7 +402,7 @@ def append_commits(src_loc, dest_hash, src_repo, dest_repo, rewriter, excludes,
coid = unhexlify(commit)
last_c, tree = rewriter.append_save(path + (entry_for_coid[coid],),
last_c, src_repo, dest_repo,
- excludes, repair_config)
+ excludes, repairs)
assert tree is not None
return last_c, tree

@@ -639,7 +636,7 @@ def handle_append(item, src_repo, dest_repo):
assert item.dest.type in ('branch', 'commit', 'save'), item.dest
return append_commits(item.src, item.dest.hash, src_repo, dest_repo,
item.spec.rewriter, item.spec.excludes,
- item.spec.repair, item.spec.ignore_missing)
+ item.spec.repairs, item.spec.ignore_missing)


def resolve_pick(spec, src_repo, dest_repo):
@@ -684,13 +681,13 @@ def handle_pick(item, src_repo, dest_repo):
if item.dest.type in ('branch', 'commit', 'save'):
return append_commit(item.src, item.dest.hash, src_repo, dest_repo,
item.spec.rewriter, item.spec.excludes,
- item.spec.repair, item.spec.ignore_missing)
+ item.spec.repairs, item.spec.ignore_missing)
assert item.dest.path.startswith(b'/.tag/'), item.dest
# no parent; either dest is a non-commit tag and we should clobber
# it, or dest doesn't exist.
return append_commit(item.src, None, src_repo, dest_repo,
item.spec.rewriter, item.spec.excludes,
- item.spec.repair, item.spec.ignore_missing)
+ item.spec.repairs, item.spec.ignore_missing)


def resolve_new_tag(spec, src_repo, dest_repo):
@@ -943,7 +940,9 @@ def main(argv):

get_everything(opt)

- if opt.repair_info.repair_count() and not saved_errors:
+ if any(spec.repairs and spec.repairs.repair_count() \
+ for spec in opt.target_specs) \
+ and not saved_errors:
msg = ('Repairs were needed and successful; see above. Additional'
' information may be found in the git log. Search for '
' "Repair-ID:" in "git --git-dir REPO log ..." for the related'
diff --git a/lib/bup/repair.py b/lib/bup/repair.py
index e9c840db..d1860686 100644
--- a/lib/bup/repair.py
+++ b/lib/bup/repair.py
@@ -1,9 +1,7 @@

from binascii import hexlify
-from typing import Optional

-from bup.compat import dataclass
-from bup.io import enc_sh
+from bup.io import enc_sh, log


def valid_repair_id(s):
@@ -14,17 +12,24 @@ def valid_repair_id(s):
return True


-class RepairInfo:
+class Repairs:
# Used, for example, to track all repairs in a bup get process
- __slots__ = 'command', '_others', '_replacements'
- def __init__(self, *, command=None):
+ __slots__ = ('id', 'destructive', 'command', '_others', '_replacements')
+ def __init__(self, id, destructive, command):
+ assert valid_repair_id(id)
+ self.id = id
+ self.destructive = destructive
self.command = command
self._others = 0
self._replacements = []
- def note_repair(self): self._others += 1
+ def repair_count(self): return len(self._replacements) + self._others
+ def note_incidental_repair(self):
+ # "Safe" repairs that don't involve the repair id.
+ self._others += 1
def path_replaced(self, path, oid, new_oid):
+ if self.repair_count() == 0:
+ log(b'repairs needed, repair-id: %s\n' % self.id)
self._replacements.append((path, oid, new_oid))
- def repair_count(self): return len(self._replacements) + self._others
def repair_trailers(self, repair_id):
assert valid_repair_id(repair_id)
if not self.repair_count():
@@ -34,13 +39,3 @@ class RepairInfo:
trailers.append(b'Bup-Replaced: %s %s'
% (hexlify(new_oid), enc_sh(path)))
return trailers
-
-
-@dataclass(slots=True, frozen=True)
-class RepairConfig:
- id: bytes
- destructive: bool # Allow repairs that lose data (e.g. replacements)
- info: Optional[RepairInfo] = None
- def __post_init__(self):
- assert valid_repair_id(self.id)
- assert isinstance(self.info, RepairInfo)
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index ee9952f6..02f2abf0 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -95,11 +95,9 @@ def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
return item, dst, None
return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

-def _path_repaired(path, oid, replacement_oid, missing_oid, repair_config):
- if repair_config.info.repair_count() == 0:
- log(b'repairs needed, repair-id: %s\n' % repair_config.id)
+def _path_repaired(path, oid, replacement_oid, missing_oid, repairs):
fs_path = _fs_path_from_vfs(path)
- repair_config.info.path_replaced(fs_path, oid, replacement_oid)
+ repairs.path_replaced(fs_path, oid, replacement_oid)
ep = path_msg(fs_path)
log(f'warning: missing object {missing_oid.hex()} for {ep}\n')
log(f'repaired {ep} {oid.hex()} -> {replacement_oid.hex()}\n')
@@ -152,7 +150,7 @@ class IncompleteDir:
missing: bytes # MissingObject oid

def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
- repair_config, *, _replacement_parents=None):
+ repairs, *, _replacement_parents=None):
"""Yield information about the paths underneath the given path.

Yield (src_path, replacement_dir), where src_path is a vfs_path
@@ -160,9 +158,9 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
representing a directory that has already been rewritten.

When unreadable objects are encountered, raise MissingObject if
- there is no repair_config, otherwise, yield an IncompleteDir if
- the path refers to a missing git tree, or split tree with missing
- split sub-trees.
+ repairs.destructive is false, otherwise, yield an IncompleteDir if
+ the path refers to a missing git tree, or a split tree with
+ missing split sub-trees.

"""
if _replacement_parents is None:
@@ -173,7 +171,7 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
# drop branch/DATE
fs_path_in_save = _fs_path_from_vfs((path[0],) + path[3:])

- if not repair_config.destructive:
+ if not repairs.destructive:
entries = vfs.contents(srcrepo, item)
else:
try:
@@ -225,13 +223,13 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
sub_rpath = _replacement_parents + (conv_item.oid,)
yield from _vfs_walk_dir_recursively(srcrepo, dstrepo, sub_path,
excludes, db, mapping,
- repair_config,
+ repairs,
_replacement_parents=sub_rpath)
assert path_w_meta is not None, f'{path_msg(fs_path_in_save)} has no "."'
assert isinstance(path_w_meta[-1][1].meta, (Metadata, int)), path_w_meta
yield path_w_meta, None

-def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repair_config):
+def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repairs):
name, item = path[-1]
assert isinstance(name, bytes)
have_meta = isinstance(item.meta, metadata.Metadata)
@@ -240,17 +238,16 @@ def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repair_config):
target = vfs.readlink(srcrepo, item)
except MissingObject as ex:
if have_meta and item.symlink_target is not None:
- repair_config.info.note_repair()
+ repairs.note_indidental_repair()
pm = path_msg(_fs_path_from_vfs(path))
log(f'warning: symlink data replaced from metadata for {pm}\n')
target = item.symlink_target
else:
- if not repair_config.destructive:
+ if not repairs.destructive:
raise ex
replacement = _replacement_symlink_item(dstrepo, item,
- repair_config.id, ex.oid)
- _path_repaired(path, item.oid, replacement.oid, ex.oid,
- repair_config)
+ repairs.id, ex.oid)
+ _path_repaired(path, item.oid, replacement.oid, ex.oid, repairs)
assert replacement.meta.mode == default_file_mode
stack.append_to_current(name, default_file_mode, default_file_mode,
replacement.oid, replacement.meta)
@@ -291,7 +288,7 @@ def _maybe_exec_mode(git_mode, meta):
return git_mode

def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
- split_cfg, stack, wdbc, mapping, repair_config):
+ split_cfg, stack, wdbc, mapping, repairs):
"""Returns either None, or, if a directory was missing, the
directory path components.

@@ -322,20 +319,20 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,

if incomplete: # must be a dir
assert replacement_dir is None, replacement_dir
- assert repair_config, repair_config
+ assert repairs, repairs
extend_stack(dir_path[len(stack):-1])
# For now, wholesale replacement (no attempt to handle
# partially readable split trees).
rep_item = incomplete.path[-1][1]
replacement = _replacement_tree_item(dstrepo, rep_item,
- repair_config.id,
+ repairs.id,
incomplete.missing)
# Must not remember repairs because the repair-id (and so blob
# content) can vary across saves, i.e. get --rewrite-id is a
# contextual argument, and because the type changes from tree
# to blob.
_path_repaired(path, rep_item.oid, replacement.oid, incomplete.missing,
- repair_config)
+ repairs)
assert replacement.meta.mode == default_file_mode, repr(replacement)
stack.append_to_current(path[-1][0],
replacement.meta.mode, GIT_MODE_FILE,
@@ -345,7 +342,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
# First, things that can't be affected by the rewrite
if S_ISLNK(item_mode):
extend_stack(dir_path[len(stack):])
- _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repair_config)
+ _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repairs)
return
if not S_ISREG(item_mode) and not S_ISDIR(item_mode):
# Everything here (pipes, devices, etc.) should be fully
@@ -380,7 +377,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
# has missing objects when it encounters it a second time (for
# say the second of two saves during an --append), which will
# omit the logging, repair trailers, etc.
- if not repair_config.destructive:
+ if not repairs.destructive:
wdbc.execute(f'insert into {mapping} (src, dst) values (?, ?)',
(item.oid, newtree))
return
@@ -414,15 +411,15 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
except MissingObject as ex:
# For now, wholesale replacement (no attempt to handle
# partially readable split files).
- if not repair_config.destructive:
+ if not repairs.destructive:
raise ex
- replacement = _replacement_file_item(dstrepo, item, repair_config.id,
+ replacement = _replacement_file_item(dstrepo, item, repairs.id,
ex.oid)
# Must not remember repairs because the repair-id (and so blob
# content) can vary across saves, i.e. get --rewrite-id is a
# contextual argument, and because the type may change from
# tree to blob.
- _path_repaired(path, item.oid, replacement.oid, ex.oid, repair_config)
+ _path_repaired(path, item.oid, replacement.oid, ex.oid, repairs)
assert replacement.meta.mode == default_file_mode, repr(replacement)
stack.append_to_current(name, replacement.meta.mode, GIT_MODE_FILE,
replacement.oid, replacement.meta)
@@ -473,7 +470,7 @@ class Rewriter:
pass

def append_save(self, save_path, parent, srcrepo, dstrepo, excludes,
- repair_config):
+ repairs):
# Strict for now
assert isinstance(parent, (bytes, type(None))), parent
if parent:
@@ -513,11 +510,11 @@ class Rewriter:
for path, replacement_dir \
in _vfs_walk_dir_recursively(srcrepo, dstrepo, save_path,
excludes, dbc, self._mapping,
- repair_config):
+ repairs):
_rewrite_save_item(save_path, path, replacement_dir,
srcrepo, dstrepo,
self._split_cfg, stack, dbc,
- self._mapping, repair_config)
+ self._mapping, repairs)

while len(stack) > 1: # pop all parts above root folder
stack.pop()
@@ -527,8 +524,8 @@ class Rewriter:
ci = parse_commit(get_cat_data(srcrepo.cat(save_oidx), b'commit'))
author = ci.author_name + b' <' + ci.author_mail + b'>'
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
- trailers = repair_config.info.repair_trailers(repair_config.id)
- msg = commit_message(ci.message, repair_config.info.command,
+ trailers = repairs.repair_trailers(repairs.id)
+ msg = commit_message(ci.message, repairs.command,

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/validate_refs.py | 5 +++--
lib/bup/gc.py | 12 +++++++-----
2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/lib/bup/cmd/validate_refs.py b/lib/bup/cmd/validate_refs.py
index b238b0c2..1c80b94d 100644
--- a/lib/bup/cmd/validate_refs.py
+++ b/lib/bup/cmd/validate_refs.py
@@ -57,8 +57,9 @@ def main(argv):
if existing_count:
with git.PackIdxList(git.repo(b'objects/pack')) as idxl:
live_objects, live_trees, found_missing = \
- find_live_objects(existing_count, cat_pipe, idxl, refs=ref_info,
- verbosity=verbosity, count_missing=True)
+ find_live_objects(existing_count, cat_pipe, refs=ref_info,
+ count_missing=True, idx_list=idxl,
+ verbosity=verbosity)
live_objects.close()

return EXIT_FALSE if (ref_missing + found_missing) else EXIT_TRUE
diff --git a/lib/bup/gc.py b/lib/bup/gc.py
index b1c6b249..d41db0d3 100644
--- a/lib/bup/gc.py
+++ b/lib/bup/gc.py
@@ -81,8 +81,9 @@ def report_missing(ref_name, item_path):
note_error(f'missing {item.oid.hex()} {ref}:{path}\n')


-def find_live_objects(existing_count, cat_pipe, idx_list, refs=None,
- verbosity=0, count_missing=False):
+def find_live_objects(existing_count, cat_pipe, refs=None, *,
+ count_missing=False, idx_list=None, verbosity=0):
+ if count_missing: assert idx_list, (count_missing, idx_list)
pack_dir = git.repo(b'objects/pack')
ffd, bloom_filename = tempfile.mkstemp(b'.bloom', b'tmp-gc-', pack_dir)
os.close(ffd)
@@ -261,9 +262,10 @@ def bup_gc(threshold=10, compression=1, verbosity=0, ignore_missing=False):
if ignore_missing:
idxl = git.PackIdxList(git.repo(b'objects/pack'))
maybe_close_idxl.enter_context(idxl)
- found = find_live_objects(existing_count, cat_pipe, idxl,
- verbosity=verbosity,
- count_missing=ignore_missing)
+ found = find_live_objects(existing_count, cat_pipe,
+ count_missing=ignore_missing,
+ idx_list=idxl,
+ verbosity=verbosity)
live_objects, live_trees = found[:2]
if verbosity:
log('expecting to retain about %.2f%% unnecessary objects\n'
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Instead of (likely) /tmp since that's often a tmpfs these days and
it's not clear we want to be competing for space there for this.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/rewrite.py | 9 ++++++---
test/ext/test-on | 2 +-
2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 324ea3b7..869869d4 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -17,9 +17,10 @@ from bup.hashsplit import \
GIT_MODE_SYMLINK,
GIT_MODE_TREE,
split_to_blob_or_tree)
-from bup.helpers import hostname, log, should_rx_exclude_path, temp_dir
+from bup.helpers import hostname, log, mkdirp, should_rx_exclude_path, temp_dir
from bup.io import path_msg, qsql_id
from bup.metadata import Metadata
+from bup.path import xdg_cache
from bup.pwdgrp import userfullname, username
from bup.repair import MissingConfig
from bup.tree import Stack
@@ -459,9 +460,11 @@ class Rewriter:
if db:
self._db_tmpdir = None
else:
+ cache = xdg_cache() + b'/bup/tmp'
+ mkdirp(cache)
self._db_tmpdir = \
- ctx.enter_context(temp_dir(prefix='bup-rewrite-'))
- self._db_path = f'{self._db_tmpdir}/db'
+ ctx.enter_context(temp_dir(dir=cache, prefix=b'rewrite-'))
+ self._db_path = self._db_tmpdir + b'/db'
self._db_conn = sqlite3.connect(self._db_path)
ctx.enter_context(closing(self._db_conn))
self._db_conn.text_factory = bytes
diff --git a/test/ext/test-on b/test/ext/test-on
index c8b187f5..2464b4f5 100755
--- a/test/ext/test-on
+++ b/test/ext/test-on
@@ -52,7 +52,7 @@ for idx in "$tmpdir"/bup/objects/pack/*.idx ; do
WVPASS cmp "$idx" "$cachedidx"
done

-WVSTART 'index-cache respects XDG_CACHE_DIR'
+WVSTART 'index-cache respects XDG_CACHE_HOME'
WVPASS mkdir xdg-cache
export XDG_CACHE_HOME="$(pwd)/xdg-cache"
WVPASS bup on - split -ct src/baz > /dev/null
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Exit 2 immediately on netbsd for now because script is broken for our
purposes (was hanging) due to https://gnats.netbsd.org/56254

Thanks to Greg Troxel for finding and reporting the problem.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
dev/with-tty | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dev/with-tty b/dev/with-tty
index c65aa1d4..7f3de35f 100755
--- a/dev/with-tty
+++ b/dev/with-tty
@@ -7,11 +7,15 @@ set -ueo pipefail
usage() { echo 'Usage: with-tty command [arg ...]'; }
misuse() { usage 1>&2; exit 2; }

+case "$OSTYPE" in
+ netbsd) exit 2 ;; # https://gnats.netbsd.org/56254
+esac
+
if script -qec true /dev/null; then
- # linux flavor
+ # at least linux and netbsd
script -qec "$(printf ' %q' "$@")" /dev/null
elif script -q /dev/null true; then
- # bsd flavor
+ # at least freebsd
script -q /dev/null "$@"
else
rc=0
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Reserve EOFError for unexpected, erroneous EOFs.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/_helpers.c | 2 +-
lib/bup/cmd/ftp.py | 9 ++++-----
2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/bup/_helpers.c b/lib/bup/_helpers.c
index e9490a2b..d192001a 100644
--- a/lib/bup/_helpers.c
+++ b/lib/bup/_helpers.c
@@ -1655,7 +1655,7 @@ bup_readline(PyObject *self, PyObject *args)
return NULL;
char *line = readline(prompt);
if (!line)
- return PyErr_Format(PyExc_EOFError, "readline EOF");
+ Py_RETURN_NONE; /* EOF: return an owned reference to None, not a borrowed one */
PyObject *result = PyBytes_FromString(line);
free(line);
return result;
diff --git a/lib/bup/cmd/ftp.py b/lib/bup/cmd/ftp.py
index f8ecaa09..a6115fd0 100644
--- a/lib/bup/cmd/ftp.py
+++ b/lib/bup/cmd/ftp.py
@@ -102,11 +102,11 @@ def inputiter(f, pwd, out):
while 1:
prompt = b'bup %s> ' % (b'/'.join(name for name, item in pwd) or b'/', )
if hasattr(_helpers, 'readline'):
- try:
- yield _helpers.readline(prompt)
- except EOFError:
+ line = _helpers.readline(prompt)
+ if line is None:
print() # Clear the line for the terminal's next prompt
break
+ yield line # the line already read; a second readline() call would drop it
else:
out.write(prompt)
out.flush()
@@ -116,8 +116,7 @@ def inputiter(f, pwd, out):
break
yield read_line
else:
- for line in f:
- yield line
+ yield from f

def rpath_msg(res):
"""Return a path_msg for the resolved path res."""
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
This should be feasible because only non-directories could have been
affected by the fix in 0.31 that added empty Metadata entries for
paths whose metadata couldn't be read from the filesystem after they'd
already been added to the pending tree:

47891d8951a95b8e0d9ca94387107cdf12ca3d3c
save: add empty metadata if reading fails

which fixed:

16f9f9829038f25aec80ebfae3c882a66281e145
save-cmd.py: don't crash when a path disappears between index and save.

i.e. an empty metadata entry for a dir is normal (i.e. in the bupm for
the dir's parent), and for a non-dir always means the metadata was
lost.

We also don't need to augment the rewrite db's key because the
restricted mode is in the item.mode, which we'll still have when
querying the db. The rewrite db only needs to track the content, and
whether or not the replacement was chunked (the latter so we'll know
what kind of git object to create).

tree._dir_metadata still reverses the process, since it converts all
integer modes back to empty metadata.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
DESIGN.md | 7 +++
Documentation/bup-restore.1.md | 7 ++-
dev/clear-bupm-entries | 30 +++++++++
lib/bup/vfs.py | 110 +++++++++++++++++++++++----------
test/ext/test-empty-metadata | 62 +++++++++++++++++++
5 files changed, 181 insertions(+), 35 deletions(-)
create mode 100755 dev/clear-bupm-entries
create mode 100755 test/ext/test-empty-metadata

diff --git a/DESIGN.md b/DESIGN.md
index 2a8328c8..c55c1997 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -561,6 +561,13 @@ over time, both as a result of intentional changes and earlier bugs.
of course have split trees, nor will current repositories with
bup.split.trees set to false.

+Currently, an empty metadata entry for a directory is expected
+(e.g. the entry for a directory in its parent's .bupm, synthetic
+directories created by `--graft`, etc.), and an empty entry for a
+non-directory indicates that the metadata was lost (e.g. via the
+abridgement fix described above). The VFS intends to provide restrictive
+permissions (e.g. umask 077) for paths whose metadata has been lost.
+

Filesystem Interaction
======================
diff --git a/Documentation/bup-restore.1.md b/Documentation/bup-restore.1.md
index a7c246ea..48f1f1aa 100644
--- a/Documentation/bup-restore.1.md
+++ b/Documentation/bup-restore.1.md
@@ -64,7 +64,12 @@ disabled via `--numeric-ids` (which can be important when restoring a
chroot, for example), and as a special case, a uid or gid of 0 will
never be remapped by name. Additionally, some systems don't allow
setting a uid/gid that doesn't correspond with a known user/group. On
-those systems, bup will log an error for each relevant path.
+those systems, bup will log an error for each relevant path. Any
+"synthetic" paths, for example a root directory affected by `bup save
+--graft`, will have group and world read/execute permissions as if via
+a umask of 022. Any paths whose metadata has been lost (perhaps via
+earlier versions of bup; see the taxonomy in DESIGN) will have
+restrictive permissions as if via a umask of 077.

The `--map-user`, `--map-group`, `--map-uid`, `--map-gid` options may
be used to adjust the available ownership information before any of
diff --git a/dev/clear-bupm-entries b/dev/clear-bupm-entries
new file mode 100755
index 00000000..c5af224c
--- /dev/null
+++ b/dev/clear-bupm-entries
@@ -0,0 +1,30 @@
+#!/bin/sh
+"""": # -*-python-*-
+bup_python="$(dirname "$0")/bup-python" || exit $?
+exec "$bup_python" "$0" ${1+"$@"}
+"""
+
+from argparse import ArgumentParser
+from sys import stderr, stdin, stdout
+import sys
+
+from bup import metadata
+from bup.metadata import Metadata
+
+parser = ArgumentParser(description='Replace indexed entries with empty entries',
+ epilog='example: clear-bupm-entries 0 2 7 < bupm > result')
+add_arg = parser.add_argument
+add_arg('indexes', metavar='<index>', type=int, nargs='*',
+ help='entry index (starting with 0)')
+opt = parser.parse_args()
+
+drop = set(opt.indexes)
+for i, m in enumerate(metadata._ArchiveIterator(stdin.buffer)):
+ if i in drop:
+ Metadata().write(stdout.buffer)
+ drop.remove(i)
+ else:
+ m.write(stdout.buffer)
+if drop:
+ print(f'ERROR: indexed entries did not exist {drop}', file=stderr)
+ sys.exit(2)
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 6a38529c..d15c75e6 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -20,18 +20,28 @@ may not be honored. Callers must be able to handle an item.meta value
that is either an instance of Metadata or an integer mode, perhaps via
item_mode() or augment_item_meta().

-An integer item.meta means that either no bup-recorded metadata was
-available, or the item was a subdirectory returned by a function like
-contents(), which doesn't retrieve the metadata for
-subdirectories. That's because the actual metadata for a directory is
-stored inside the directory (see fill_in_metadata_if_dir() or
-ensure_item_has_metadata()).
-
-Bup-recorded metadata may be unavailable for a number of reasons. For
-example, "synthetic" paths like the VFS root or /.tag/ don't have it,
-trees created by git or early versions of bup won't have it, and some
-versions of bup omitted it when the metadata was unreadable at save
-time.
+A publicly readable integer item.meta means either:
+
+ - No bup-recorded metadata was available (e.g. a tree created by git
+ or by bup before metadata was supported).
+
+ - The item is "synthetic", for example, the root directory "/", the
+ .tags/ directory, a RevList (branch) directory, a Commit (save),
+ or a "fake parent" created by the strip/graft options. The mode
+ permissions will be "public", i.e. readable by group and other (as
+ if by "chmod go=rX").
+
+ - The item's metadata should exist (i.e. the parent tree was created
+ by bup), but has been lost. The mode permissions will be
+ "private", i.e. no group or other permissions (as if by "chmod
+ go="). See the Repository Taxonomy in DESIGN for some additional
+ information.
+
+ - The item is a subdirectory returned by a function like contents(),
+ which doesn't retrieve the metadata for subdirectories. That's
+ because the actual metadata for a directory is stored inside the
+ directory (see fill_in_metadata_if_dir() or
+ ensure_item_has_metadata()).

Setting want_meta=False is rarely desirable since it can limit the VFS
to only the metadata that git itself can represent, and so for
@@ -57,7 +67,22 @@ from collections import namedtuple
from errno import EINVAL, ELOOP, ENOTDIR
from itertools import tee
from random import randrange
-from stat import S_IFDIR, S_IFLNK, S_IFREG, S_ISDIR, S_ISLNK, S_ISREG
+from stat import \
+ (S_IFDIR,
+ S_IFLNK,
+ S_IFREG,
+ S_IRGRP,
+ S_IROTH,
+ S_IRUSR,
+ S_IRWXG,
+ S_IRWXO,
+ S_ISDIR,
+ S_ISLNK,
+ S_ISREG,
+ S_IWUSR,
+ S_IXGRP,
+ S_IXOTH,
+ S_IXUSR)
from time import localtime, strftime
import re

@@ -85,9 +110,11 @@ class IOError(py_IOError):
py_IOError.__init__(self, errno, message)
self.terminus = terminus

-default_file_mode = S_IFREG | 0o644
-default_dir_mode = S_IFDIR | 0o755
-default_symlink_mode = S_IFLNK | 0o755
+_reg_perms = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH
+_exec_perms = _reg_perms | S_IXUSR | S_IXGRP | S_IXOTH
+default_file_mode = S_IFREG | _reg_perms
+default_dir_mode = S_IFDIR | _exec_perms
+default_symlink_mode = S_IFLNK | _exec_perms

def _default_mode_for_gitmode(gitmode):
if S_ISREG(gitmode):
@@ -555,22 +582,31 @@ def ordered_tree_entries(entries, bupm=None):

def _tree_items_except_dot(oid, entries, names=None, bupm=None):
"""Returns all tree items except ".", and assumes that any bupm is
- positioned just after that entry."""
+ positioned just after that entry. Any paths whose metadata has
+ been lost will have restrictive permissions (as if via umask
+ 077). See the Repository Taxonomy in DESIGN.
+
+ """
+
+ def read_nondir_meta(bupd, default_mode):
+ if not bupm:
+ return default_mode
+ meta = Metadata.read(bupm)
+ if meta is None: # lost metadata
+ return default_mode & ~(S_IRWXO | S_IRWXG)
+ return meta

def tree_item(ent_oid, kind, gitmode):
if kind == BUP_CHUNKED:
- meta = Metadata.read(bupm) if bupm else default_file_mode
+ assert S_ISDIR(gitmode), (ent_oid, kind, gitmode)
+ meta = read_nondir_meta(bupm, default_file_mode)
return Chunky(oid=ent_oid, meta=meta)

if S_ISDIR(gitmode):
# No metadata here (accessable via '.' inside ent_oid).
return Item(meta=default_dir_mode, oid=ent_oid)

- meta = Metadata.read(bupm) if bupm else None
- # handle the case of metadata being empty/missing in bupm
- # (or there not being bupm at all)
- if meta is None:
- meta = _default_mode_for_gitmode(gitmode)
+ meta = read_nondir_meta(bupm, _default_mode_for_gitmode(gitmode))
return Item(oid=ent_oid, meta=meta)

tree_ents = ordered_tree_entries(entries, bupm)
@@ -893,9 +929,12 @@ def contents(repo, item, names=None, want_meta=True):
(see fill_in_metadata_if_dir() or ensure_item_has_metadata()).

Note that want_meta is advisory. For any given item, item.meta
- might be a Metadata instance or a mode, and if the former,
- meta.size might be None. Missing sizes can be computed via via
- item_size() or augment_item_meta(..., include_size=True).
+ might be a Metadata instance or an integer mode, and if the
+ former, meta.size might be None. Missing sizes can be computed
+ via item_size() or augment_item_meta(...,
+ include_size=True). If an integer mode's permissions are
+ restrictive (i.e. no permissions for group or other), then the
+ metadata for the item has been lost.

Do not modify any item.meta Metadata instances directly. If
needed, make a copy via item.meta.copy() and modify that instead.
@@ -1081,14 +1120,17 @@ def resolve(repo, path, parent=None, want_meta=True, follow=True):

When want_meta is true, detailed metadata will be included in each
result item if it's avaiable, otherwise item.meta will be an
- integer mode. The metadata size may or may not be provided, but
- can be computed by item_size() or augment_item_meta(...,
- include_size=True). Setting want_meta=False is rarely desirable
- since it can limit the VFS to just the metadata git itself can
- represent, and so, as an example, fifos and sockets will appear to
- be regular files (e.g. S_ISREG(item_mode(item)) will be true) .
- But the option is provided because it may be more efficient when
- only the path names or the more limited metadata is sufficient.
+ integer mode. If an integer mode's permissions are restrictive
+ (i.e. no permissions for group or other), then the metadata for
+ the item has been lost. The metadata size may or may not be
+ provided, but can be computed by item_size() or
+ augment_item_meta(..., include_size=True). Setting
+ want_meta=False is rarely desirable since it can limit the VFS to
+ just the metadata git itself can represent, and so, as an example,
+ fifos and sockets will appear to be regular files
+ (e.g. S_ISREG(item_mode(item)) will be true) . But the option is
+ provided because it may be more efficient when only the path names
+ or the more limited metadata is sufficient.

Do not modify any item.meta Metadata instances directly. If
needed, make a copy via item.meta.copy() and modify that instead.
diff --git a/test/ext/test-empty-metadata b/test/ext/test-empty-metadata
new file mode 100755
index 00000000..f50e11b2
--- /dev/null
+++ b/test/ext/test-empty-metadata
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+. ./wvtest-bup.sh
+. ./test/lib/btl.sh
+
+set -o pipefail
+
+top="$(WVPASS pwd)" || exit $?
+tmpdir="$(WVPASS wvmktempdir)" || exit $?
+
+export BUP_DIR="$tmpdir/bup"
+export GIT_DIR="$tmpdir/bup"
+
+bup() { "$top/bup" "$@"; }
+
+
+WVPASS cd "$tmpdir"
+WVPASS bup init
+
+
+WVSTART 'non-directories with empty metadata have restrictive permissions'
+WVPASS mkdir src
+WVPASS echo 1 > src/1
+WVPASS echo 2 > src/2
+WVPASS mkfifo src/3
+WVPASS chmod 0644 src/{1,2,3}
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+WVPASS bup join src:.bupm > bupm
+
+wv-match-rx "$(bup ls -l src/latest | tr -s ' ' ' ')" \
+'^-rw-r--r-- [^/]+/.* 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} 1
+-rw-r--r-- [^/]+/.* 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} 2
+prw-r--r-- [^/]+/.* 0 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} 3$'
+
+# Now replace src:.bupm with one with some empty entries
+WVPASS "$top/dev/clear-bupm-entries" 1 3 < bupm > bupm-cleared
+cleared_bupm_oid="$(WVPASS git hash-object -w bupm-cleared)"
+new_tree="$(WVPASS git ls-tree src | WVPASS sed -Ee "1s/.*/100644 blob $cleared_bupm_oid .bupm/")"
+new_tree_oid="$(echo "$new_tree" | WVPASS git mktree)"
+new_save=$(WVPASS git commit-tree "$new_tree_oid" -p src -m 'empty metadata')
+WVPASS git branch -f src "$new_save"
+
+wv-match-rx "$(bup ls -l src/latest | tr -s ' ' ' ')" \
+'^-rw------- \?/\? 2 \?\?\?\?-\?\?-\?\? \?\?:\?\? 1
+-rw-r--r-- [^/]+/.* 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} 2
+-rw------- \?/\? 0 \?\?\?\?-\?\?-\?\? \?\?:\?\? 3$'
+
+WVPASS rm -rf restore
+um="$(umask)" || exit $?
+umask 0
+WVPASS bup restore -C restore/ src/latest/
+WVPASS cd restore
+WVPASSEQ "$(ls -l | cut -d' ' -f1 | tail +2)" \
+'-rw-------
+-rw-r--r--
+-rw-------'
+WVPASS cd ..
+umask "$um"
+WVPASS rm -rf restore
+
+WVPASS cd "$top"
+WVPASS rm -rf "$tmpdir"
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
During a repair, replace the metadata entries for any paths affected
by abridgement with empty Metadata(), since their metadata has
effectively been lost by the disordering because we don't know which
entries were dropped.

This will cause the non-directory paths (see the Taxonomy in DESIGN)
to end up with restrictive permissions when viewed via the VFS.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 4 +-
Documentation/bup-restore.1.md | 4 +-
Documentation/bup-validate-refs.1.md | 2 +-
lib/bup/metadata.py | 5 +-
lib/bup/rewrite.py | 24 +++++-
lib/bup/tree.py | 39 ++++++---
lib/bup/vfs.py | 117 ++++++++++++++++++++++-----
note/main.md | 8 ++
test/ext/test-get-repair-bupm | 107 ++++++++++++++++++++++++
test/ext/test-get-rewrite-missing | 24 ++----
test/lib/btl.sh | 8 ++
wvtest.sh | 1 -
12 files changed, 286 insertions(+), 57 deletions(-)
create mode 100755 test/ext/test-get-repair-bupm

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index 26856f9a..3ef5da5c 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -238,7 +238,9 @@ transfer when `--repair` is requested.
(ambiguous). When such an abridged `.bupm` is detected, `--repair`
drops all of the `.bupm` entries except the one for the directory
itself, ".", and so the affected paths lose most or all of their
- metadata (ownership, permissions, timestamps, etc.).
+ metadata (ownership, permissions, timestamps, etc.). These paths
+ will have restrictive permissions (as if via umask 077) when
+ presented, e.g. via `bup-restore(1)`, `bup-ls(1)`, etc.

* Use of `bup get` or `bup gc` versions before 0.33.5 could cause
repositories to end up with missing objects (which can be detected
diff --git a/Documentation/bup-restore.1.md b/Documentation/bup-restore.1.md
index 48f1f1aa..e6363636 100644
--- a/Documentation/bup-restore.1.md
+++ b/Documentation/bup-restore.1.md
@@ -68,8 +68,8 @@ those systems, bup will log an error for each relevant path. Any
"synthetic" paths, for example a root directory affected by `bup save
--graft`, will have group and world read/execute permissions as if via
a umask of 022. Any paths whose metadata has been lost (perhaps via
-earlier versions of bup; see the taxonomy in DESIGN) will have
-restrictive permissions as if via a umask of 077.
+`bup get --repair`, or earlier versions of bup; see the taxonomy in
+DESIGN) will have restrictive permissions as if via a umask of 077.

The `--map-user`, `--map-group`, `--map-uid`, `--map-gid` options may
be used to adjust the available ownership information before any of
diff --git a/Documentation/bup-validate-refs.1.md b/Documentation/bup-validate-refs.1.md
index a9a3f83c..4756f165 100644
--- a/Documentation/bup-validate-refs.1.md
+++ b/Documentation/bup-validate-refs.1.md
@@ -57,7 +57,7 @@ found, and some other positive integer for other failures.

# SEE ALSO

-`bup-fsck`(1), `bup-join`(1), `bup-restore`(1)
+`bup-fsck`(1), `bup-get`(1), `bup-join`(1), `bup-restore`(1)

# BUP

diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index 7a561627..565defe1 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -228,8 +228,9 @@ class Metadata:
# Timestamps are (sec, ns), relative to 1970-01-01 00:00:00, ns
# must be non-negative and < 10**9.

- # Consider bup.rewrite (_blob_replacement() ...) when making
- # changes to the records (particularly the common records).
+ # Consider bup.rewrite (e.g. _blob_replacement() ...) and
+ # LostMetadata when making changes to the records (particularly
+ # the common records).

def _add_common(self, path, st):
assert(st.st_uid >= 0)
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index a900fca4..282c06a8 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -23,9 +23,15 @@ from bup.io import path_msg, qsql_id
from bup.metadata import Metadata
from bup.path import xdg_cache
from bup.pwdgrp import userfullname, username
+from bup.repair import Repairs
from bup.tree import Stack
from bup.vfs import \
- Item, MissingObject, default_exec_mode, default_file_mode, render_path
+ (Item,
+ LostMetadata,
+ MissingObject,
+ default_exec_mode,
+ default_file_mode,
+ render_path)


# The current arrangement relies on a number of assumptions:
@@ -102,6 +108,11 @@ def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
return item, dst, None
return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

+def _meta_replaced(path, repairs):
+ repairs.meta_replaced(path)
+ fs_path = render_path(path[1:])
+ log(f'warning: metadata lost for {path_msg(fs_path)}\n')
+
def _path_repaired(path, oid, replacement_oid, missing_oid, repairs):
fs_path = render_path(path)
repairs.path_replaced(path, oid, replacement_oid)
@@ -161,7 +172,7 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
"""Yield information about the paths underneath the given path.

Yield (src_path, replacement_dir), where src_path is a vfs_path
- and replacement_dir is be the replacement tree oid for a src_path
+ and replacement_dir is the replacement tree oid for a src_path
representing a directory that has already been rewritten.

When unreadable objects are encountered, raise MissingObject if
@@ -187,7 +198,7 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
# listed in the split-tree "leaves" are actually
# missing. So the list() only ensures that the split tree
# itself isn't broken; its contents may be.
- entries = list(vfs.contents(srcrepo, item))
+ entries = list(vfs.contents(srcrepo, item, repair=True))
except MissingObject as ex:
yield IncompleteDir(path, ex.oid), None
return
@@ -314,6 +325,10 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
is_dir = S_ISDIR(item_mode)
dir_path = fs_path if is_dir else fs_path[:-1]

+ if isinstance(item.meta, LostMetadata):
+ assert isinstance(repairs, Repairs), repairs
+ _meta_replaced(path, repairs)
+
# If switching to a new sub-tree, finish the current sub-tree, and
# then we'll establish the sub-tree for the new sub-tree via
# extend_stack for the missing components.
@@ -501,7 +516,8 @@ class Rewriter:
try:
# Maintain a stack of information representing the current
# location in the archive being constructed.
- stack = Stack(dstrepo, self._split_cfg)
+ stack = \
+ Stack(dstrepo, self._split_cfg, repair=repairs.destructive)

if self._current_excludes != excludes:
# Whenever the excludes change, remembered tree
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index b8bd7094..15f4b273 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -3,16 +3,17 @@ from io import BytesIO
from stat import S_ISDIR

from bup import hashsplit
+from bup._helpers import RecordHashSplitter
from bup.hashsplit import \
(BUP_TREE_BLOBBITS,
GIT_MODE_TREE,
GIT_MODE_FILE,
split_to_blob_or_tree)
+from bup.git import shalist_item_sort_key, mangle_name
from bup.helpers import add_error
-from bup.metadata import Metadata
from bup.io import path_msg
-from bup.git import shalist_item_sort_key, mangle_name
-from bup._helpers import RecordHashSplitter
+from bup.metadata import Metadata
+from bup.vfs import LostMetadata


_empty_metadata = Metadata(frozen=True)
@@ -118,16 +119,24 @@ class StackDir:
f' items={[(x.name, x.oid.hex()) for x in self.items]!r}>'


-def _dir_metadata(dir_meta, items):
- # If all the metadata bound for the bupm are int or None, drop the
- # bupm to either match the original (say git created) tree or
- # (not yet implemented) to repair.
+def _dir_metadata(dir_meta, items, repair):
+ # If all the metadata bound for the bupm are int, None or (when
+ # repairing) LostMetadata, drop the bupm to either match the
+ # original (say git created) tree or to repair. In the abridged
+ # case, dir_meta still won't be a LostMetadata because the bug
+ # didn't affect directories. Changes here must maintain
+ # coordination with the relevant VFS behaviors
+ # (e.g. tree_items_except_dot).
any_real_meta = False
- if isinstance(dir_meta, Metadata):
+ if isinstance(dir_meta, (int, type(None))):
+ meta_ents = [(b'', _empty_metadata)]
+ elif isinstance(dir_meta, LostMetadata):
+ if not repair:
+ raise Exception(f'LostMetadata for ".", but not repairing {dir_meta!r}')
+ meta_ents = [(b'', _empty_metadata)]
+ elif isinstance(dir_meta, Metadata):
any_real_meta = True
meta_ents = [(b'', dir_meta)]
- elif isinstance(dir_meta, (int, type(None))):
- meta_ents = [(b'', _empty_metadata)]
else:
raise Exception(f'Unexpected "." metadata type {dir_meta!r}')
for entry in items:
@@ -136,6 +145,11 @@ def _dir_metadata(dir_meta, items):
if isinstance(entry.meta, (int, type(None))):
ml = (shalist_item_sort_key((entry.mode, entry.name, None)),
_empty_metadata)
+ elif isinstance(entry.meta, LostMetadata):
+ if not repair:
+ raise Exception(f'LostMetadata, but not repairing {entry!r}')
+ ml = (shalist_item_sort_key((entry.mode, entry.name, None)),
+ _empty_metadata)
elif isinstance(entry.meta, Metadata):
any_real_meta = True
ml = (shalist_item_sort_key((entry.mode, entry.name, None)),
@@ -149,10 +163,11 @@ def _dir_metadata(dir_meta, items):


class Stack:
- def __init__(self, repo, split_config):
+ def __init__(self, repo, split_config, *, repair=False):
self._stack = []
self._repo = repo
self._split_config = split_config
+ self._repair = repair

def __repr__(self):
cls = self.__class__
@@ -189,7 +204,7 @@ class Stack:
entry.oid)
for entry in items])

- metalist = _dir_metadata(dir_meta, items)
+ metalist = _dir_metadata(dir_meta, items, self._repair)
if not metalist:
return self._repo.write_tree([(entry.gitmode, entry.mangled_name(),
entry.oid)
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 15928f59..8c22578d 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -43,6 +43,12 @@ A publically readable integer item.meta means either:
directory (see fill_in_metadata_if_dir() or
ensure_item_has_metadata()).

+When repairs are requested (e.g. via contents()) and metadata cannot
+be recovered, the metadata may be a LostMetadata instance, which
+currently only has an integer mode attribute. Note that bup.tree
+relies on critical assumptions about the VFS behavior in order to
+detect and restore "git created" trees, etc. during a rewrite.
+
Setting want_meta=False is rarely desirable since it can limit the VFS
to only the metadata that git itself can represent, and so for
example, fifos and sockets will appear to be regular files
@@ -97,7 +103,7 @@ from bup.git import \
parse_commit,
tree_entries,
tree_iter)
-from bup.helpers import debug2
+from bup.helpers import EXIT_FAILURE, debug2
from bup.io import path_msg
from bup.metadata import Metadata

@@ -586,22 +592,84 @@ def ordered_tree_entries(entries, bupm=None):
tree_ents.sort(key=lambda x: x[0])
return tree_ents

+class LostMetadata(Metadata):
+ """Representation for metadata that's been lost, e.g. due to a bug
+ like the one that dropped bupm entries."""
+ def __init__(self, mode):
+ super().__init__()
+ self.mode = mode
+ self.freeze()
+
+_lost_dir_meta = LostMetadata(default_dir_mode & ~(S_IRWXO | S_IRWXG))
+_lost_file_meta = LostMetadata(default_file_mode & ~(S_IRWXO | S_IRWXG))
+_lost_exec_meta = LostMetadata(default_exec_mode & ~(S_IRWXO | S_IRWXG))
+_lost_symlink_meta = LostMetadata(default_symlink_mode & ~(S_IRWXO | S_IRWXG))

-def _tree_items_except_dot(oid, entries, names=None, bupm=None):
+def _lost_metadata_for_gitinfo(mode, kind):
+ if S_ISREG(mode):
+ if mode & S_IXUSR:
+ return _lost_exec_meta
+ return _lost_file_meta
+ if S_ISDIR(mode):
+ if kind == BUP_CHUNKED:
+ # REVIEW: We've just lost any executable bit in this case, right?
+ return _lost_file_meta
+ return _lost_dir_meta
+ if S_ISLNK(mode):
+ return _lost_symlink_meta
+ assert 'unexpected mode', oct(mode) # for now shouldn't be possible
+ return None # pylint
+
+def _validated_meta_ents(oid, tree_ents, bupm, repair):
+ # Versions before 47891d8951a95b8e0d9ca94387107cdf12ca3d3c
+ # (before 0.31) might rarely drop a bupm entry, so check.
+ if not bupm:
+ return None
+ meta_entries = []
+ try:
+ while True: meta_entries.append(Metadata.read(bupm))
+ except EOFError:
+ pass
+ exp_meta_n = 0
+ for ent in tree_ents:
+ if ent[1] != b'.bupm' and (ent[2] == BUP_CHUNKED or not S_ISDIR(ent[3])):
+ exp_meta_n += 1
+ if exp_meta_n != len(meta_entries):
+ # should be increasingly rare, and rare to begin with
+ if not repair:
+ ex = SystemExit('error: tree has missing metadata'
+ f' (see bup-get(1)) - {oid.hex()}')
+ ex.code = EXIT_FAILURE
+ raise ex
+ return None
+ meta_entries.reverse()
+ return meta_entries
+
+def _tree_items_except_dot(oid, entries, names=None, bupm=None, *, repair=False):
"""Returns all tree items except ".", and assumes that any bupm is
positioned just after that entry. Any paths whose metadata has
been lost will have restrictive permissions (as if via umask
077). See the Repository Taxonmy in DESIGN.

"""
+ # Ensure any changes to the metadata yielded coordinates properly
+ # with bup.tree (e.g. _dir_metadata).
+
+ tree_ents = ordered_tree_entries(entries, bupm)
+ meta_ents = _validated_meta_ents(oid, tree_ents, bupm, repair)
+ if meta_ents is None:
+ assert repair or not bupm, (meta_ents, repair, bupm)

def read_nondir_meta(bupd, default_mode):
- if not bupm:
- return default_mode
- meta = Metadata.read(bupm)
- if meta is None: # lost metadata
- return default_mode & ~(S_IRWXO | S_IRWXG)
- return meta
+ if meta_ents is not None:
+ meta = meta_ents.pop()
+ # empty for a number of reasons; see Repository Taxonomy in DESIGN
+ if meta is None: # lost metadata
+ return default_mode & ~(S_IRWXO | S_IRWXG)
+ return meta
+ if bupm: # repair
+ return _lost_metadata_for_gitinfo(gitmode, kind)
+ return default_mode

def tree_item(ent_oid, kind, gitmode):
if kind == BUP_CHUNKED:
@@ -617,8 +685,6 @@ def _tree_items_except_dot(oid, entries, names=None, bupm=None):
meta = read_nondir_meta(bupm, _default_mode_for_gitinfo(gitmode, kind))
return Item(oid=ent_oid, meta=meta)

- tree_ents = ordered_tree_entries(entries, bupm)
-
assert isinstance(names, (set, frozenset)) or names is None
assert len(oid) == 20
if not names:
@@ -645,8 +711,8 @@ def _tree_items_except_dot(oid, entries, names=None, bupm=None):
if name not in names:
if name > last_name:
break # given bupm sort order, we're finished
- if (kind == BUP_CHUNKED or not S_ISDIR(gitmode)) and bupm:
- Metadata.read(bupm)
+ if (kind == BUP_CHUNKED or not S_ISDIR(gitmode)):
+ if meta_ents is not None: meta_ents.pop()
continue
yield name, tree_item(ent_oid, kind, gitmode)
if remaining == 1:
@@ -714,7 +780,7 @@ def _parse_tree_depth(mangled_name):
assert depth > 0
return depth

-def tree_items(repo, oid, tree_data, names, *, want_meta=True):
+def tree_items(repo, oid, tree_data, names, *, want_meta=True, repair=False):
# For now, the .bupm order doesn't quite match git's, and we don't
# load the tree data incrementally anyway, so we just work in RAM
# via tree_data.
@@ -740,6 +806,11 @@ def tree_items(repo, oid, tree_data, names, *, want_meta=True):
if mangled_name > b'.bupm':
break

+ # When repairing, "." wouldn't be affected by the missing entries
+ # issues, i.e. the first entry (the dir) should still be OK
+ # (16f9f9829038f25aec80ebfae3c882a66281e145). Split trees should
+ # also be unaffected because the bug was fixed before they were
+ # introduced (47891d8951a95b8e0d9ca94387107cdf12ca3d3c).
if want_meta and bupm_oid:
if depth is None:
with _FileReader(repo, bupm_oid) as bupm:
@@ -747,18 +818,20 @@ def tree_items(repo, oid, tree_data, names, *, want_meta=True):
Metadata.read(bupm)
else:
yield b'.', Item(oid=oid, meta=_read_dir_meta(bupm))
- yield from _tree_items_except_dot(oid, entries, names, bupm)
+ yield from _tree_items_except_dot(oid, entries, names, bupm,
+ repair=repair)
else:
if dot_requested:
with _FileReader(repo, bupm_oid) as bupm:
yield b'.', Item(oid=oid, meta=_read_dir_meta(bupm))
- yield from _split_subtree_items(repo, depth, oid, entries, names, True)
+ yield from _split_subtree_items(repo, depth, oid, entries, names,
+ True)
return

if dot_requested:
yield b'.', Item(oid=oid, meta=default_dir_mode)
if not depth:
- yield from _tree_items_except_dot(oid, entries, names)
+ yield from _tree_items_except_dot(oid, entries, names, repair=repair)
else:
yield from _split_subtree_items(repo, depth, oid, entries, names, False)

@@ -924,7 +997,7 @@ def tags_items(repo, names):
return
remaining -= 1

-def contents(repo, item, names=None, want_meta=True):
+def contents(repo, item, names=None, want_meta=True, repair=False):
"""Yields information about the items contained in item. Yields
(name, item) for each name in names, if the name exists, in an
unspecified order. Items that don't exist are omitted. If there
@@ -944,6 +1017,13 @@ def contents(repo, item, names=None, want_meta=True):
restrictive (i.e. no permissions for group or other), then the
metadata for the item has been lost.

+ If repair is true then when an attempt to retrieve the metadata
+ for a path fails (e.g. because the .bupm file for the directory
+ exists, but is broken), the metadata will be a suitable
+ LostMetadata instance, with uid/gid 0 and with no group or other
+ permissions. Special files will become empty regular files. If
+ repair is false, exceptions will be raised instead.
+
Do not modify any item.meta Metadata instances directly. If
needed, make a copy via item.meta.copy() and modify that instead.

@@ -960,7 +1040,8 @@ def contents(repo, item, names=None, want_meta=True):
# Note: it shouldn't be possible to see an Item with type
# 'commit' since a 'commit' should always produce a Commit.
raise Exception('unexpected git ' + obj_t.decode('ascii'))
- yield from tree_items(repo, item.oid, data, names, want_meta=want_meta)
+ yield from tree_items(repo, item.oid, data, names, want_meta=want_meta,
+ repair=repair)
elif isinstance(item, RevList):
yield from revlist_items(repo, item.oid, names,
require_meta=want_meta)
diff --git a/note/main.md b/note/main.md
index 7e78dc3f..30143352 100644
--- a/note/main.md
+++ b/note/main.md
@@ -4,6 +4,14 @@ Notable changes in main (incomplete)
May require attention
---------------------

+* Versions of `bup` at or after 0.25 and before 0.30.1 might (rarely)
+ drop metadata entries for non-directories. That makes the metadata
+ for all of the other non-directory paths in the same directory
+ unusable (ambiguous). `bup` now detects this and treats it as an
+ error, given the potential risks with respect to incorrect
+ ownership, permissions, etc. The new `bup validate-refs` command can
+ detect the problem and `bup get --repair` can repair affected saves.
+
* Previously, `bup get --force-pick: SRC /.tag/DEST` created broken
commits if the `DEST` was not itself a commit (the parent would be
whatever `DEST` initially pointed to).
diff --git a/test/ext/test-get-repair-bupm b/test/ext/test-get-repair-bupm
new file mode 100755
index 00000000..5efc90f0
--- /dev/null
+++ b/test/ext/test-get-repair-bupm
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+. ./wvtest-bup.sh
+. ./test/lib/btl.sh
+
+set -o pipefail
+
+top="$(WVPASS pwd)" || exit $?
+tmpdir="$(WVPASS wvmktempdir)" || exit $?
+
+export BUP_DIR="$tmpdir/bup"
+export GIT_DIR="$tmpdir/bup"
+
+bup() { "$top/bup" "$@"; }
+
+set-repair-id()
+{
+ local log="$1"
+ WVPASSEQ 1 "$(grep -cE '^repairs needed, repair-id: ' "$log")"
+ repair_id="$(WVPASS grep -E '^repairs needed, repair-id: ' "$log")"
+ repair_id="${repair_id#repairs needed, repair-id: }"
+}
+
+# FIXME: consider checking expected compare-trees differences.
+
+
+WVPASS cd "$tmpdir"
+WVPASS bup init
+
+
+# See also test-validate-refs
+
+WVSTART 'repair of abridged bupms'
+# Create a save with two root files, and then three, and then replace
+# the three entry save's .bupm with the one from the two entry save so
+# we can check that abridged bupms are detected. Note that tree
+# splitting was added to bup well after this bug was fixed.
+WVPASS rm -rf bup src
+WVPASS bup init
+WVPASS mkdir -p src
+WVPASS echo 1 > src/1
+WVPASS echo 2 > src/2
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+WVPASS echo 3 > src/3
+WVPASS bup index src
+WVPASS bup tick
+WVPASS bup save --strip -n src src
+# Now replace src:.bupm with the abridged one
+bupm_1_2_ent="$(WVPASS git ls-tree src~ | WVPASS grep -E $'\t\.bupm$')"
+broken_tree="$(WVPASS git ls-tree src | WVPASS sed -Ee "1s/.*/$bupm_1_2_ent/")"
+broken_tree_oid="$(echo "$broken_tree" | WVPASS git mktree)"
+broken_save=$(WVPASS git commit-tree "$broken_tree_oid" -p src -m 'abridged bupm')
+WVPASS git branch -f src "$broken_save"
+
+src_oid="$(git rev-parse src)"
+WVPASS readarray -t saves < <(bup ls src)
+save_date="${saves[2]}"
+unset saves
+
+# May as well double-check, though test-validate-refs already does this
+WVEXPRC 1 eval 'bup validate-refs --bupm 2>&1 | tee validate.log'
+btl-display-file validate.log
+wv-match-rx "$(< validate.log)" \
+"abridged-bupm refs/heads/src $broken_tree_oid:\.bupm/?
+scanned 0 of 1 ref \(0\.00% of all objects\)"
+
+# Ensure a normal rewrite rejects the abridged bupm
+WVEXPRC 2 eval 'bup get --rewrite --append: src dst 2>&1 | tee rewrite.log'
+btl-display-file rewrite.log
+
+WVPASS umask 077
+
+# Test repair
+WVEXPRC 3 eval 'bup get --repair --append: src dst 2>&1 | tee rewrite.log'
+btl-display-file rewrite.log
+set-repair-id rewrite.log
+
+# Check commit message trailers
+WVPASS git --git-dir bup show -s --pretty=email dst \
+ | WVPASS git interpret-trailers --parse > repair-trailers
+btl-display-file repair-trailers
+bup_ver="$(bup version)"
+readarray -t trailers < repair-trailers
+wv-match-rx "${trailers[0]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[1]}" '^Bup-Argv: [^ ]+/bup.* get --repair --append: src dst$'
+wv-match-rx "${trailers[2]}" "^Bup-Repair-ID: $repair_id$"
+wv-match-rx "${trailers[3]}" "^Bup-Repaired-Save: $src_oid src/$save_date$"
+wv-match-rx "${trailers[4]}" '^Bup-Lost-Meta: 1$'
+wv-match-rx "${trailers[5]}" '^Bup-Lost-Meta: 2$'
+wv-match-rx "${trailers[6]}" '^Bup-Lost-Meta: 3$'
+WVPASSEQ '' "${trailers[7]}" # end-of-line
+unset trailers
+
+# Check path metadata
+WVPASS bup restore -C res dst/latest/.
+for i in {1..3}; do
+ WVPASS bup xstat res/"$i" > res-xstat
+ ls -l res/"$i"
+ btl-display-file res-xstat
+ WVPASS grep -E '^mode: 100600 \(-rw-------\)$' res-xstat
+ WVPASS grep -E '^rdev: 0$' res-xstat
+ WVPASS grep -E '^mtime: 0$' res-xstat
+done
+
+
+WVPASS cd "$top"
+WVPASS rm -rf "$tmpdir"
diff --git a/test/ext/test-get-rewrite-missing b/test/ext/test-get-rewrite-missing
index 066d70b8..75aad701 100755
--- a/test/ext/test-get-rewrite-missing
+++ b/test/ext/test-get-rewrite-missing
@@ -12,14 +12,6 @@ export GIT_DIR="$tmpdir/bup"

bup() { "$top/bup" "$@"; }

-display-file()
-{
- local name="$1"
- printf -- "----- \"%q\" content below -----\n" "$name"
- cat "$name"
- printf -- "----- \"%q\" content above -----\n" "$name"
-}
-
# FIXME: consider checking expected compare-trees differences.


@@ -172,9 +164,9 @@ WVPASS git --git-dir dest-repo ls-tree src \

WVSTART 'commit trailers include repairs'
git --git-dir dest-repo show -s --pretty=email src > repair-msg
-display-file repair-msg
+btl-display-file repair-msg
git interpret-trailers --parse < repair-msg > repair-trailers
-display-file repair-trailers
+btl-display-file repair-trailers
bup_ver="$(bup version)"
readarray -t trailers < repair-trailers
wv-match-rx "${trailers[0]}" "^Bup-Version: ${bup_ver//+/\\+}$"
@@ -202,7 +194,7 @@ src_missing_split="/src/$save_date/split-tree/"
WVSTART 'missing blobs are rewritten'
WVPASS grep -E "^repaired $src_missing_file $oid_rx -> $oid_rx\$" repair.log
WVPASS git --git-dir dest-repo show src:a/missing-file > blob-replacement
-display-file blob-replacement
+btl-display-file blob-replacement
WVPASS grep -E '^This is a replacement for a file' blob-replacement
WVPASS grep -E "^Bup-Replacement-Info: $repair_id" blob-replacement
WVPASS grep -E "^Replaced: file $(< file-oid)" blob-replacement
@@ -212,7 +204,7 @@ WVPASS grep -E "^Missing: $(< file-oid)" blob-replacement
WVSTART 'missing trees are rewritten'
WVPASS grep -E "^repaired $src_missing_dir $oid_rx -> $oid_rx\$" repair.log
WVPASS git --git-dir dest-repo show src:missing-dir > tree-replacement
-display-file tree-replacement
+btl-display-file tree-replacement
WVPASS grep -E '^This is a replacement for a tree' tree-replacement
WVPASS grep -E "^Bup-Replacement-Info: $repair_id" tree-replacement
WVPASS grep -E "^Replaced: tree $(< dir-oid)" tree-replacement
@@ -222,7 +214,7 @@ WVPASS grep -E "^Missing: $(< dir-oid)" tree-replacement
WVSTART 'incomplete chunked files are rewritten'
WVPASS grep -E "^repaired $src_missing_partial $oid_rx -> $oid_rx\$" repair.log
WVPASS git --git-dir dest-repo show src:partial-file > partial-file-replacement
-display-file partial-file-replacement
+btl-display-file partial-file-replacement
WVPASS grep -E '^This is a replacement for a file' partial-file-replacement
WVPASS grep -E "^Bup-Replacement-Info: $repair_id" partial-file-replacement
WVPASS grep -E "^Replaced: file $(< partial-file-oid)" partial-file-replacement
@@ -241,7 +233,7 @@ repair-to-dest
set-repair-id
WVPASS grep -E "^repaired $src_missing_split $oid_rx -> $oid_rx\$" repair.log
WVPASS git --git-dir dest-repo show src:split-tree > split-tree-replacement
-display-file split-tree-replacement
+btl-display-file split-tree-replacement
WVPASS grep -E '^This is a replacement for a tree' split-tree-replacement
WVPASS grep -E "^Bup-Replacement-Info: $repair_id" split-tree-replacement
WVPASS grep -E "^Replaced: tree $(< split-tree-oid)" split-tree-replacement
@@ -259,7 +251,7 @@ WVPASS grep -E "^repaired /src/$save_date/split-tree/$split_tree_l2_name/ $(< sp
WVPASS git --git-dir dest-repo \
show "src:split-tree/$split_tree_l1_name/$split_tree_l2_name" \
> split-tree-replacement
-display-file split-tree-replacement
+btl-display-file split-tree-replacement
WVPASS grep -E '^This is a replacement for a tree' split-tree-replacement
WVPASS grep -E "^Bup-Replacement-Info: $repair_id" split-tree-replacement
WVPASS grep -E "^Replaced: tree $(< split-tree-l2-oid)" split-tree-replacement
@@ -275,7 +267,7 @@ set-repair-id
WVPASS grep -E "^repaired /src/$save_date/split-tree/ $(< split-tree-oid) ->" \
repair.log
WVPASS git --git-dir dest-repo show "src:split-tree" > split-tree-replacement
-display-file split-tree-replacement
+btl-display-file split-tree-replacement
WVPASS grep -E '^This is a replacement for a tree' split-tree-replacement
WVPASS grep -E "^Bup-Replacement-Info: $repair_id" split-tree-replacement
WVPASS grep -E "^Replaced: tree $(< split-tree-oid)" split-tree-replacement
diff --git a/test/lib/btl.sh b/test/lib/btl.sh
index d6d52811..255c84db 100644
--- a/test/lib/btl.sh
+++ b/test/lib/btl.sh
@@ -16,3 +16,11 @@ btl-ent-oid()
oid="${ls_tree_line%%$'\t'*}"
echo "${oid##* }"
}
+
+btl-display-file()
+{
+ local name="$1"
+ printf -- "----- \"%q\" content below -----\n" "$name" || exit $?
+ cat "$name" || exit $?
+ printf -- "----- \"%q\" content above -----\n" "$name" || exit $?
+}
diff --git a/wvtest.sh b/wvtest.sh
index e1b83dfd..22eda616 100644
--- a/wvtest.sh
+++ b/wvtest.sh
@@ -80,7 +80,6 @@ WVEXPRC()
esac
}

-
WVPASS()
{
local TEXT="$*"
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Previously we'd lose the git tree exec bit when rewriting trees
without a .bupm (e.g. git created trees). Detect cases where the
original git tree entry had the exec bit and restore it.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/hashsplit.py | 1 +
lib/bup/rewrite.py | 27 ++++++++++++++++++++++-----
lib/bup/vfs.py | 3 +++
test/ext/test-rewrite | 9 +++++++--
4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index 8f71c8be..15e12f0a 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -12,6 +12,7 @@ MAX_PER_TREE = 256
fanout = 16

GIT_MODE_FILE = 0o100644
+GIT_MODE_EXEC = 0o100755
GIT_MODE_TREE = 0o40000
GIT_MODE_SYMLINK = 0o120000

diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index e236a2f0..324ea3b7 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -3,7 +3,7 @@ from binascii import hexlify
from contextlib import ExitStack, closing, nullcontext
from itertools import chain
from os.path import join as joinp
-from stat import S_ISDIR, S_ISLNK, S_ISREG
+from stat import S_ISDIR, S_ISLNK, S_IRWXG, S_IRWXO, S_ISREG
from typing import Any, Sequence
import sqlite3, time

@@ -11,14 +11,19 @@ from bup import hashsplit, metadata, vfs
from bup.commit import commit_message
from bup.compat import dataclass
from bup.git import get_cat_data, parse_commit
-from bup.hashsplit import GIT_MODE_FILE, GIT_MODE_SYMLINK, GIT_MODE_TREE
+from bup.hashsplit import \
+ (GIT_MODE_EXEC,
+ GIT_MODE_FILE,
+ GIT_MODE_SYMLINK,
+ GIT_MODE_TREE,
+ split_to_blob_or_tree)
from bup.helpers import hostname, log, should_rx_exclude_path, temp_dir
from bup.io import path_msg, qsql_id
from bup.metadata import Metadata
from bup.pwdgrp import userfullname, username
from bup.repair import MissingConfig
from bup.tree import Stack
-from bup.vfs import Item, MissingObject, default_file_mode
+from bup.vfs import Item, MissingObject, default_exec_mode, default_file_mode


# Currently only handles replacing entire vfs-level trees if any
@@ -279,6 +284,16 @@ def _remember_rewrite(from_oid, to_oid, chunked, size, wdbc, mapping):
' values (?, ?, ?, ?)',
(from_oid, to_oid, chunked, size))

+def _maybe_exec_mode(git_mode, meta):
+ if git_mode == GIT_MODE_FILE \
+ and meta in (default_exec_mode,
+ default_exec_mode & ~(S_IRWXO | S_IRWXG)):
+ # Means (via vfs) this had GIT_MODE_EXEC in its git
+ # entry. Restore that here so tree._write_tree will include
+ # it. This only matters for the "found in db" case.
+ return GIT_MODE_EXEC
+ return git_mode
+
def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
split_cfg, stack, wdbc, mapping, missing):
"""Returns either None, or, if a directory was missing, the
@@ -386,7 +401,8 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
assert S_ISREG(item_mode)
if oid is not None:
# already converted - oid and mode are known
- assert git_mode in (GIT_MODE_TREE, GIT_MODE_FILE)
+ assert S_ISREG(git_mode) or S_ISDIR(git_mode)
+ git_mode = _maybe_exec_mode(git_mode, item.meta)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)
return

@@ -399,7 +415,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,

try:
with vfs.tree_data_reader(srcrepo, item.oid) as f:
- git_mode, oid = hashsplit.split_to_blob_or_tree(
+ git_mode, oid = split_to_blob_or_tree(
write_data, dstrepo.write_tree,
hashsplit.from_config([f], split_cfg))
except MissingObject as ex:
@@ -430,6 +446,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
chunked = 1 if S_ISDIR(git_mode) else 0

_remember_rewrite(item.oid, oid, chunked, item_size, wdbc, mapping)
+ git_mode = _maybe_exec_mode(git_mode, item.meta)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

class Rewriter:
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index d15c75e6..793b3e6f 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -113,11 +113,14 @@ class IOError(py_IOError):
_reg_perms = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH
_exec_perms = _reg_perms | S_IXUSR | S_IXGRP | S_IXOTH
default_file_mode = S_IFREG | _reg_perms
+default_exec_mode = S_IFREG | _exec_perms
default_dir_mode = S_IFDIR | _exec_perms
default_symlink_mode = S_IFLNK | _exec_perms

def _default_mode_for_gitmode(gitmode):
if S_ISREG(gitmode):
+ if gitmode & S_IXUSR:
+ return default_exec_mode
return default_file_mode
if S_ISDIR(gitmode):
return default_dir_mode
diff --git a/test/ext/test-rewrite b/test/ext/test-rewrite
index ba6c4acc..d9d40104 100755
--- a/test/ext/test-rewrite
+++ b/test/ext/test-rewrite
@@ -147,9 +147,12 @@ diff -u "$tmpdir/o" "$tmpdir/n"
WVSTART "rewrite trees without .bupm"
WVPASS rm -rf src repo
WVPASS mkdir src
+# Test rewrite db cases with multiple versions of x (numbers ensure ordering)
WVPASS date > src/x
+WVPASS cp -p src/x src/x-1-with-exec
+WVPASS cp -p src/x src/x-2-without-exec
+WVPASS chmod +x src/x-1-with-exec
WVPASS date > src/y
-WVPASS chmod +x src/x
WVPASS bup init repo
WVPASS git --git-dir repo --work-tree "$(pwd)" add src
WVPASS git --git-dir repo --work-tree "$(pwd)" commit -am save
@@ -158,9 +161,11 @@ WVPASS git --git-dir repo config bup.split.files legacy:16
WVPASS bup -d repo get --rewrite -s repo --append: main main-rewrite
WVPASSEQ "$(bup -d repo ls -ln main-rewrite/latest/src | tr -s ' ' ' ')" \
'-rw-r--r-- ?/? 29 ????-??-?? ??:?? x
+-rwxr-xr-x ?/? 29 ????-??-?? ??:?? x-1-with-exec
+-rw-r--r-- ?/? 29 ????-??-?? ??:?? x-2-without-exec
-rw-r--r-- ?/? 29 ????-??-?? ??:?? y'
WVPASS git --git-dir repo ls-tree main-rewrite:src | WVPASS grep -vE '\.bupm$'
-WVPASS rm -rf repo src
+WVPASS rm -rf repo restore src


WVPASS cd "$top"
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Change the "get"s to raise MissingObject when the object is expected
to exist, for example, an object that was listed in a tree being
traversed (as compared to say a top-level ref that might or might not
exist).

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/vfs.py | 82 +++++++++++++++++++++++++++++---------------------
1 file changed, 48 insertions(+), 34 deletions(-)

diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index c44ad9ad..813c5087 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -58,6 +58,7 @@ import re
from bup import git
from bup.git import \
(BUP_CHUNKED,
+ MissingObject,
GitError,
find_tree_entry,
last_tree_entry,
@@ -91,17 +92,40 @@ def _default_mode_for_gitmode(gitmode):
return default_symlink_mode
raise Exception('unexpected git mode ' + oct(gitmode))

+def get_ref(repo, ref):
+ """Yield (oidx, type, size, data_iter) for ref.
+
+ If ref is missing, yield (None, None, None, None).
+
+ """
+ it = repo.cat(ref)
+ found_oidx, obj_t, size = next(it)
+ if not found_oidx:
+ return None, None, None, None
+ return found_oidx, obj_t, size, it
+
+def get_oidx(repo, oidx, *, throw_missing=True):
+ """Yield (oidx, type, size, data_iter) for oidx.
+
+ If oidx is missing, raise a MissingObject if throw_missing is
+ true, otherwise yield (None, None, None, None).
+
+ """
+ assert len(oidx) == 40
+ result = get_ref(repo, oidx)
+ if not result[0] and throw_missing:
+ raise MissingObject(unhexlify(oidx))
+ return result
+
def _normal_or_chunked_file_size(repo, oid):
"""Return the size of the normal or chunked file indicated by oid."""
# FIXME: --batch-format CatPipe?
- it = repo.cat(hexlify(oid))
- _, obj_t, size = next(it)
+ _, obj_t, _, it = get_oidx(repo, hexlify(oid))
ofs = 0
while obj_t == b'tree':
mode, name, last_oid = last_tree_entry(b''.join(it))
ofs += int(name, 16)
- it = repo.cat(hexlify(last_oid))
- _, obj_t, size = next(it)
+ _, obj_t, _, it = get_oidx(repo, hexlify(last_oid))
return ofs + sum(len(b) for b in it)

def _skip_chunks_before_offset(tree_data, offset):
@@ -122,8 +146,7 @@ def _tree_chunks(repo, tree_data, startofs):
skipmore = startofs - ofs
if skipmore < 0:
skipmore = 0
- it = repo.cat(hexlify(oid))
- _, obj_t, size = next(it)
+ _, obj_t, _, it = get_oidx(repo, hexlify(oid))
data = b''.join(it)
if S_ISDIR(mode):
assert obj_t == b'tree'
@@ -134,8 +157,7 @@ def _tree_chunks(repo, tree_data, startofs):

class _ChunkReader:
def __init__(self, repo, oid, startofs):
- it = repo.cat(hexlify(oid))
- _, obj_t, size = next(it)
+ _, obj_t, _, it = get_oidx(repo, hexlify(oid))
isdir = obj_t == b'tree'
data = b''.join(it)
if isdir:
@@ -361,13 +383,11 @@ def _read_dir_meta(bupm):

def _treeish_tree_data(repo, oid):
assert len(oid) == 20
- it = repo.cat(hexlify(oid))
- _, item_t, size = next(it)
+ _, item_t, _, it = get_oidx(repo, hexlify(oid))
data = b''.join(it)
if item_t == b'commit':
commit = parse_commit(data)
- it = repo.cat(commit.tree)
- _, item_t, size = next(it)
+ _, item_t, _, it = get_oidx(repo, commit.tree)
data = b''.join(it)
assert item_t == b'tree'
elif item_t != b'tree':
@@ -454,8 +474,7 @@ def _commit_item_from_oid(repo, oid, require_meta):
commit = cache_get_commit_item(oid, need_meta=require_meta)
if commit and ((not require_meta) or isinstance(commit.meta, Metadata)):
return commit
- it = repo.cat(hexlify(oid))
- _, typ, size = next(it)
+ _, typ, _, it = get_oidx(repo, hexlify(oid))
assert typ == b'commit'
commit = _commit_item_from_data(oid, b''.join(it))
if require_meta:
@@ -500,7 +519,6 @@ def root_items(repo, names=None, want_meta=True):
it = repo.cat(b'refs/heads/' + ref)
oidx, typ, size = next(it)
if not oidx:
- for _ in it: pass
continue
assert typ == b'commit'
commit = parse_commit(b''.join(it))
@@ -586,9 +604,8 @@ def _tree_items_except_dot(oid, entries, names=None, bupm=None):
remaining -= 1

def _get_tree_object(repo, oid):
- res = repo.cat(hexlify(oid))
- _, kind, _ = next(res)
- assert kind == b'tree', 'expected oid %r to be tree, not %r' % (hexlify(oid), kind)
+ _, kind, _, res = get_oidx(repo, hexlify(oid))
+ assert kind == b'tree', f'expected oid {oid.hex()} to be tree, not {kind!r}'
return b''.join(res)

def _find_bupm_oid(entries):
@@ -814,9 +831,7 @@ def tags_items(repo, names):
cached = cache_get_commit_item(oid, need_meta=False)
if cached:
return cached
- oidx = hexlify(oid)
- it = repo.cat(oidx)
- _, typ, size = next(it)
+ _, typ, _, it = get_oidx(repo, hexlify(oid))
if typ == b'commit':
return _commit_item_from_data(oid, b''.join(it))
for _ in it: pass
@@ -862,9 +877,9 @@ def tags_items(repo, names):
def contents(repo, item, names=None, want_meta=True):
"""Yields information about the items contained in item. Yields
(name, item) for each name in names, if the name exists, in an
- unspecified order. If there are no names, then yields (name,
- item) for all items, including, a first item named '.'
- representing the container itself.
+ unspecified order. Items that don't exist are omitted. If there
+ are no names, then yields (name, item) for all items, including a
+ first item named '.' representing the container itself.

The meta value for any directories other than '.' will be a
default directory mode, not a Metadata object. This is because
@@ -885,8 +900,7 @@ def contents(repo, item, names=None, want_meta=True):
assert repo
assert S_ISDIR(item_mode(item))
if isinstance(item, real_tree_types):
- it = repo.cat(hexlify(item.oid))
- _, obj_t, size = next(it)
+ _, obj_t, _, it = get_oidx(repo, hexlify(item.oid))
data = b''.join(it)
if obj_t != b'tree':
for _ in it: pass
@@ -970,6 +984,7 @@ def _resolve_path(repo, path, parent=None, want_meta=True, follow=True):
if not want_meta:
item = items[0][1] if items else None
else: # First item will be '.' and have the metadata
+ assert len(items) in (1, 2), items
item = items[1][1] if len(items) == 2 else None
dot, dot_item = items[0]
assert dot == b'.'
@@ -1164,24 +1179,23 @@ def join(repo, ref):
or a commit. The content of all blobs that can be seen from trees or
commits will be added to the list.
"""
- def _join(ref, path):
- it = repo.cat(ref)
- oidx, typ, _ = next(it)
+ def _join(oidx, typ, size, it, path):
if typ == b'blob':
yield from it
elif typ == b'tree':
treefile = b''.join(it)
for ent_mode, ent_name, ent_oid in tree_iter(treefile):
- yield from _join(hexlify(ent_oid), path + [ent_name])
+ yield from _join(*get_oidx(repo, hexlify(ent_oid)), path + [ent_name])
elif typ == b'commit':
treeline = b''.join(it).split(b'\n')[0]
assert treeline.startswith(b'tree ')
tree_oidx = treeline[5:]
path += [oidx, tree_oidx]
- yield from _join(tree_oidx, path)
+ yield from _join(*get_oidx(repo, tree_oidx), path)
else:
- if oidx is None:
- raise GitError(f'missing ref at {path!r}')
raise GitError(f'type {typ!r} is not blob/tree/commit at {path!r}')

- yield from _join(ref, [ref])
+ got = get_ref(repo, ref)
+ if not got[0]:
+ raise GitError(f'ref {ref} does not exist') # eventually some ENOENT?
+ yield from _join(*got, [ref])
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
The vfs currently doesn't allow it, so adjust copies.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 41 ++++++++++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
index 6951cbee..d2b49137 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/cmd/rewrite.py
@@ -43,12 +43,21 @@ def prep_mapping_table(db, split_cfg):
' without rowid')
return table_id

-def converted_already(dstrepo, item, vfs_dir, db, mapping):
- size = -1 # irrelevant
- mode = item.meta
+def previous_conversion(dstrepo, item, vfs_dir, db, mapping):
+ """Return (replacement_item, converted_oid, mode) for the given
+ item if any, *and* if the dstrepo has the item.oid. If not,
+ converted_oid and mode will be None. The replacement_item will
+ either be item, or an augmented copy of item, (e.g. with a proper
+ size) that should be used instead of item.
+
+ """
if isinstance(item.meta, metadata.Metadata):
size = item.meta.size
mode = item.meta.mode
+ else:
+ size = None
+ mode = item.meta
+
# if we know the size, and the oid exists already (small file w/o
# hashsplit) then simply return it can't do that if it's a
# directory, since it might exist but in the non-augmented
@@ -60,19 +69,21 @@ def converted_already(dstrepo, item, vfs_dir, db, mapping):
db.execute(f'select dst, mode, size from {mapping} where src = ?',
(item.oid,))
data = db.fetchone()
- # if it's not found, then we don't know anything
if not data:
- return None, None
+ return item, None, None
dst, mode, size = data
# augment the size if appropriate
if size is not None and isinstance(item.meta, metadata.Metadata):
- assert item.meta.size is None or item.meta.size == size
- item.meta.size = size
+ if item.meta.size is not None:
+ assert item.meta.size == size
+ else: # must not modify vfs results (see vfs docs)
+ item = vfs.copy_item(item)
+ item.meta.size = size
# if we have it in the DB and in the destination repo, return it
if dstrepo.exists(dst):
- return dst, mode
+ return item, dst, mode
# this only happens if you reuse a database
- return None, None
+ return item, None, None

def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
fullname=b''):
@@ -84,7 +95,8 @@ def vfs_walk_recursively(srcrepo, dstrepo, vfs_item, excludes, db, mapping,
if should_rx_exclude_path(check_name, excludes):
continue
if S_ISDIR(vfs.item_mode(item)):
- if converted_already(dstrepo, item, True, db, mapping)[0] is None:
+ item, oid, _ = previous_conversion(dstrepo, item, True, db, mapping)
+ if oid is None:
yield from vfs_walk_recursively(srcrepo, dstrepo, item,
excludes, db, mapping,
fullname=itemname)
@@ -113,7 +125,7 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
meta = None
stack.push(dir_name, meta)

- oid, mode = converted_already(dstrepo, item, not filen, wdbc, mapping)
+ item, oid, mode = previous_conversion(dstrepo, item, not filen, wdbc, mapping)

if not filen:
if len(stack) == 1:
@@ -127,8 +139,7 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,

vfs_mode = vfs.item_mode(item)

- # already converted - id is known, item.meta was updated if needed
- # (in converted_already()), and the proper new mode was returned
+ # already converted - oid and mode are known
if oid is not None:
assert mode is not None, oid
stack.append_to_current(filen, vfs_mode, mode, oid, item.meta)
@@ -148,6 +159,8 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
hashsplit.from_config([f], split_cfg))
if isinstance(item.meta, metadata.Metadata):
if item.meta.size is None:
+ # must not modify vfs results (see vfs docs)
+ item = vfs.copy_item(item)
item.meta.size = item_size
size_augmented = True
else:
@@ -159,6 +172,8 @@ def rewrite_item(item, commit_name, fullname, srcrepo, src, dstrepo, split_cfg,
mode, oid = (GIT_MODE_SYMLINK, dstrepo.write_symlink(target))
if isinstance(item.meta, metadata.Metadata):
if item.meta.size is None:
+ # must not modify vfs results (see vfs docs)
+ item = vfs.copy_item(item)
item.meta.size = len(item.meta.symlink_target)
size_augmented = True
else:
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/restore.py | 3 ++-
lib/bup/ls.py | 3 ++-
lib/bup/metadata.py | 7 -------
lib/bup/vfs.py | 11 ++++++-----
4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/lib/bup/cmd/restore.py b/lib/bup/cmd/restore.py
index 90ad66b0..e0ff629b 100644
--- a/lib/bup/cmd/restore.py
+++ b/lib/bup/cmd/restore.py
@@ -1,4 +1,5 @@

+from copy import deepcopy
from stat import S_ISDIR
import errno, os, re, stat, sys

@@ -70,7 +71,7 @@ def parse_owner_mappings(type, options, fatal):
return owner_map

def apply_metadata(meta, name, restore_numeric_ids, owner_map):
- m = meta.copy(frozen=False)
+ m = deepcopy(meta).thaw()
m.user = owner_map['user'].get(m.user, m.user)
m.group = owner_map['group'].get(m.group, m.group)
m.uid = owner_map['uid'].get(m.uid, m.uid)
diff --git a/lib/bup/ls.py b/lib/bup/ls.py
index ba950a3e..2ace8c83 100644
--- a/lib/bup/ls.py
+++ b/lib/bup/ls.py
@@ -1,6 +1,7 @@
"""Common code for listing files from a bup repository."""

from binascii import hexlify
+from copy import deepcopy
from itertools import chain
from stat import S_ISDIR
import os.path
@@ -42,7 +43,7 @@ def item_info(item, name,
else:
result.append(b'0000000000000000000000000000000000000000 ')
if long_fmt:
- meta = item.meta.copy(frozen=False)
+ meta = deepcopy(item.meta).thaw()
meta.path = name
# FIXME: need some way to track fake vs real meta items?
result.append(metadata.summary_bytes(meta.freeze(),
diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index 93d464d5..77e3615a 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -904,13 +904,6 @@ class Metadata:
ret.append(vint.encode_vuint(_rec_tag_end))
return b''.join(ret)

- def copy(self, frozen=None):
- if frozen is None:
- return copy.deepcopy(self)
- if frozen:
- return copy.deepcopy(self).freeze()
- return copy.deepcopy(self).thaw()
-
@staticmethod
def read(port, empty=_use_empty_metadata):
"""Read an encoded Metadata instance from port, returning None on EOF.
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 902da582..fa3d375f 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -13,7 +13,7 @@ Each path is represented by an item that has least an item.meta which
may be either a Metadata object, or an integer mode. Functions like
item_mode() and item_size() will return the mode and size in either
case. Metadata instances must not be modified directly. Make a copy
-to modify via item.meta.copy() if needed, or call copy_item().
+to modify via deepcopy() if needed, or call copy_item().

The want_meta argument is advisory for calls that accept it, and it
may not be honored. Callers must be able to handle an item.meta value
@@ -70,6 +70,7 @@ item.coid.

from binascii import hexlify, unhexlify
from collections import namedtuple
+from copy import deepcopy
from errno import EINVAL, ELOOP, ENOTDIR
from itertools import tee
from random import randrange
@@ -408,7 +409,7 @@ def copy_item(item):
"""
meta = getattr(item, 'meta', None)
if isinstance(meta, Metadata):
- return(item._replace(meta=meta.copy()))
+ return(item._replace(meta=deepcopy(meta)))
return item

def item_mode(item):
@@ -1029,7 +1030,7 @@ def contents(repo, item, names=None, want_meta=True, repair=False):
repair is false, exceptions will be raised instead.

Do not modify any item.meta Metadata instances directly. If
- needed, make a copy via item.meta.copy() and modify that instead.
+ needed, make a copy via deepcopy() and modify that instead.

"""
# Q: are we comfortable promising '.' first when no names?
@@ -1226,7 +1227,7 @@ def resolve(repo, path, parent=None, want_meta=True, follow=True):
or the more limited metadata is sufficient.

Do not modify any item.meta Metadata instances directly. If
- needed, make a copy via item.meta.copy() and modify that instead.
+ needed, make a copy via deepcopy() and modify that instead.

"""
if repo.is_remote():
@@ -1279,7 +1280,7 @@ def augment_item_meta(repo, item, *, include_size=False, public=False):
m = item.meta
if isinstance(m, Metadata):
if include_size and m.size is None:
- m = m.copy(frozen=False)
+ m = deepcopy(m).thaw()
m.size = maybe_public(m.mode, _compute_item_size(repo, item))
return item._replace(meta=m.freeze())
return item
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
...since they're both created locally.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/tree.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index aedee080..ac4dcca6 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -140,9 +140,9 @@ class Stack:
for entry in items if entry.mode != GIT_MODE_TREE)
if add_meta and meta_ok:
metalist = [(b'', _empty_metadata if dir_meta is None else dir_meta)]
- metalist += [(shalist_item_sort_key((entry.mode, entry.name, None)),
- entry.meta)
- for entry in items if entry.mode != GIT_MODE_TREE]
+ metalist.extend((shalist_item_sort_key((entry.mode, entry.name, None)),
+ entry.meta)
+ for entry in items if entry.mode != GIT_MODE_TREE)
metalist.sort(key = lambda x: x[0])
metadata = BytesIO(b''.join(m[1].encode() for m in metalist))
splitter = hashsplit.from_config([metadata], self._split_config)
@@ -150,8 +150,8 @@ class Stack:
self._repo.write_tree,
splitter)
shalist.append((mode, b'.bupm', oid))
- shalist += [(entry.gitmode, entry.mangled_name(), entry.oid)
- for entry in items]
+ shalist.extend((entry.gitmode, entry.mangled_name(), entry.oid)
+ for entry in items)
return self._repo.write_tree(shalist)

def _write_split_tree(self, dir_meta, items, level=0):
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Having --[no-]rewrite and --[no-]repair boolean states creates any
number of (uninteresting) cases to consider when handling multiple get
requests in a single invocation, when in fact, there are really only
three related get cases (modes): rewrites, repairs, and direct
transfers (what get always did before the introduction of
rewrite/repair). So just add a new --copy argument for the latter,
simplify the relevant logic, update the docs, and adjust the tests.

As part of that, move the contextual argument checks to the spec
creation which hopefully makes it easier to follow, though it does
mean that previously invalid command lines are now fine, i.e. we allow
temporarily invalid "states" (reading the args from left to right)
like "... --repair --ignore-missing --no-ignore-missing --append
...". Now all that matters is the state of the contextual args when a
method argument (--append, --pick, ...) is reached. Document
contextual arguments in bup-get(1).

Extend test-get-rewrite-missing to cover more of this.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 77 +++++++++++------
lib/bup/cmd/get.py | 87 ++++++++++----------
test/ext/test-get-rewrite-missing | 132 +++++++++++++++++++++++++++---
test/ext/test_get.py | 2 +-
4 files changed, 218 insertions(+), 80 deletions(-)

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index 9d52f75a..26856f9a 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -12,12 +12,15 @@ bup get \[-s *source-path*\] \[-r *host*:*path*\] OPTIONS \<(METHOD *ref* [*des

# DESCRIPTION

-`bup get` copies the indicated *ref*s from the source repository to
+`bup get` transfers the indicated *ref*s from the source repository to
the destination repository (respecting `--bup-dir` and `BUP_DIR`),
according to the specified METHOD, which may be one of `--ff`,
`--ff:`, `--append`, `--append:`, `--pick`, `--pick:`, `--force-pick`,
`--force-pick:`, `--new-tag`, `--new-tag:`, `--replace`, `--replace:`,
-or `--unnamed`. See the EXAMPLES below for a quick introduction.
+or `--unnamed`. By default it will `--copy` the data without
+alteration, but it can also `--rewrite` it, potentially changing the
+deduplication granularity, and `--repair` some kinds of damage. See
+the EXAMPLES below for a quick introduction.

The *ref* is the source repository reference of the object to be
fetched, and the *dest* is the optional destination reference. A
@@ -106,7 +109,8 @@ used to help test before/after results.)
\--unnamed *ref*
: copy *ref* into the destination repository, without any name,
leaving a potentially dangling reference until/unless the object
- named by *ref* is referred to some other way (cf. `bup tag`).
+ named by *ref* is referred to some other way (cf. `bup
+ tag`). Currently only compatible with `--copy`.

# OPTIONS

@@ -131,17 +135,35 @@ used to help test before/after results.)
\--print-tags
: for each updated tag, print the new git id.

-\--rewrite, \--no-rewrite
+\--copy
+: copy the data without changes (i.e. without rewrites or
+ repairs). This is the default.
+
+\--rewrite
: rewrite the data according to the destination repository
configuration, e.g. its `bup.split.files`, and `bup.split.trees`
- values. Currently, one of these options must be specified whenever
- the source and destination repository configurations differ in a
- relevant way, and so far, this option is only supported for
- appends and picks. Note that rewriting a git-created save may
- (currently will) introduce bup-related changes. Further, while
- tested, `--rewrite` is relatively new and so warrants even more
- caution (see CAUTION above) than `bup get` itself. Please consider
- validating the results carefully for now.
+ values. Currently, `--rewrite`, `--repair`, or `--copy` must be
+ specified whenever the source and destination repository
+ configurations differ in a relevant way, and so far, `--rewrite`
+ is only supported for appends and picks. This option is also
+ contextual (see CONTEXTUAL OPTIONS). Note that rewriting a
+ git-created save may, and for now will, introduce bup-related
+ changes. Further, while tested, `--rewrite` is relatively new and
+ so warrants even more caution (see CAUTION above) than `bup get`
+ itself. Please consider validating the results carefully for now.
+
+\--repair
+: in addition to what `--rewrite` does, perform all known repairs
+ during the transfer. See REPAIRS below. This option is contextual
+ (see CONTEXTUAL OPTIONS).
+
+\--repair-id ID
+: set the repair session identifier, defaults to a UUID (v4). This
+ identifier will be included in any `--repair`s made during the
+ transfer. Currently, the identifier must be ASCII and must not
+ include control characters or DEL (i.e. must be comprised of bytes
+ \>= 32 and < 127). This option is contextual (see CONTEXTUAL
+ OPTIONS).

\--exclude-rx=*pattern*
: exclude any path matching *pattern*, which must be a Python regular
@@ -158,15 +180,18 @@ used to help test before/after results.)
* '/foo/.' - exclude the content of any directory named foo
* '^/tmp/.' - exclude root-level /tmp's content, but not /tmp itself

- Only supported when rewriting.
+ Only supported when rewriting or repairing. This option is
+ contextual (see CONTEXTUAL OPTIONS).

\--exclude-rx-from=*filename*
: read --exclude-rx patterns from *filename*, one pattern per-line
(may be repeated). Ignore completely empty lines. Only supported
- when rewriting.
+ when rewriting or repairing. This option is contextual (see CONTEXTUAL
+ OPTIONS).

\--no-excludes
-: forget any previous `--exclude-rx` or `--exclude-rx-from` options.
+: forget any previous `--exclude-rx` or `--exclude-rx-from`
+ options. This option is contextual (see CONTEXTUAL OPTIONS).

-v, \--verbose
: increase verbosity (can be used more than once). With
@@ -185,15 +210,21 @@ used to help test before/after results.)
pack.compression or core.compression, or 1 (fast, loose
compression).

-\--repair-id ID
-: set the repair session identifier, defaults to a UUID (v4). This
- identifier will be included in any `--repair`s made during the
- transfer. Currently, the identifier must be ASCII and must not
- include control characters or DEL (i.e. must be comprised of bytes
- >= 20 and < 127).
+# CONTEXTUAL OPTIONS

-\--repair
-: perform all known repairs during the transfer. See REPAIRS below.
+Some options like `--repair` and `--ignore-missing` can differ across
+METHODs, and each option changes the context for the next METHOD. So
+you can have
+
+ bup get ... --ignore-missing --unnamed REF \
+ --no-ignore-missing --rewrite --append REF
+
+Without `--no-ignore-missing` this command would fail because
+`--ignore-missing` is incompatible with `--rewrite`.
+
+Changing the currently active excludes is expensive because at the
+moment the cache of remembered rewrites must be cleared whenever a
+METHOD's excludes differ from those for the previous METHOD.

# REPAIRS

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 93b171c5..2fa593d3 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -43,7 +43,7 @@ argspec = (
destination may be specified with -r, and data may be pulled from
a remote repository with the related "bup on HOST get ..."
command. The --exclude-rx and --exclude-rx-from options currently
- only apply to rewrites. Currently only --unnammed supports
+ only apply to rewrites and repairs. Currently only --unnammed supports
"--ignore-missing".""",

('optional arguments:',
@@ -58,13 +58,14 @@ argspec = (
('-t --print-trees', 'output a tree id for each ref set'),
('-c, --print-commits', 'output a commit id for each ref set'),
('--print-tags', 'output an id for each tag'),
- ('--[no-]rewrite', 'rewrite data according to destination repo settings'),
+ ('--copy', 'copy the data (no rewriting or repairs; the default)'),
+ ('--rewrite', 'rewrite data according to destination repo settings'),
+ ('--repair', 'repair everything possible'),
+ ('--repair-id ID', 'repair session identifier (default: UUID v4)'),
('--exclude-rx REGEX', 'skip paths matching the unanchored regex (may be repeated)'),
('--exclude-rx-from PATH', 'skip --exclude-rx patterns in PATH (may be repeated)'),
('--no-excludes', 'forget any preceeding exclude options'),
('--bwlimit BWLIMIT', 'maximum bytes/sec to transmit to server'),
- ('--[no-]repair', 'repair everything possible'),
- ('--repair-id ID', 'repair session identifier (default: UUID v4)'),
('--[no-]ignore-missing', 'ignore missing objects (*dangerous*)'),
('-0, -1, -2, -3, -4, -5, -6, -7, -8, -9, --compress LEVEL',
'set compression LEVEL (default: 1)'))),
@@ -167,26 +168,39 @@ def parse_args(args):
# Rewriter once we have it.
exclude_opts = []
ignore_missing = False
- # rewrite and repair track each arg's "state" and repair implies rewrite
- rewrite = None # None means "didn't specify", False means "said no"
- repair = None # None means "didn't specify", False means "said no"
+ # mode: copy rewrite or repair (None implies copy for compatible configs)
+ mode = None
repair_id = None
def make_spec(method, src, dest):
+ assert mode in (None, 'copy', 'rewrite', 'repair'), mode
nonlocal repair_id
- assert not (ignore_missing and repair), (ignore_missing, repair)
- assert not (ignore_missing and rewrite), (ignore_missing, rewrite)
+ if ignore_missing:
+ if mode == 'repair':
+ misuse('--ignore-missing and --repair are incompatible')
+ elif mode == 'rewrite':
+ misuse('--ignore-missing and --rewrite are incompatible')
+ if method != 'unnamed':
+ misuse('currently only --unnamed allows --ignore-missing')
excludes = parse_rx_excludes(exclude_opts, misuse)
- if excludes and not (rewrite or repair):
+ if excludes and not mode in ('rewrite', 'repair'):
misuse('--exclude-rx or --exclude-rx-from requires --rewrite or --repair')
+ # Set rw to None if no opinion has been expressed so we can
+ # require you to state what you want when src/dest repository
+ # configs differ (see target checks in get_everything).
rc = None
- if rewrite or repair:
+ if mode in ('rewrite', 'repair'):
+ if method not in ('append', 'pick', 'force-pick'):
+ misuse(f'--{method} cannot {mode} (only picks and appends)')
if repair_id is None:
repair_id = str(uuid4()).encode('ascii')
- rc = Repairs(repair_id, repair, get_argvb())
- if rewrite: rw = True
- elif repair: rw = True
- elif rewrite in (False, True): rw = rewrite
- else: rw = repair
+ rc = Repairs(repair_id, mode == 'repair', get_argvb())
+ rw = True
+ elif mode == 'copy': # explicitly specified no rewrite/repair
+ rw = False
+ elif mode is None:
+ rw = None
+ else:
+ raise Exception(f'invalid get target mode {mode!r}')
return Spec(method=method, src=src, dest=dest, excludes=excludes,
rewriter=rw, ignore_missing=ignore_missing, repairs=rc)

@@ -200,17 +214,13 @@ def parse_args(args):
elif arg in (b'-v', b'--verbose'):
opt.verbose += 1
remaining = remaining[1:]
- elif arg == b'--repair':
- if ignore_missing:
- misuse('--ignore-missing and --repair are incompatible')
+ elif arg == b'--copy':
pending_method_context[arg] = True
- repair, remaining = True, remaining[1:]
- elif arg == b'--no-repair':
+ mode, remaining = 'copy', remaining[1:]
+ elif arg == b'--repair':
pending_method_context[arg] = True
- repair, remaining = False, remaining[1:]
+ mode, remaining = 'repair', remaining[1:]
elif arg == b'--ignore-missing':
- if repair:
- misuse('--ignore-missing and --repair are incompatible')
pending_method_context[arg] = True
ignore_missing, remaining = True, remaining[1:]
elif arg == b'--no-ignore-missing':
@@ -226,16 +236,12 @@ def parse_args(args):
repair_id = val
elif arg in (b'--ff', b'--append', b'--pick', b'--force-pick',
b'--new-tag', b'--replace', b'--unnamed'):
- if ignore_missing and arg != b'--unnamed':
- misuse('currently only --unnamed allows --ignore-missing')
(ref,), remaining = require_n_args_or_die(1, remaining)
opt.target_specs.append(make_spec(method=arg[2:].decode('ascii'),
src=ref, dest=None))
pending_method_context = {}
elif arg in (b'--ff:', b'--append:', b'--pick:', b'--force-pick:',
b'--new-tag:', b'--replace:'):
- if ignore_missing and arg != b'--unnamed':
- misuse('currently only --unnamed allows --ignore-missing')
(ref, dest), remaining = require_n_args_or_die(2, remaining)
opt.target_specs.append(make_spec(method=arg[2:-1].decode('ascii'),
src=ref, dest=dest))
@@ -251,13 +257,8 @@ def parse_args(args):
elif arg == b'--print-tags':
opt.print_tags, remaining = True, remaining[1:]
elif arg == b'--rewrite':
- if ignore_missing:
- misuse('--ignore-missing and --rewrite are incompatible')
- pending_method_context[arg] = True
- rewrite, remaining = True, remaining[1:]
- elif arg == b'--no-rewrite':
pending_method_context[arg] = True
- rewrite, remaining = False, remaining[1:]
+ mode, remaining = 'rewrite', remaining[1:]
elif arg in (b'--exclude-rx', b'--exclude-rx-from'): # handled later
pending_method_context[arg] = True
(val,), remaining = require_n_args_or_die(1, remaining)
@@ -560,9 +561,8 @@ def resolve_branch_dest(spec, src, src_repo, dest_repo):


def resolve_ff(spec, src_repo, dest_repo):
- if spec.rewriter:
- misuse(f'--{spec.method} cannot rewrite (use --pick or --append)')
- assert not spec.ignore_missing
+ assert not spec.rewriter, spec
+ assert not spec.ignore_missing, spec
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if src.type == 'tree':
@@ -691,9 +691,8 @@ def handle_pick(item, src_repo, dest_repo):


def resolve_new_tag(spec, src_repo, dest_repo):
- assert not spec.ignore_missing
- if spec.rewriter:
- misuse(f'--{spec.method} cannot currently rewrite')
+ assert not spec.rewriter, spec
+ assert not spec.ignore_missing, spec
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if not spec.dest and src.path.startswith(b'/.tag/'):
@@ -720,9 +719,8 @@ def handle_new_tag(item, src_repo, dest_repo):


def resolve_replace(spec, src_repo, dest_repo):
- assert not spec.ignore_missing
- if spec.rewriter:
- misuse(f'--{spec.method} cannot currently rewrite')
+ assert not spec.rewriter, spec
+ assert not spec.ignore_missing, spec
src = resolve_src(spec, src_repo)
spec_args = spec_msg(spec)
if not spec.dest:
@@ -759,8 +757,7 @@ def handle_replace(item, src_repo, dest_repo):


def resolve_unnamed(spec, src_repo, dest_repo):
- if spec.rewriter:
- misuse(f'--{spec.method} cannot currently rewrite')
+ assert not spec.rewriter, spec
if spec.dest:
misuse('destination name given for %s' % spec_msg(spec))
src = resolve_src(spec, src_repo, allow='git')
diff --git a/test/ext/test-get-rewrite-missing b/test/ext/test-get-rewrite-missing
index 9d41edb5..066d70b8 100755
--- a/test/ext/test-get-rewrite-missing
+++ b/test/ext/test-get-rewrite-missing
@@ -120,18 +120,20 @@ WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < dir-oid
WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < file-oid
WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < partial-file-hole

-WVPASS rm -rf dest-repo bup-tmp
+
+WVSTART 'basic rejection of invalid method contexts'
WVPASS cp -pPR bup bup-tmp
-WVPASS bup -d dest-repo init
-WVPASS git --git-dir dest-repo config bup.split.trees true
-WVSTART 'rejection of --ignore-missing (only supported by --unnamed)'
-WVFAIL bup -d dest-repo get -s bup --ignore-missing --rewrite --append src
-WVSTART '--no-ignore-missing'
-bup -d dest-repo get -s bup --no-ignore-missing --rewrite --append src
-rc=$?
-WVPASSEQ 2 "$rc"
-WVPASS rm -rf bup
-WVPASS cp -pPR bup-tmp bup
+
+WVFAIL bup get --no-ignore-missing --rewrite --append src
+
+WVFAIL bup get --ignore-missing --rewrite --append src 2> >(tee get.log)
+WVPASS grep -E '^error: --ignore-missing and --rewrite are incompatible' get.log
+
+WVFAIL bup get --ignore-missing --repair --append src 2> >(tee get.log)
+WVPASS grep -E '^error: --ignore-missing and --repair are incompatible' get.log
+
+WVFAIL bup get --ignore-missing --append src 2> >(tee get.log)
+WVPASS grep -E '^error: currently only --unnamed allows --ignore-missing' get.log

repair-to-dest()
{
@@ -280,5 +282,113 @@ WVPASS grep -E "^Replaced: tree $(< split-tree-oid)" split-tree-replacement
WVPASS grep -E "^Missing: $(< split-tree-bupm-oid)" split-tree-replacement


+# Simple repo shared by following tests
+WVPASS rm -rf bup
+WVPASS bup init
+WVPASS mkdir -p src/a
+WVPASS echo 1 > src/a/1
+WVPASS echo 2 > src/a/2
+WVPASS echo 3 > src/a/3
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+WVPASS readarray -t saves < <(bup ls src)
+save_date="${saves[0]}"
+src_oid="$(git rev-parse src)"
+unset saves
+two_oid="$(WVPASS git ls-tree src:a | grep -E '2$' | btl-ent-oid)"
+echo "$two_oid" | WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" || exit $?
+WVPASS mv bup bup-123
+
+
+WVSTART '--rewrite (not --repair) on broken save after --repair still fails'
+WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+# Check that non-rewrite doesn't notice the missing object (b/c same repo)
+WVEXPRC 3 bup get --repair --append: src r1 --copy --append: src r2
+# Then that a second *rewrite* can't handle the missing file
+WVEXPRC 2 eval 'bup get' \
+ '--repair --append: src r1' \
+ '--rewrite --append: src r2' \
+ '2>&1 | tee get.log'
+WVPASS grep -E 'raise MissingObject' get.log
+
+
+WVSTART '--ignore-missing on broken save after --repair'
+WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVPASS bup init bup-dest
+WVEXPRC 2 eval 'bup -d bup-dest get -s bup' \
+ '--repair --append: src r1' \
+ '--copy --unnamed src' \
+ '2>&1 | tee get.log'
+WVPASS grep -E 'raise MissingObject' get.log
+WVEXPRC 2 eval 'bup -d bup-dest get -s bup' \
+ '--repair --append: src r1' \
+ '--copy --ignore-missing --unnamed src' \
+ ' 2>&1 | tee get.log'
+WVPASS grep -E '^skipping missing source object' get.log
+
+
+WVSTART '--copy --rewrite --repair'
+WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVEXPRC 3 eval 'bup get --rewrite --repair --repair-id r1 --append: src r1' \
+ '2>&1 | tee get.log'
+WVPASSEQ 1 "$(git show r1:a/2 | grep -cE '^Bup-Replacement-Info: r1$')"
+WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVEXPRC 2 eval 'bup get --repair --rewrite --append: src r1 2>&1 | tee get.log'
+WVPASS grep -E 'raise MissingObject' get.log
+WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVPASS rm -rf bup-dest && bup init bup-dest
+WVEXPRC 2 eval 'bup -d bup-dest get -s bup --repair --copy --append: src r1' \
+ '2>&1 | tee get.log'
+WVPASS grep -E 'raise MissingObject' get.log
+WVPASS rm -rf bup bup-dest
+
+
+WVSTART 'multiple gets with differing repair-ids'
+WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVEXPRC 3 eval 'bup get --repair' \
+ '--repair-id repair-1 --append: src r1' \
+ '--repair-id repair-2 --append: src r2' \
+ '2>&1 | tee get.log'
+# Check commit messages
+bup_ver="$(bup version)"
+WVPASS git show -s --pretty=email r1 | tee repair-msg
+WVPASS git interpret-trailers --parse < repair-msg | tee repair-trailers
+readarray -t trailers < repair-trailers
+wv-match-rx "${trailers[0]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[1]}" '^Bup-Argv: [^ ]+/bup.* save '
+wv-match-rx "${trailers[2]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[3]}" '^Bup-Argv: [^ ]+/bup.* get --repair '
+wv-match-rx "${trailers[4]}" "^Bup-Repair-ID: repair-1$"
+wv-match-rx "${trailers[5]}" "^Bup-Repaired-Save: $src_oid src/$save_date$"
+wv-match-rx "${trailers[6]}" "^Bup-Replaced: [0-9a-f]{40} a/2$"
+WVPASSEQ '' "${trailers[7]}" # end-of-line
+unset trailers
+WVPASS git show -s --pretty=email r2 | tee repair-msg
+WVPASS git interpret-trailers --parse < repair-msg | tee repair-trailers
+readarray -t trailers < repair-trailers
+wv-match-rx "${trailers[0]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[1]}" '^Bup-Argv: [^ ]+/bup.* save '
+wv-match-rx "${trailers[2]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[3]}" '^Bup-Argv: [^ ]+/bup.* get --repair'
+wv-match-rx "${trailers[4]}" "^Bup-Repair-ID: repair-2$"
+wv-match-rx "${trailers[5]}" "^Bup-Repaired-Save: $src_oid src/$save_date$"
+wv-match-rx "${trailers[6]}" "^Bup-Replaced: [0-9a-f]{40} a/2$"
+WVPASSEQ '' "${trailers[7]}" # end-of-line
+unset trailers
+# Check repair ids in the log
+WVPASSEQ 1 "$(WVPASS grep -cF "repairs needed, repair-id: repair-1" get.log)"
+id1_line="$(WVPASS grep -nF "repairs needed, repair-id: repair-1" get.log)"
+id1_line="${id1_line%%:*}"
+WVPASSEQ 1 "$(WVPASS grep -cF "repairs needed, repair-id: repair-2" get.log)"
+id2_line="$(WVPASS grep -nF "repairs needed, repair-id: repair-2" get.log)"
+id2_line="${id2_line%%:*}"
+WVPASS test "$id1_line" -lt "$id2_line"
+# Check blob content (other tests check content in more detail)
+WVPASS git show r1:a/2 | tee replacement-1 || exit $?
+WVPASS git show r2:a/2 | tee replacement-2 || exit $?
+WVPASSEQ 1 "$(WVPASS grep -cE '^Bup-Replacement-Info: repair-1$' replacement-1)"
+WVPASSEQ 1 "$(WVPASS grep -cE '^Bup-Replacement-Info: repair-2$' replacement-2)"
+
+
WVPASS cd "$top"
WVPASS rm -rf "$tmpdir"
diff --git a/test/ext/test_get.py b/test/ext/test_get.py
index e5a60fd8..99c30b2b 100644
--- a/test/ext/test_get.py
+++ b/test/ext/test_get.py
@@ -265,7 +265,7 @@ def _run_get(disposition, method, what, rewrite=None):
else:
raise Exception('error: unexpected get disposition ' + repr(disposition))

- cmd = (*get_cmd, b'--rewrite' if rewrite else b'--no-rewrite')
+ cmd = (*get_cmd, b'--rewrite' if rewrite else b'--copy')
if isinstance(what, bytes):
cmd += (method, what)
else:
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Promise/document what was already true --- when the vfs returns an
item whose metadata is either an integer or None, bup-created
metadata (via bupm) was unavailable, perhaps because the tree was
created by git, because the metadata was not saved at all (via
pre-bupm bup, or bup versions that would store empty metadata when
save couldn't read it), or (soon) because the metadata was lost
somehow.

Adjust the tree.Stack to detect the case where there is *no*
bup-created metadata and drop the bupm entirely. This allows us to
rewrite a git created tree without introducing (synthesized) .bupm
files.

These changes also allow rewrite to handle saves without .bupm files
at all (previously it would crash).

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 10 +++---
lib/bup/metadata.py | 2 +-
lib/bup/rewrite.py | 10 ++++--
lib/bup/tree.py | 71 +++++++++++++++++++++++++++-----------
lib/bup/vfs.py | 28 +++++++++------
test/ext/test-rewrite | 18 ++++++++++
6 files changed, 100 insertions(+), 39 deletions(-)

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index 91edc1ec..f4127769 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -137,10 +137,12 @@ used to help test before/after results.)
values. Currently, one of these options must be specified whenever
the source and destination repository configurations differ in a
relevant way, and so far, this option is only supported for
- appends and picks. Note that while tested, this option is
- relatively new and so warrants even more caution (see CAUTION
- above) than `bup get` itself. Please consider validating the
- results carefully for now.
+ appends and picks. Note that repairs (see REPLACEMENTS below)
+ require `--rewrite`, and rewriting a git-created save may
+ (currently will) introduce bup-related changes. Further, while
+ tested, `--rewrite` is relatively new and so warrants even more
+ caution (see CAUTION above) than `bup get` itself. Please consider
+ validating the results carefully for now.

\--exclude-rx=*pattern*
: exclude any path matching *pattern*, which must be a Python regular
diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index 54a0cf44..f7273593 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -229,7 +229,7 @@ class Metadata:
# Timestamps are (sec, ns), relative to 1970-01-01 00:00:00, ns
# must be non-negative and < 10**9.

- # Consider bup.rewrite (e.g. _blob_replacement()) when making
+ # Consider bup.rewrite (_blob_replacement() ...) when making
# changes to the records (particularly the common records).

def _add_common(self, path, st):
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 3fbb6773..e236a2f0 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -187,15 +187,18 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
path_w_meta = None
for entry in entries:
name, sub_item = entry
+ # For git-created commits or older bup repos, the metadata
+ # will be an integer, so create synthetic Metadata.
+ meta = entry[1].meta
if name == b'.':
# contents() promises this
assert path_w_meta is None, 'two "." dir entries encountered?!'
# Create version of path with its real metadata, not the
# contents() placeholder mode for dirs.
- assert isinstance(entry[1].meta, Metadata), entry
dir_name, dir_item = path[-1]
+ assert isinstance(meta, (Metadata, int)), (entry, meta)
path_w_meta = path[:-1] \
- + ((dir_name, dir_item._replace(meta=entry[1].meta)),)
+ + ((dir_name, dir_item._replace(meta=meta)),)
continue
sub_fs_path_in_save = joinp(fs_path_in_save, name)
if S_ISDIR(vfs.item_mode(sub_item)):
@@ -203,6 +206,7 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
if should_rx_exclude_path(sub_fs_path_in_save, excludes):
continue
assert path_w_meta is not None, '"." not before children in dir'
+ assert isinstance(entry[1].meta, (Metadata, int)), entry
sub_path = path_w_meta + (entry,)
if not S_ISDIR(vfs.item_mode(sub_item)):
yield sub_path, None
@@ -221,7 +225,7 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
missing,
_replacement_parents=sub_rpath)
assert path_w_meta is not None, f'{path_msg(fs_path_in_save)} has no "."'
- assert isinstance(path_w_meta[-1][1].meta, Metadata), path_w_meta
+ assert isinstance(path_w_meta[-1][1].meta, (Metadata, int)), path_w_meta
yield path_w_meta, None

def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing):
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index a0c0c728..99da13fa 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -1,5 +1,6 @@

from io import BytesIO
+from stat import S_ISDIR

from bup import hashsplit
from bup.hashsplit import \
@@ -23,8 +24,7 @@ class TreeItem:
assert isinstance(mode, int), mode
assert isinstance(gitmode, int), gitmode
assert isinstance(oid, bytes), oid
- if meta is not None:
- assert isinstance(meta, Metadata), meta
+ assert isinstance(meta, (Metadata, int, type(None))), meta
self.name = name
self.mode = mode
self.gitmode = gitmode
@@ -118,6 +118,36 @@ class StackDir:
f' items={[(x.name, x.oid.hex()) for x in self.items]!r}>'


+def _dir_metadata(dir_meta, items):
+ # If all the metadata bound for the bupm are int or None, drop the
+ # bupm to either match the original (say git created) tree or
+ # (not yet implemented) to repair.
+ any_real_meta = False
+ if isinstance(dir_meta, Metadata):
+ any_real_meta = True
+ meta_ents = [(b'', dir_meta)]
+ elif isinstance(dir_meta, (int, type(None))):
+ meta_ents = [(b'', _empty_metadata)]
+ else:
+ raise Exception(f'Unexpected "." metadata type {dir_meta!r}')
+ for entry in items:
+ if S_ISDIR(entry.mode):
+ continue
+ if isinstance(entry.meta, (int, type(None))):
+ ml = (shalist_item_sort_key((entry.mode, entry.name, None)),
+ _empty_metadata)
+ elif isinstance(entry.meta, Metadata):
+ any_real_meta = True
+ ml = (shalist_item_sort_key((entry.mode, entry.name, None)),
+ entry.meta)
+ else:
+ raise Exception(f'Unexpected metadata type in {entry!r}')
+ meta_ents.append(ml)
+ if any_real_meta:
+ return meta_ents
+ return None
+
+
class Stack:
def __init__(self, repo, split_config):
self._stack = []
@@ -137,7 +167,7 @@ class Stack:
return [p.name for p in self._stack]

def push(self, name, meta):
- assert isinstance(meta, (Metadata, type(None))), meta
+ assert isinstance(meta, (Metadata, int, type(None))), meta
self._stack.append(StackDir(name, meta))

def _clean(self, tree):
@@ -154,23 +184,24 @@ class Stack:
return items

def _write_tree(self, dir_meta, items, add_meta=True):
- shalist = []
- # This might be False if doing a 'bup rewrite' where the original is
- # from an old repo without metadata, or created by 'bup split'.
- meta_ok = all(isinstance(entry.meta, Metadata)
- for entry in items if entry.mode != GIT_MODE_TREE)
- if add_meta and meta_ok:
- metalist = [(b'', _empty_metadata if dir_meta is None else dir_meta)]
- metalist.extend((shalist_item_sort_key((entry.mode, entry.name, None)),
- entry.meta)
- for entry in items if entry.mode != GIT_MODE_TREE)
- metalist.sort(key = lambda x: x[0])
- metadata = BytesIO(b''.join(m[1].encode() for m in metalist))
- splitter = hashsplit.from_config([metadata], self._split_config)
- mode, oid = split_to_blob_or_tree(self._repo.write_bupm,
- self._repo.write_tree,
- splitter)
- shalist.append((mode, b'.bupm', oid))
+ if not add_meta:
+ return self._repo.write_tree([(entry.gitmode, entry.mangled_name(),
+ entry.oid)
+ for entry in items])
+
+ metalist = _dir_metadata(dir_meta, items)
+ if not metalist:
+ return self._repo.write_tree([(entry.gitmode, entry.mangled_name(),
+ entry.oid)
+ for entry in items])
+
+ metalist.sort(key = lambda x: x[0])
+ metadata = BytesIO(b''.join(m[1].encode() for m in metalist))
+ splitter = hashsplit.from_config([metadata], self._split_config)
+ mode, oid = split_to_blob_or_tree(self._repo.write_bupm,
+ self._repo.write_tree,
+ splitter)
+ shalist = [(mode, b'.bupm', oid)]
shalist.extend((entry.gitmode, entry.mangled_name(), entry.oid)
for entry in items)
return self._repo.write_tree(shalist)
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 3507dcd0..6a38529c 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -12,14 +12,26 @@ The VFS is structured like this:
Each path is represented by an item that has least an item.meta which
may be either a Metadata object, or an integer mode. Functions like
item_mode() and item_size() will return the mode and size in either
-case. Any item.meta Metadata instances must not be modified directly.
-Make a copy to modify via item.meta.copy() if needed, or call
-copy_item().
+case. Metadata instances must not be modified directly. Make a copy
+to modify via item.meta.copy() if needed, or call copy_item().

The want_meta argument is advisory for calls that accept it, and it
may not be honored. Callers must be able to handle an item.meta value
-that is either an instance of Metadata or an integer mode, perhaps
-via item_mode() or augment_item_meta().
+that is either an instance of Metadata or an integer mode, perhaps via
+item_mode() or augment_item_meta().
+
+An integer item.meta means that either no bup-recorded metadata was
+available, or the item was a subdirectory returned by a function like
+contents(), which doesn't retrieve the metadata for
+subdirectories. That's because the actual metadata for a directory is
+stored inside the directory (see fill_in_metadata_if_dir() or
+ensure_item_has_metadata()).
+
+Bup-recorded metadata may be unavailable for a number of reasons. For
+example, "synthetic" paths like the VFS root or /.tag/ don't have it,
+trees created by git or early versions of bup won't have it, and some
+versions of bup omitted it when the metadata was unreadable at save
+time.

Setting want_meta=False is rarely desirable since it can limit the VFS
to only the metadata that git itself can represent, and so for
@@ -32,12 +44,6 @@ Any given metadata object's size may be None, in which case the size
can be computed via item_size() or augment_item_meta(...,
include_size=True).

-When traversing a directory using functions like contents(), the meta
-value for any directories other than '.' will be a default directory
-mode, not a Metadata object. This is because the actual metadata for
-a directory is stored inside the directory (see
-fill_in_metadata_if_dir() or ensure_item_has_metadata()).
-
Commit items represent commits (e.g. /.tag/some-commit or
/foo/latest), and for most purposes, they appear as the underlying
tree. S_ISDIR(item_mode(item)) will return true for both tree Items
diff --git a/test/ext/test-rewrite b/test/ext/test-rewrite
index 17e9ccfb..ba6c4acc 100755
--- a/test/ext/test-rewrite
+++ b/test/ext/test-rewrite
@@ -144,6 +144,24 @@ GIT_DIR="$BUP_DIR" WVPASS git ls-tree -r save-new^ > "$tmpdir/n"
# FIXME: analyse the diff properly
diff -u "$tmpdir/o" "$tmpdir/n"

+WVSTART "rewrite trees without .bupm"
+WVPASS rm -rf src repo
+WVPASS mkdir src
+WVPASS date > src/x
+WVPASS date > src/y
+WVPASS chmod +x src/x
+WVPASS bup init repo
+WVPASS git --git-dir repo --work-tree "$(pwd)" add src
+WVPASS git --git-dir repo --work-tree "$(pwd)" commit -am save
+WVPASS git --git-dir repo config bup.split.trees true
+WVPASS git --git-dir repo config bup.split.files legacy:16
+WVPASS bup -d repo get --rewrite -s repo --append: main main-rewrite
+WVPASSEQ "$(bup -d repo ls -ln main-rewrite/latest/src | tr -s ' ' ' ')" \
+'-rw-r--r-- ?/? 29 ????-??-?? ??:?? x
+-rw-r--r-- ?/? 29 ????-??-?? ??:?? y'
+WVPASS git --git-dir repo ls-tree main-rewrite:src | WVPASS grep -vE '\.bupm$'
+WVPASS rm -rf repo src
+

WVPASS cd "$top"
WVPASS rm -rf "$tmpdir"
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Move the implementation of validate-ref-links to a new validate-refs
command that does the same thing for either "bup validate-refs ..." or
"bup validate-refs --links ...". This prepares for the addition of
other validations to the traversal.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-validate-ref-links.1.md | 31 ++---------
Documentation/bup-validate-refs.1.md | 55 +++++++++++++++++++
lib/bup/cmd/validate_ref_links.py | 54 +++----------------
lib/bup/cmd/validate_refs.py | 64 +++++++++++++++++++++++
4 files changed, 129 insertions(+), 75 deletions(-)
create mode 100644 Documentation/bup-validate-refs.1.md
create mode 100644 lib/bup/cmd/validate_refs.py

diff --git a/Documentation/bup-validate-ref-links.1.md b/Documentation/bup-validate-ref-links.1.md
index 41be16cc..26c5ed44 100644
--- a/Documentation/bup-validate-ref-links.1.md
+++ b/Documentation/bup-validate-ref-links.1.md
@@ -12,37 +12,12 @@ bup validate-ref-links [*ref*...]

# DESCRIPTION

-`bup validate-ref-links` checks repository references (e.g. saves) for
-commits or subtrees that refer to missing objects and reports the
-paths to any found. If no *ref*s are provided, checks all refs,
-otherwise only checks those specified.
-
-This command can also be used to validate a save more quickly than
-attempting a `restore` or `join`ing the save to /dev/null, and much
-more quickly for multiple related saves, though it only checks for the
-existence of the leaf (blob) data, it does not attempt to read that
-data.
-
-At the moment, the broken path information is only logged to standard
-error, and is not well specified (i.e. suitable for inspection, but
-not parsing).
-
-Also note that the current implementation may not report all paths to
-a given missing object because it only examines each unique tree or
-commit object once, no matter how often it appears within the refs
-being examined. This means that in order to find every broken save,
-you would need to run the command separately for each ref, which is
-likely to be much more expensive than a combined run because it can't
-skip subtrees that it has encountered before.
-
-# EXIT STATUS
-
-The exit status will be 1 if any broken links are found, 0 if none are
-found, and some other positive integer for other failures.
+This command is equivalent to `bup validate-refs --links`. Please
+prefer that.

# SEE ALSO

-`bup-fsck`(1), `bup-join`(1), `bup-restore`(1)
+`bup-validate-refs`(1)

# BUP

diff --git a/Documentation/bup-validate-refs.1.md b/Documentation/bup-validate-refs.1.md
new file mode 100644
index 00000000..4339e521
--- /dev/null
+++ b/Documentation/bup-validate-refs.1.md
@@ -0,0 +1,55 @@
+% bup-validate-refs(1) Bup %BUP_VERSION%
+% Rob Browning <r...@defaultvalue.org>
+% %BUP_DATE%
+
+# NAME
+
+bup-validate-refs - check integrity of repository refs
+
+# SYNOPSIS
+
+bup validate-refs [--links] [*ref*...]
+
+# DESCRIPTION
+
+`bup validate-refs` can check repository references (e.g. saves) for
+commits or trees (directories) that refer to missing objects,
+reporting the path to any it finds. If no *ref*s are provided, it
+checks all refs, otherwise it only checks those specified. If
+`--links` is not specified, then it is implied.
+
+At the moment, the broken path information is only logged to standard
+error, and is not well specified (i.e. suitable for inspection, but
+not parsing).
+
+Also note that the current implementation may not report all paths to
+a given missing object because it only examines each unique tree or
+commit object once, no matter how often it appears within the refs
+being examined. This means that in order to find every save with
+missing objects you would need to run the command separately for each
+ref, which will almost certainly be much more expensive than a
+combined run because it can't skip subtrees that it has encountered
+before.
+
+# OPTIONS
+
+\--links
+: check for commits or trees that refer to missing objects. This
+ command can also be used to validate a save more quickly than
+ attempting a `restore` or `join`ing the save to /dev/null, and
+ much more quickly for multiple related saves, though it only
+ checks for the existence of the leaf (blob) data; it does not
+ attempt to read that data.
+
+# EXIT STATUS
+
+The exit status will be 1 if any broken links are found, 0 if none are
+found, and some other positive integer for other failures.
+
+# SEE ALSO
+
+`bup-fsck`(1), `bup-join`(1), `bup-restore`(1)
+
+# BUP
+
+Part of the `bup`(1) suite.
diff --git a/lib/bup/cmd/validate_ref_links.py b/lib/bup/cmd/validate_ref_links.py
index a8615b8f..d9c305a1 100644
--- a/lib/bup/cmd/validate_ref_links.py
+++ b/lib/bup/cmd/validate_ref_links.py
@@ -1,10 +1,7 @@

-from bup import git, options, vfs
+from bup import options
+from bup.cmd import validate_refs
from bup.compat import argv_bytes
-from bup.gc import count_objects, find_live_objects
-from bup.helpers import EXIT_FALSE, EXIT_TRUE, log
-from bup.io import path_msg
-from bup.repo import LocalRepo


optspec = """
@@ -16,45 +13,8 @@ v,verbose increase log output (can be used more than once)
def main(argv):
o = options.Options(optspec)
opt, flags, extra = o.parse_bytes(argv[1:])
- verbosity = opt.verbose
-
- git.check_repo_or_die()
- cat_pipe = git.cp()
-
- ref_missing = 0
- ref_info = []
- with LocalRepo() as repo:
- for ref in [argv_bytes(x) for x in extra]:
- # FIXME: unify with other commands and git: vfs:, etc.
- res = vfs.try_resolve(repo, ref, want_meta=False)
- # FIXME: if symlink, error(dangling)
- # FIXME: IOError ENOTDIR ELOOP
- _, leaf = res[-1]
- if not leaf:
- log(f'missing {path_msg(ref)}')
- ref_missing += 1
- continue
- kind = type(leaf)
- # FIXME: Root Tags FakeLink
- if kind in (vfs.Item, vfs.Chunky, vfs.RevList):
- ref_info.append((ref, leaf.oid))
- elif kind == vfs.Commit:
- ref_info.append((ref, leaf.coid))
- else:
- o.fatal(f"can't currently handle VFS {kind} for {path_msg(ref)}")
-
- found_missing = 0
- # Wanted all refs, or at least some specified weren't missing
- if not extra or (extra and ref_info):
- existing_count = count_objects(git.repo(b'objects/pack'), verbosity)
- if verbosity:
- log(f'found {existing_count} objects\n')
-
- if existing_count:
- with git.PackIdxList(git.repo(b'objects/pack')) as idxl:
- live_objects, live_trees, found_missing = \
- find_live_objects(existing_count, cat_pipe, idxl, refs=ref_info,
- verbosity=verbosity, count_missing=True)
- live_objects.close()
-
- return EXIT_FALSE if (ref_missing + found_missing) else EXIT_TRUE
+ args = [argv[0]]
+ args.extend([b'-v'] * (opt.verbose or 0))
+ args.append(b'--links')
+ args.extend(map(argv_bytes, extra))
+ return validate_refs.main(args)
diff --git a/lib/bup/cmd/validate_refs.py b/lib/bup/cmd/validate_refs.py
new file mode 100644
index 00000000..a754b0f5
--- /dev/null
+++ b/lib/bup/cmd/validate_refs.py
@@ -0,0 +1,64 @@
+
+from bup import git, options, vfs
+from bup.compat import argv_bytes
+from bup.gc import count_objects, find_live_objects
+from bup.helpers import EXIT_FALSE, EXIT_SUCCESS, EXIT_TRUE, log
+from bup.io import path_msg
+from bup.repo import LocalRepo
+
+
+optspec = """
+bup validate-refs [--links] [REF...]
+--
+links report missing objects referred to by REFs
+v,verbose increase log output (can be used more than once)
+"""
+
+def main(argv):
+ o = options.Options(optspec)
+ opt, flags, extra = o.parse_bytes(argv[1:])
+ verbosity = opt.verbose
+
+ if opt.links is False:
+ return EXIT_SUCCESS
+
+ git.check_repo_or_die()
+ cat_pipe = git.cp()
+
+ ref_missing = 0
+ ref_info = []
+ with LocalRepo() as repo:
+ for ref in [argv_bytes(x) for x in extra]:
+ # FIXME: unify with other commands and git: vfs:, etc.
+ res = vfs.try_resolve(repo, ref, want_meta=False)
+ # FIXME: if symlink, error(dangling)
+ # FIXME: IOError ENOTDIR ELOOP
+ _, leaf = res[-1]
+ if not leaf:
+ log(f'missing {path_msg(ref)}')
+ ref_missing += 1
+ continue
+ kind = type(leaf)
+ # FIXME: Root Tags FakeLink
+ if kind in (vfs.Item, vfs.Chunky, vfs.RevList):
+ ref_info.append((ref, leaf.oid))
+ elif kind == vfs.Commit:
+ ref_info.append((ref, leaf.coid))
+ else:
+ o.fatal(f"can't currently handle VFS {kind} for {path_msg(ref)}")
+
+ found_missing = 0
+ # Wanted all refs, or at least some specified weren't missing
+ if not extra or (extra and ref_info):
+ existing_count = count_objects(git.repo(b'objects/pack'), verbosity)
+ if verbosity:
+ log(f'found {existing_count} objects\n')
+
+ if existing_count:
+ with git.PackIdxList(git.repo(b'objects/pack')) as idxl:
+ live_objects, live_trees, found_missing = \
+ find_live_objects(existing_count, cat_pipe, idxl, refs=ref_info,
+ verbosity=verbosity, count_missing=True)
+ live_objects.close()
+
+ return EXIT_FALSE if (ref_missing + found_missing) else EXIT_TRUE
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Drop --missing MODE, which has never been released, in favor of a
simpler --repair option, and the existing --ignore-missing.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 63 ++++++++++++++++------------
Documentation/bup-validate-refs.1.md | 6 ++-
lib/bup/cmd/get.py | 53 +++++++++++------------
note/main.md | 13 ++----
test/ext/test-get-missing | 13 +++---
test/ext/test-get-rewrite-missing | 12 +++---
6 files changed, 84 insertions(+), 76 deletions(-)

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index ef260845..9d52f75a 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -137,8 +137,7 @@ used to help test before/after results.)
values. Currently, one of these options must be specified whenever
the source and destination repository configurations differ in a
relevant way, and so far, this option is only supported for
- appends and picks. Note that repairs (see REPLACEMENTS below)
- require `--rewrite`, and rewriting a git-created save may
+ appends and picks. Note that rewriting a git-created save may
(currently will) introduce bup-related changes. Further, while
tested, `--rewrite` is relatively new and so warrants even more
caution (see CAUTION above) than `bup get` itself. Please consider
@@ -188,28 +187,40 @@ used to help test before/after results.)

\--repair-id ID
: set the repair session identifier, defaults to a UUID (v4). This
- identifier will be included in repairs made during the transfer,
- i.e. via `--missing replace`. Currently, the identifier must be
- ASCII and must not include control characters or DEL (i.e. must be
- comprised of bytes >= 20 and < 127).
-
-\--missing <fail|ignore|replace>
-: when missing objects are encountered during a transfer, either
- `fail` (exit with nonzero status, the default), `ignore` them
- (currently only supported by `--unnamed`, and potentially
- *dangerous*), or `replace` them with placeholders (see
- REPLACEMENTS below).
-
-# REPLACEMENTS
-
-Saves (commits) with missing objects can be repaired by specifying
-`--missing replace` which will substitute synthesized "repair files"
-for any paths with missing objects. There is currently no support for
-retrieving unaffected parts of split files or trees, the entire file
-or tree is replaced with a repair file.
-
-These repair files contain the `--repair-id` and information about
-the replacement.
+ identifier will be included in any `--repair`s made during the
+ transfer. Currently, the identifier must be ASCII and must not
+ include control characters or DEL (i.e. must be comprised of bytes
+ >= 32 and < 127).
+
+\--repair
+: perform all known repairs during the transfer. See REPAIRS below.
+
+# REPAIRS
+
+`bup get` can fix (or mitigate) a number of known issues during the
+transfer when `--repair` is requested.
+
+ * Versions of `bup` at or after 0.25 and before 0.30.1 might rarely
+ drop metadata entries for non-directories (which can be detected by
+ `bup-validate-refs`(1) `--bupm`). This makes the metadata for all
+ of the other non-directory paths in the same directory unusable
+ (ambiguous). When such an abridged `.bupm` is detected, `--repair`
+ drops all of the `.bupm` entries except the one for the directory
+ itself, ".", and so the affected paths lose most or all of their
+ metadata (ownership, permissions, timestamps, etc.).
+
+ * Use of `bup get` or `bup gc` versions before 0.33.5 could cause
+ repositories to end up with missing objects (which can be detected
+ by `bup-validate-object-links`(1)). To fix affected trees,
+ `--repair` substitutes synthesized "repair files" for any paths
+ with missing objects. Note that there is currently no support for
+ retrieving the unaffected parts of split files; the entire file is
+ replaced with a repair file. These repair files contain the
+ `--repair-id` and information about the replacement. Support for
+ split trees was added after the problem was fixed, so they should be
+ unaffected. See the
+ [0.33.5 release notes (0.33.5-from-0.33.4.md)](https://github.com/bup/bup/blob/main/note/0.33.5-from-0.33.4.md)
+ for additional information.

# EXAMPLES

@@ -268,8 +279,8 @@ the replacement.
$ bup gc
$ git --git-dir "$BUP_DIR" branch -m archives-resplit archives
#
- # Repair a single save with missing objects.
- $ bup get --missing replace --pick archives/latest fixed
+ # Repair a single save.
+ $ bup get --repair --pick archives/latest fixed
#
# Check that fixed/latest looks OK, perhaps via trial
# restores, joining it, etc. (see CAUTION above).
diff --git a/Documentation/bup-validate-refs.1.md b/Documentation/bup-validate-refs.1.md
index fa779f1e..a9a3f83c 100644
--- a/Documentation/bup-validate-refs.1.md
+++ b/Documentation/bup-validate-refs.1.md
@@ -18,7 +18,8 @@ abridged bupm files (metadata storage), reporting the paths to those
it finds. If no *ref*s are provided, it checks all refs, otherwise it
only checks those specified. If no checks are explicitly requested,
then a default set of checks will be performed, currently `--links`
-and `--bupm`.
+and `--bupm`. If problems are found, `bup-get`(1) `--repair` may be
+able to help.

At the moment, the broken path information is only logged to standard
error, and is not well specified (i.e. suitable for inspection, but
@@ -38,7 +39,8 @@ has encountered before.
\--bupm
: check bupm (metadata storage) files. Currently checks for missing
path entries, which could have been caused by `bup` versions since
- 0.25 and before 0.30.1.
+ 0.25 and before 0.30.1. See REPAIRS in `bup-get`(1) for additional
+ information.

\--links
: check for commits or trees that refer to missing objects. This
diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index c559277f..cd7dd9fc 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -44,7 +44,7 @@ argspec = (
a remote repository with the related "bup on HOST get ..."
command. The --exclude-rx and --exclude-rx-from options currently
only apply to rewrites. Currently only --unnammed supports
- "--missing ignore".""",
+ "--ignore-missing".""",

('optional arguments:',
(('-h, --help', 'show this help message and exit'),
@@ -58,13 +58,14 @@ argspec = (
('-t --print-trees', 'output a tree id for each ref set'),
('-c, --print-commits', 'output a commit id for each ref set'),
('--print-tags', 'output an id for each tag'),
- ('--rewrite', 'rewrite data according to destination repo settings'),
+ ('--[no-]rewrite', 'rewrite data according to destination repo settings'),
('--exclude-rx REGEX', 'skip paths matching the unanchored regex (may be repeated)'),
('--exclude-rx-from PATH', 'skip --exclude-rx patterns in PATH (may be repeated)'),
('--no-excludes', 'forget any preceeding exclude options'),
('--bwlimit BWLIMIT', 'maximum bytes/sec to transmit to server'),
- ('--missing <fail|ignore|replace>', 'behavior for missing objects (default: fail)'),
+ ('--[no-]repair', 'repair everything possible'),
('--repair-id ID', 'repair session identifier (default: UUID v4)'),
+ ('--[no-]ignore-missing', 'ignore missing objects (*dangerous*)'),
('-0, -1, -2, -3, -4, -5, -6, -7, -8, -9, --compress LEVEL',
'set compression LEVEL (default: 1)'))),

@@ -166,25 +167,31 @@ def parse_args(args):
# we've finished checking the requests (e.g. are past the
# resolvers), the spec's rewriter will be set to True to indicate
# that it needs the real Rewriter once we have it.
- rewrite = None # None means "didn't specify", False means "said no"
exclude_opts = []
ignore_missing = False
- repair = False
+ # rewrite and repair track each arg's "state" and repair implies rewrite
+ rewrite = None # None means "didn't specify", False means "said no"
+ repair = None # None means "didn't specify", False means "said no"
repair_id = None
def make_spec(method, src, dest):
nonlocal repair_id
assert not (ignore_missing and repair), (ignore_missing, repair)
+ assert not (ignore_missing and rewrite), (ignore_missing, rewrite)
excludes = parse_rx_excludes(exclude_opts, misuse)
- if excludes and not rewrite:
- misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
+ if excludes and not (rewrite or repair):
+ misuse('--exclude-rx or --exclude-rx-from requires --rewrite or --repair')
rc = None
- if rewrite:
+ if (rewrite or repair):
if repair_id is None:
repair_id = str(uuid4()).encode('ascii')
rc = RepairConfig(id=repair_id, destructive=repair,
info=opt.repair_info)
+ if rewrite: rw = True
+ elif repair: rw = True
+ elif rewrite in (False, True): rw = rewrite
+ else: rw = repair
return Spec(method=method, src=src, dest=dest, excludes=excludes,
- rewriter=rewrite, ignore_missing=ignore_missing, repair=rc)
+ rewriter=rw, ignore_missing=ignore_missing, repair=rc)

pending_method_context = {} # dict to preserve insertion order
remaining = args[1:] # Skip argv[0]
@@ -196,22 +203,14 @@ def parse_args(args):
elif arg in (b'-v', b'--verbose'):
opt.verbose += 1
remaining = remaining[1:]
- elif arg == b'--missing':
+ elif arg == b'--repair':
+ if ignore_missing:
+ misuse('--ignore-missing and --repair are incompatible')
pending_method_context[arg] = True
- (val,), remaining = require_n_args_or_die(1, remaining)
- if val == b'fail':
- ignore_missing = False
- repair = False
- elif val == b'ignore':
- if repair:
- misuse('--ignore-missing and --repair are incompatible')
- ignore_missing = True
- elif val == b'replace':
- if ignore_missing:
- misuse('--ignore-missing and --repair are incompatible')
- repair = True
- else:
- misuse(f'--missing must be fail, ignore, or replace, not {val!r}')
+ repair, remaining = True, remaining[1:]
+ elif arg == b'--no-repair':
+ pending_method_context[arg] = True
+ repair, remaining = False, remaining[1:]
elif arg == b'--ignore-missing':
if repair:
misuse('--ignore-missing and --repair are incompatible')
@@ -231,7 +230,7 @@ def parse_args(args):
elif arg in (b'--ff', b'--append', b'--pick', b'--force-pick',
b'--new-tag', b'--replace', b'--unnamed'):
if ignore_missing and arg != b'--unnamed':
- misuse('currently only --unnamed allows --missing ignore')
+ misuse('currently only --unnamed allows --ignore-missing')
(ref,), remaining = require_n_args_or_die(1, remaining)
opt.target_specs.append(make_spec(method=arg[2:].decode('ascii'),
src=ref, dest=None))
@@ -239,7 +238,7 @@ def parse_args(args):
elif arg in (b'--ff:', b'--append:', b'--pick:', b'--force-pick:',
b'--new-tag:', b'--replace:'):
if ignore_missing and arg != b'--unnamed':
- misuse('currently only --unnamed allows --missing ignore')
+ misuse('currently only --unnamed allows --ignore-missing')
(ref, dest), remaining = require_n_args_or_die(2, remaining)
opt.target_specs.append(make_spec(method=arg[2:-1].decode('ascii'),
src=ref, dest=dest))
@@ -255,6 +254,8 @@ def parse_args(args):
elif arg == b'--print-tags':
opt.print_tags, remaining = True, remaining[1:]
elif arg == b'--rewrite':
+ if ignore_missing:
+ misuse('--ignore-missing and --rewrite are incompatible')
pending_method_context[arg] = True
rewrite, remaining = True, remaining[1:]
elif arg == b'--no-rewrite':
diff --git a/note/main.md b/note/main.md
index 5218bbd4..7e78dc3f 100644
--- a/note/main.md
+++ b/note/main.md
@@ -106,15 +106,10 @@ General
e.g. its `bup.split.files` and `bup.split.trees` settings. See
`bup-get`(1) for additional information.

-* `bup get --missing <fail|ignore|replace> ...` can now specify how to
- handle missing objects that are encountered during a
- transfer. `fail`, the default, causes bup to exit with a nonzero
- status. `ignore` causes bup to skip over them (only supported by
- `--unnamed` and potentially *dangerous*). `replace` only works with
- `--rewrite` and replaces paths with missing contents with
- synthesized "repair files". See bup-get(1) for additional
- information. `--missing ignore` is the preferred replacement for the
- existing `--ignore-missing`.
+* The new `bup get --repair` acts like a `--rewrite` while also
+ attempting to detect and fix known issues during the transfer, for
+ example, replacing paths with missing contents with synthesized
+ "repair files". See `bup-get`(1) for additional information.

* The default pack compression level can now be configured via either
`pack.compression` or `core.compression`. See `bup-config`(5) for
diff --git a/test/ext/test-get-missing b/test/ext/test-get-missing
index 6de4455f..d098d8bf 100755
--- a/test/ext/test-get-missing
+++ b/test/ext/test-get-missing
@@ -39,13 +39,12 @@ WVFAIL bup -d dest-repo get -s bup --unnamed "git:$src_oid" 2>&1 | tee get.log
WVPASS grep -E 'raise MissingObject' get.log
WVPASS rm -rf dest-repo

-for args in '--missing ignore' --ignore-missing; do
- WVPASS bup -d dest-repo init
- WVFAIL bup -d dest-repo get $args -s bup --unnamed "git:$src_oid" 2>&1 \
- | tee get.log
- WVPASSEQ 1 "$(grep -cF "skipping missing source object ${bupm_oid}" get.log)"
- WVPASS rm -rf dest-repo
-done
+WVPASS bup -d dest-repo init
+WVFAIL bup -d dest-repo get -s bup --ignore-missing --unnamed "git:$src_oid" 2>&1 \
+ | tee get.log
+WVPASSEQ 1 "$(grep -cF "skipping missing source object ${bupm_oid}" get.log)"
+WVPASS rm -rf dest-repo
+


WVPASS cd "$top"
diff --git a/test/ext/test-get-rewrite-missing b/test/ext/test-get-rewrite-missing
index 9755e466..b20b72c0 100755
--- a/test/ext/test-get-rewrite-missing
+++ b/test/ext/test-get-rewrite-missing
@@ -123,10 +123,10 @@ WVPASS rm -rf dest-repo bup-tmp
WVPASS cp -pPR bup bup-tmp
WVPASS bup -d dest-repo init
WVPASS git --git-dir dest-repo config bup.split.trees true
-# test rejection of --missing ignore (only supported by --unnamed)
-WVFAIL bup -d dest-repo get -s bup --missing ignore --rewrite --append src
-# test --missing fail
-bup -d dest-repo get -s bup --missing fail --rewrite --append src
+WVSTART 'rejection of --ignore-missing (only supported by --unnamed)'
+WVFAIL bup -d dest-repo get -s bup --ignore-missing --rewrite --append src
+WVSTART '--no-ignore-missing'
+bup -d dest-repo get -s bup --no-ignore-missing --rewrite --append src
rc=$?
WVPASSEQ 2 "$rc"
WVPASS rm -rf bup
@@ -137,7 +137,7 @@ repair-to-dest()
rm -rf dest-repo
WVPASS bup -d dest-repo init
WVPASS git --git-dir dest-repo config bup.split.trees true
- bup -d dest-repo get -s bup --missing replace --rewrite --append src 2> repair.log
+ bup -d dest-repo get -s bup --repair --append src 2> repair.log
rc=$?
display-file repair.log
WVPASSEQ 3 "$rc"
@@ -180,7 +180,7 @@ readarray -t trailers < repair-trailers
wv-match-rx "${trailers[0]}" "^Bup-Version: ${bup_ver//+/\\+}$"
wv-match-rx "${trailers[1]}" '^Bup-Argv: [^ ]+/bup.* save '
wv-match-rx "${trailers[2]}" "^Bup-Version: ${bup_ver//+/\\+}$"
-wv-match-rx "${trailers[3]}" '^Bup-Argv: [^ ]+/bup.* get .* --rewrite '
+wv-match-rx "${trailers[3]}" '^Bup-Argv: [^ ]+/bup.* get .* --repair '
wv-match-rx "${trailers[4]}" "^Bup-Repair-ID: $repair_id$"
wv-match-rx "${trailers[5]}" "^Bup-Replaced: $(< blob-replacement-oid) $missing_file$"
wv-match-rx "${trailers[6]}" "^Bup-Replaced: $(< dir-replacement-oid) $missing_dir$"
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Make it so that each requested transfer respects the relevant
arguments that precede it on the command line, so that, for example,
this is no longer an error:

bup get --ignore-missing --unnamed REF --no-ignore-missing --pick REF

This also applies for --exclude*, --rewrite, --missing, etc., and
whenever the excludes change, clear the rewrite db because existing
trees may have become invalid.

We'll update the docs, etc. after some additional upcoming changes to
the arguments.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 52 ++++++++++++++++++++------------------
lib/bup/repair.py | 14 +++++-----
lib/bup/rewrite.py | 40 ++++++++++++++++++++---------
test/ext/test-get-excludes | 9 +++++++
test/ext/test_get.py | 9 +++----
5 files changed, 76 insertions(+), 48 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 62d9e7c3..6890f6ab 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -153,7 +153,7 @@ def parse_args(args):
opt.print_commits = opt.print_trees = opt.print_tags = False
opt.bwlimit = None
opt.compress = None
- opt.repair_info = None
+ opt.repair_info = RepairInfo(command=get_argvb())
opt.source = opt.remote = None
opt.target_specs = []

@@ -162,9 +162,22 @@ def parse_args(args):
# we've finished checking the requests (e.g. are past the
# resolvers), the spec's rewriter will be set to True to indicate
# that it needs the real Rewriter once we have it.
- rewrite = None # None means "didn't specify"
+ rewrite = None # None means "didn't specify", False means "said no"
missing = 'fail'
exclude_opts = []
+ repair_id = None
+ def make_spec(method, src, dest):
+ nonlocal repair_id
+ excludes = parse_rx_excludes(exclude_opts, misuse)
+ if excludes and not rewrite:
+ misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
+ if repair_id is None:
+ repair_id = str(uuid4()).encode('ascii')
+ return Spec(method=method, src=src, dest=dest, excludes=excludes,
+ rewriter=rewrite,
+ missing=MissingConfig(id=repair_id, mode=missing,
+ repair_info=opt.repair_info))
+
remaining = args[1:] # Skip argv[0]
while remaining:
arg = remaining[0]
@@ -191,17 +204,17 @@ def parse_args(args):
misuse('empty --repair-id')
if not valid_repair_id(val):
misuse('--repair-id must be ASCII without control characters or DEL')
- opt.repair_info = RepairInfo(val, command=get_argvb())
+ repair_id = val
elif arg in (b'--ff', b'--append', b'--pick', b'--force-pick',
b'--new-tag', b'--replace', b'--unnamed'):
(ref,), remaining = require_n_args_or_die(1, remaining)
- opt.target_specs.append(Spec(method=arg[2:].decode('ascii'),
- src=ref, dest=None))
+ opt.target_specs.append(make_spec(method=arg[2:].decode('ascii'),
+ src=ref, dest=None))
elif arg in (b'--ff:', b'--append:', b'--pick:', b'--force-pick:',
b'--new-tag:', b'--replace:'):
(ref, dest), remaining = require_n_args_or_die(2, remaining)
- opt.target_specs.append(Spec(method=arg[2:-1].decode('ascii'),
- src=ref, dest=dest))
+ opt.target_specs.append(make_spec(method=arg[2:-1].decode('ascii'),
+ src=ref, dest=dest))
elif arg in (b'-s', b'--source'):
(opt.source,), remaining = require_n_args_or_die(1, remaining)
elif arg in (b'-r', b'--remote'):
@@ -240,16 +253,6 @@ def parse_args(args):
continue
else:
misuse(f'unrecognized argument: {path_msg(arg)}')
- if opt.repair_info is None:
- opt.repair_info = RepairInfo(str(uuid4()).encode('ascii'),
- command=get_argvb())
- excludes = parse_rx_excludes(exclude_opts, misuse)
- if excludes and not rewrite:
- misuse('cannot --exclude-rx or --exclude-rx-from when not rewriting')
- missing = MissingConfig(mode=missing, repair_info=opt.repair_info)
- opt.target_specs = [dcreplace(x, missing=missing, excludes=excludes,
- rewriter=rewrite)
- for x in opt.target_specs]
return opt

# FIXME: client error handling (remote exceptions, etc.)
@@ -806,13 +809,14 @@ def get_everything(opt):
src_split_cfg = hashsplit.configuration(src_repo.config_get)
dest_split_cfg = hashsplit.configuration(dest_repo.config_get)

- # For now (maybe forever), they're all the same
- rewrite = opt.target_specs[0].rewriter
- assert all(x.rewriter == rewrite for x in opt.target_specs), \
- opt.target_specs
-
- if src_split_cfg != dest_split_cfg and rewrite is None:
- misuse('repository configs differ; specify --rewrite or --no-rewrite')
+ # For now, --rewrite is never implicit
+ rewrite = False
+ for spec in opt.target_specs:
+ if spec.rewriter is not None:
+ rewrite = True
+ elif src_split_cfg != dest_split_cfg:
+ misuse('repository configs differ; need --[no-]rewrite before'
+ f' {spec_msg(spec)}')

# Resolve and validate all sources and destinations, implicit
# or explicit, combinations of methods and modes (rewrite,
diff --git a/lib/bup/repair.py b/lib/bup/repair.py
index bf050c66..5ee7294e 100644
--- a/lib/bup/repair.py
+++ b/lib/bup/repair.py
@@ -15,10 +15,9 @@ def valid_repair_id(s):


class RepairInfo:
- __slots__ = 'id', 'command', '_others', '_replacements'
- def __init__(self, id, *, command=None):
- assert valid_repair_id(id)
- self.id = id
+ # Used, for example, to track all repairs in a bup get process
+ __slots__ = 'command', '_others', '_replacements'
+ def __init__(self, *, command=None):
self.command = command
self._others = 0
self._replacements = []
@@ -26,10 +25,11 @@ class RepairInfo:
def path_replaced(self, path, oid, new_oid):
self._replacements.append((path, oid, new_oid))
def repair_count(self): return len(self._replacements) + self._others
- def repair_trailers(self):
+ def repair_trailers(self, repair_id):
+ assert valid_repair_id(repair_id)
if not self.repair_count():
return []
- trailers = [b'Bup-Repair-ID: ' + self.id]
+ trailers = [b'Bup-Repair-ID: ' + repair_id]
for path, oid, new_oid in self._replacements:
trailers.append(b'Bup-Replaced: %s %s'
% (hexlify(new_oid), enc_sh(path)))
@@ -38,9 +38,11 @@ class RepairInfo:

@dataclass(slots=True, frozen=True)
class MissingConfig:
+ id: bytes
mode: Union['fail', 'ignore', 'replace']
repair_info: Optional[RepairInfo] = None
def __post_init__(self):
+ assert valid_repair_id(self.id)
assert self.mode in ('fail', 'ignore', 'replace')
if self.mode == 'replace':
assert isinstance(self.repair_info, RepairInfo), self.repair_info
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 869869d4..94fc0424 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -3,6 +3,7 @@ from binascii import hexlify
from contextlib import ExitStack, closing, nullcontext
from itertools import chain
from os.path import join as joinp
+from re import Pattern
from stat import S_ISDIR, S_ISLNK, S_IRWXG, S_IRWXO, S_ISREG
from typing import Any, Sequence
import sqlite3, time
@@ -96,9 +97,10 @@ def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
return item, dst, None
return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

-def _path_repaired(path, oid, replacement_oid, missing_oid, repair_info):
+def _path_repaired(path, oid, replacement_oid, missing_oid, repair_id,
+ repair_info):
if repair_info.repair_count() == 0:
- log(b'repairs needed, repair-id: %s\n' % repair_info.id)
+ log(b'repairs needed, repair-id: %s\n' % repair_id)
fs_path = _fs_path_from_vfs(path)
repair_info.path_replaced(fs_path, oid, replacement_oid)
ep = path_msg(fs_path)
@@ -253,9 +255,10 @@ def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, missing):
if missing.mode == 'fail':
raise ex
repair_info = missing.repair_info
- replacement = _replacement_symlink_item(dstrepo, item,
- repair_info.id, ex.oid)
- _path_repaired(path, item.oid, replacement.oid, ex.oid, repair_info)
+ replacement = _replacement_symlink_item(dstrepo, item, missing.id,
+ ex.oid)
+ _path_repaired(path, item.oid, replacement.oid, ex.oid, missing.id,
+ repair_info)
assert replacement.meta.mode == default_file_mode
stack.append_to_current(name, default_file_mode, default_file_mode,
replacement.oid, replacement.meta)
@@ -336,14 +339,14 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
# For now, wholesale replacement (no attempt to handle
# partially readable split trees).
rep_item = incomplete.path[-1][1]
- replacement = _replacement_tree_item(dstrepo, rep_item, repair_info.id,
+ replacement = _replacement_tree_item(dstrepo, rep_item, missing.id,
incomplete.missing)
# Must not remember repairs because the repair-id (and so blob
# content) can vary across saves, i.e. get --rewrite-id is a
# contextual argument, and because the type changes from tree
# to blob.
_path_repaired(path, rep_item.oid, replacement.oid, incomplete.missing,
- repair_info)
+ missing.id, repair_info)
assert replacement.meta.mode == default_file_mode, repr(replacement)
stack.append_to_current(path[-1][0],
replacement.meta.mode, GIT_MODE_FILE,
@@ -425,13 +428,14 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
if missing.mode == 'fail':
raise ex
repair_info = missing.repair_info
- replacement = _replacement_file_item(dstrepo, item, repair_info.id,
+ replacement = _replacement_file_item(dstrepo, item, missing.id,
ex.oid)
- _path_repaired(path, item.oid, replacement.oid, ex.oid, repair_info)
# Must not remember repairs because the repair-id (and so blob
# content) can vary across saves, i.e. get --rewrite-id is a
# contextual argument, and because the type may change from
# tree to blob.
+ _path_repaired(path, item.oid, replacement.oid, ex.oid, missing.id,
+ repair_info)
assert replacement.meta.mode == default_file_mode, repr(replacement)
stack.append_to_current(name, replacement.meta.mode, GIT_MODE_FILE,
replacement.oid, replacement.meta)
@@ -455,6 +459,9 @@ class Rewriter:
assert isinstance(db, (bytes, type(None)))
self._context = nullcontext()
with ExitStack() as ctx:
+ # Allows us to detect changes in excludes which invalidate
+ # related tree rewrites.
+ self._current_excludes = []
self._split_cfg = split_cfg
self._db_path = db
if db:
@@ -486,6 +493,7 @@ class Rewriter:
assert missing.mode in ('fail', 'replace'), missing
if parent:
assert len(parent) == 20, parent
+ assert all(isinstance(x, Pattern) for x in excludes)
assert len(save_path) == 3, (len(save_path), save_path)
assert isinstance(save_path[1][1], vfs.RevList)
leaf_name, leaf_item = save_path[2]
@@ -504,6 +512,14 @@ class Rewriter:
# location in the archive being constructed.
stack = Stack(dstrepo, self._split_cfg)

+ if self._current_excludes != excludes:
+ # Whenever the excludes change, remembered tree
+ # rewrites may become incorrect. We could just
+ # drop the trees if we had an indicator, but for
+ # now just drop everything.
+ dbc.execute(f'delete from {self._mapping}')
+ self._current_excludes = excludes
+
# Relies on the fact that recursion is dfs post-order,
# and so if a dir is broken, we'll see that "up
# front", and never produce any children.
@@ -525,9 +541,9 @@ class Rewriter:
ci = parse_commit(get_cat_data(srcrepo.cat(save_oidx), b'commit'))
author = ci.author_name + b' <' + ci.author_mail + b'>'
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
- msg = commit_message(ci.message,
- missing.repair_info.command,
- missing.repair_info.repair_trailers())
+ trailers = missing.repair_info.repair_trailers(missing.id)
+ msg = commit_message(ci.message, missing.repair_info.command,
+ trailers)
return (dstrepo.write_commit(tree, parent,
author,
ci.author_sec, ci.author_offset,
diff --git a/test/ext/test-get-excludes b/test/ext/test-get-excludes
index f592234e..44a867cf 100755
--- a/test/ext/test-get-excludes
+++ b/test/ext/test-get-excludes
@@ -30,5 +30,14 @@ WVSTART '--rewrite --exclude-rx --no-excludes'
WVPASS bup get --rewrite --exclude-rx 't.*' --no-excludes --pick: src/latest dst
WVPASSEQ $'one\nthree\ntwo' "$(bup ls dst/latest/a)"

+WVSTART '--rewrite --exclude-rx contextuality'
+WVPASS bup get --rewrite \
+ --exclude-rx 't.*' --pick: src/latest dst-1 \
+ --no-excludes --pick: src/latest dst-2 \
+ --exclude-rx 'one' --pick: src/latest dst-3
+WVPASSEQ $'one' "$(bup ls dst-1/latest/a)"
+WVPASSEQ $'one\nthree\ntwo' "$(bup ls dst-2/latest/a)"
+WVPASSEQ $'three\ntwo' "$(bup ls dst-3/latest/a)"
+
WVPASS cd "$top"
WVPASS rm -rf "$tmpdir"
diff --git a/test/ext/test_get.py b/test/ext/test_get.py
index 4bd60edc..e5a60fd8 100644
--- a/test/ext/test_get.py
+++ b/test/ext/test_get.py
@@ -265,19 +265,16 @@ def _run_get(disposition, method, what, rewrite=None):
else:
raise Exception('error: unexpected get disposition ' + repr(disposition))

+ cmd = (*get_cmd, b'--rewrite' if rewrite else b'--no-rewrite')
if isinstance(what, bytes):
- cmd = get_cmd + (method, what)
+ cmd += (method, what)
else:
assert not isinstance(what, str) # python 3 sanity check
if method in (b'--ff', b'--append', b'--pick', b'--force-pick', b'--new-tag',
b'--replace'):
method += b':'
src, dest = what
- cmd = get_cmd + (method, src, dest)
- if rewrite:
- cmd += (b'--rewrite',)
- elif rewrite == False:
- cmd += (b'--no-rewrite',)
+ cmd += (method, src, dest)
result = exo(cmd, check=False, stderr=PIPE)
fsck = ex((bup_cmd, b'-d', b'get-dest', b'fsck'), check=False)
wvpasseq(0, fsck.rc)
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Add an optional --bupm argument that when given, or when no
validations are selected, causes validate-refs to check that none of
the ref's bupm files are missing metadata entries. Earlier versions of
bup save could omit entries if it failed to read a path's metadata
after it had already added that path to the pending tree.

The problem was introduced in 0.25 by

16f9f9829038f25aec80ebfae3c882a66281e145
save-cmd.py: don't crash when a path disappears between index and save.

and fixed for 0.30.1 by

47891d8951a95b8e0d9ca94387107cdf12ca3d3c
save: add empty metadata if reading fails

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-validate-refs.1.md | 25 +++--
HACKING.md | 4 +
lib/bup/cmd/validate_refs.py | 154 ++++++++++++++++++++-------
lib/bup/gc.py | 19 ++--
lib/bup/io.py | 31 ++++++
test/ext/test-validate-refs | 75 +++++++++++++
wvtest.sh | 31 ++++++
7 files changed, 279 insertions(+), 60 deletions(-)
create mode 100755 test/ext/test-validate-refs

diff --git a/Documentation/bup-validate-refs.1.md b/Documentation/bup-validate-refs.1.md
index 4339e521..fa779f1e 100644
--- a/Documentation/bup-validate-refs.1.md
+++ b/Documentation/bup-validate-refs.1.md
@@ -8,15 +8,17 @@ bup-validate-refs - check integrity of repository refs

# SYNOPSIS

-bup validate-refs [--links] [*ref*...]
+bup validate-refs [--links] [--bupm] [*ref*...]

# DESCRIPTION

`bup validate-refs` can check repository references (e.g. saves) for
-commits or trees (directories) that refer to missing objects,
-reporting the path to any it finds. If no *ref*s are provided, it
-checks all refs, otherwise it only checks those specified. If
-`--links` is not specified, then it is implied.
+commits or trees (directories) that refer to missing objects, and for
+abridged bupm files (metadata storage), reporting the paths to those
+it finds. If no *ref*s are provided, it checks all refs, otherwise it
+only checks those specified. If no checks are explicitly requested,
+then a default set of checks will be performed, currently `--links`
+and `--bupm`.

At the moment, the broken path information is only logged to standard
error, and is not well specified (i.e. suitable for inspection, but
@@ -26,13 +28,18 @@ Also note that the current implementation may not report all paths to
a given missing object because it only examines each unique tree or
commit object once, no matter how often it appears within the refs
being examined. This means that in order to find every save with
-missing objects you would need to run the command separately for each
-ref, which will almost certainly to be much more expensive than a
-combined run because it can't skip subtrees that it has encountered
-before.
+missing objects, for example, you would need to run the command
+separately for each ref, which will almost certainly be much more
+expensive than a combined run because it can't skip subtrees that it
+has encountered before.

# OPTIONS

+\--bupm
+: check bupm (metadata storage) files. Currently checks for missing
+ path entries, which could have been caused by `bup` versions since
+ 0.25 and before 0.30.1.
+
\--links
: check for commits or trees that refer to missing objects. This
command can also be used to validate a save more quickly than
diff --git a/HACKING.md b/HACKING.md
index 0b8bb285..26f4740b 100644
--- a/HACKING.md
+++ b/HACKING.md
@@ -101,6 +101,10 @@ test via fixtures in conftest.py, including the state of the
environment variables and the working directory; the latter is reset
to the top of the source tree.

+You may want to exclude test/tmp from backups because the testing
+directories are preserved on failure (until the next `make clean`),
+and test/tmp experiences a lot of churn.
+
Submitting patches
==================

diff --git a/lib/bup/cmd/validate_refs.py b/lib/bup/cmd/validate_refs.py
index 1c80b94d..6fc2a955 100644
--- a/lib/bup/cmd/validate_refs.py
+++ b/lib/bup/cmd/validate_refs.py
@@ -1,65 +1,141 @@

+from binascii import hexlify
+from contextlib import ExitStack
+from stat import S_ISDIR
+
from bup import git, options, vfs
from bup.compat import argv_bytes
from bup.gc import count_objects, find_live_objects
-from bup.helpers import EXIT_FALSE, EXIT_SUCCESS, EXIT_TRUE, log, progress
-from bup.io import path_msg
+from bup.git import BUP_CHUNKED, demangle_name, tree_iter
+from bup.helpers import EXIT_FAILURE, EXIT_FALSE, EXIT_TRUE, log
+from bup.metadata import Metadata
+from bup.io import walk_path_msg, path_msg
from bup.repo import LocalRepo
+from bup.vfs import tree_data_reader


optspec = """
-bup validate-refs [--links] [REF...]
+bup validate-refs [--links] [--bupm] [REF...]
--
+bupm report broken bupm (path metadata) objects within REFs
links report missing objects referred to by REFs
v,verbose increase log output (can be used more than once)
"""

+def expected_bup_entry_count_for_tree(tree_data):
+ exp_n = 1 # for the parent dir
+ for mode, mangled_name, oid in tree_iter(tree_data):
+ if mangled_name.endswith(b'.bupd'):
+ return 2
+ if mangled_name == b'.bupm':
+ continue
+ name, kind = demangle_name(mangled_name, mode)
+ if S_ISDIR(mode) and kind != BUP_CHUNKED:
+ continue
+ exp_n += 1
+ return exp_n
+
+def resolve_refs(repo, refs, fatal):
+ ref_missing = 0
+ ref_info = []
+ for ref in refs:
+ # FIXME: unify with other commands and git: vfs:, etc.
+ res = vfs.try_resolve(repo, ref, want_meta=False)
+ # FIXME: if symlink, error(dangling)
+ # FIXME: IOError ENOTDIR ELOOP
+ _, leaf = res[-1]
+ if not leaf:
+ log(f'missing {path_msg(ref)}')
+ ref_missing += 1
+ continue
+ kind = type(leaf)
+ # FIXME: Root Tags FakeLink
+ if kind in (vfs.Item, vfs.Chunky, vfs.RevList):
+ ref_info.append((ref, leaf.oid))
+ elif kind == vfs.Commit:
+ ref_info.append((ref, leaf.coid))
+ else:
+ fatal(f"can't currently handle VFS {kind} for {path_msg(ref)}")
+ return ref_missing, ref_info
+
def main(argv):
o = options.Options(optspec)
opt, flags, extra = o.parse_bytes(argv[1:])
verbosity = opt.verbose

- if opt.links is False:
- return EXIT_SUCCESS
+ if (opt.links, opt.bupm) == (False, False):
+ o.fatal(f'no validation requested')
+ if (opt.links, opt.bupm) == (None, None):
+ opt.links = opt.bupm = True

git.check_repo_or_die()
cat_pipe = git.cp()

- ref_missing = 0
- ref_info = []
with LocalRepo() as repo:
- for ref in [argv_bytes(x) for x in extra]:
- # FIXME: unify with other commands and git: vfs:, etc.
- res = vfs.try_resolve(repo, ref, want_meta=False)
- # FIXME: if symlink, error(dangling)
- # FIXME: IOError ENOTDIR ELOOP
- _, leaf = res[-1]
- if not leaf:
- log(f'missing {path_msg(ref)}')
- ref_missing += 1
- continue
- kind = type(leaf)
- # FIXME: Root Tags FakeLink
- if kind in (vfs.Item, vfs.Chunky, vfs.RevList):
- ref_info.append((ref, leaf.oid))
- elif kind == vfs.Commit:
- ref_info.append((ref, leaf.coid))
- else:
- o.fatal(f"can't currently handle VFS {kind} for {path_msg(ref)}")
+ ref_missing, ref_info = \
+ resolve_refs(repo, [argv_bytes(x) for x in extra], o.fatal)

- found_missing = 0
- # Wanted all refs, or at least some specified weren't missing
- if not extra or (extra and ref_info):
- existing_count = count_objects(git.repo(b'objects/pack'), verbosity)
- if verbosity:
- progress(f'found {existing_count} objects\r')
+ bad_bupm = 0
+ abridged_bupm = 0

- if existing_count:
- with git.PackIdxList(git.repo(b'objects/pack')) as idxl:
- live_objects, live_trees, found_missing = \
- find_live_objects(existing_count, cat_pipe, refs=ref_info,
- count_missing=True, idx_list=idxl,
- verbosity=verbosity)
- live_objects.close()
+ def validate_if_bupm(ref_name, item_path):
+ nonlocal bad_bupm, abridged_bupm
+ item = item_path[-1]
+ if item.name != b'.bupm':
+ return
+ bupm_n = 0
+ with tree_data_reader(repo, item.oid) as bupm:
+ try:
+ while True:
+ Metadata.read(bupm)
+ bupm_n += 1
+ except EOFError:
+ pass
+ except Exception:
+ pm = walk_path_msg(ref_name, item_path)
+ raise Exception(f'Unable to parse .bupm at {pm}')
+ parent = item_path[-2]
+ info = vfs.get_ref(repo, hexlify(parent.oid))
+ assert info[0], info
+ exp_n = expected_bup_entry_count_for_tree(b''.join(info[3]))
+ if bupm_n == exp_n:
+ return
+ elif bupm_n > exp_n:
+ bad_bupm += 1
+ log(f'error: tree with extra bupm entries ({bupm_n} > {exp_n})'
+ f' (please report): {parent.oid.hex()}\n')
+ else:
+ abridged_bupm += 1
+ imsg = walk_path_msg(ref_name, item_path)
+ log(f'abridged-bupm {imsg}\n')

- return EXIT_FALSE if (ref_missing + found_missing) else EXIT_TRUE
+ found_missing = 0
+ # Wanted all refs, or at least some specified weren't missing
+ if not extra or (extra and ref_info):
+ existing_count = count_objects(git.repo(b'objects/pack'), verbosity)
+ if verbosity:
+ log(f'found {existing_count} objects\n')
+ with ExitStack() as maybe_close_idxl:
+ idxl = None
+ if opt.links:
+ idxl = git.PackIdxList(git.repo(b'objects/pack'))
+ maybe_close_idxl.enter_context(idxl)
+ found = find_live_objects(existing_count, cat_pipe,
+ refs=ref_info,
+ count_missing=opt.links,
+ idx_list=idxl,
+ for_item=opt.bupm and validate_if_bupm,
+ verbosity=verbosity)
+ if opt.links:
+ live_objects, live_trees, found_missing = found
+ else:
+ live_objects, live_trees = found
+ live_objects.close()
+ if bad_bupm:
+ return EXIT_FAILURE
+ elif (ref_missing + found_missing + abridged_bupm):
+ if (ref_missing or found_missing) and not opt.links:
+ log(f'note: missing object list may be incomplete without --links\n')
+ return EXIT_FALSE
+ else:
+ return EXIT_TRUE
diff --git a/lib/bup/gc.py b/lib/bup/gc.py
index d41db0d3..9bb7ff25 100644
--- a/lib/bup/gc.py
+++ b/lib/bup/gc.py
@@ -3,14 +3,13 @@ from binascii import hexlify, unhexlify
from contextlib import ExitStack
#from itertools import chain
from os.path import basename
-from stat import S_ISDIR
import glob, os, re, subprocess, sys, tempfile

from bup import bloom, git, midx
from bup.git import MissingObject, walk_object
from bup.helpers import \
EXIT_FAILURE, log, note_error, progress, qprogress, reprogress
-from bup.io import path_msg
+from bup.io import walk_path_msg, path_msg
from bup.repo import LocalRepo

# This garbage collector uses a Bloom filter to track the live blobs
@@ -69,20 +68,14 @@ def count_objects(dir, verbosity):


def report_missing(ref_name, item_path):
- ref = path_msg(ref_name)
- i = len(item_path) - 1
- while i >= 0 and item_path[i].type != b'commit':
- i -= 1
- path = path_msg(b'/'.join(x.name for x in item_path[i:]))
item = item_path[-1]
- if S_ISDIR(item.mode):
- note_error(f'missing {item.oid.hex()} {ref}:{path}/\n')
- else:
- note_error(f'missing {item.oid.hex()} {ref}:{path}\n')
+ imsg = walk_path_msg(ref_name, item_path)
+ note_error(f'missing {item.oid.hex()} {imsg}\n')


def find_live_objects(existing_count, cat_pipe, refs=None, *,
- count_missing=False, idx_list=None, verbosity=0):
+ count_missing=False, idx_list=None, for_item=None,
+ verbosity=0):
if count_missing: assert idx_list, (count_missing, idx_list)
pack_dir = git.repo(b'objects/pack')
ffd, bloom_filename = tempfile.mkstemp(b'.bloom', b'tmp-gc-', pack_dir)
@@ -119,6 +112,8 @@ def find_live_objects(existing_count, cat_pipe, refs=None, *,
missing += 1
else:
raise MissingObject(item.oid)
+ if for_item:
+ for_item(ref_name, item_path)
# FIXME: batch ids
if item.type != b'blob':
if verbosity and not item.oid in live_trees:
diff --git a/lib/bup/io.py b/lib/bup/io.py
index 4ccdc58b..05e247f1 100644
--- a/lib/bup/io.py
+++ b/lib/bup/io.py
@@ -1,6 +1,7 @@

from errno import EAGAIN
from os import fsdecode
+from stat import S_ISDIR
import mmap as py_mmap
import os, select, sys, time

@@ -253,6 +254,36 @@ def path_msg(x):
return enc_shs(fsdecode(x))


+def walk_path_msg(ref_name, item_path):
+ # walk ref of
+ # archive/latest/home
+ # produces
+ # archive/latest/home 1c8749ada58cbb2b7e3752db12ee7bbbded5cf84:rlb/.bupm
+ #
+ # Currently, the item_path is always an optional sequence of
+ # commits (named by their oids), followed by the last commit's
+ # tree (also oid named), followed by any remaining items, with
+ # their git tree names.
+ if len(item_path) == 1:
+ return path_msg(item_path[0].name)
+ root = None # either the last commit or a tree
+ path_top = 1 # skips the tree if we have a commit
+ for i in range(-1, - (len(item_path) + 1), -1):
+ if item_path[i].type == b'commit':
+ root = i
+ break
+ if root is None:
+ root = 0
+ else:
+ path_top = root + 2
+ path = path_msg(b'/'.join(x.name for x in item_path[path_top:]))
+ path = f'{path_msg(item_path[root].name)}:{path}'
+ if S_ISDIR(item_path[-1].mode):
+ return f'{path_msg(ref_name)} {path}/'
+ else:
+ return f'{path_msg(ref_name)} {path}'
+
+
def qsql_id(s):
return ''.join(('"', s.replace('"', '""'), '"'))
def qsql_str(s):
diff --git a/test/ext/test-validate-refs b/test/ext/test-validate-refs
new file mode 100755
index 00000000..225bc444
--- /dev/null
+++ b/test/ext/test-validate-refs
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+. ./wvtest-bup.sh || exit $?
+. test/lib/btl.sh || exit $?
+
+set -o pipefail
+
+top="$(WVPASS pwd)" || exit $?
+tmpdir="$(WVPASS wvmktempdir)" || exit $?
+
+export BUP_DIR="$tmpdir/bup"
+export GIT_DIR="$tmpdir/bup"
+
+bup() { "$top/bup" "$@"; }
+
+
+WVPASS cd "$tmpdir"
+
+
+# Additional --links tests are handled in test-validate-ref-links
+
+WVSTART 'handling of correct refs'
+WVPASS rm -rf bup src
+WVPASS bup init
+WVPASS mkdir src
+WVPASS mkdir -p src/a/b
+WVPASS touch src/a/{1,2,3}
+WVPASS "$top/dev/make-splittable-tree" src/split-tree
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+WVPASS bup validate-refs --links 2>&1 | tee validate.log
+WVPASS grep -vE '^missing ' validate.log
+WVPASS bup validate-refs --bupm 2>&1 | tee validate.log
+WVPASS grep -vE '^missing ' validate.log
+WVPASS bup validate-refs 2>&1 | tee validate.log
+WVPASS grep -vE '^missing ' validate.log
+WVPASS rm -rf src bup
+
+WVSTART 'detection of abridged bupms'
+# Create a save with two root files, and then three, and then replace
+# the three entry save's .bupm with the one from the two entry save so
+# we can check that abridged bupms are detected. Note that tree
+# splitting was added to bup well after this bug was fixed.
+WVPASS rm -rf bup src
+WVPASS bup init
+WVPASS mkdir -p src
+WVPASS echo 1 > src/1
+WVPASS echo 2 > src/2
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+WVPASS echo 3 > src/3
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+# Check that validate-refs thinks this is fine
+WVPASS bup validate-refs --links 2>&1 | tee validate.log
+WVPASS grep -vE '^missing ' validate.log
+WVPASS bup validate-refs --bupm 2>&1 | tee validate.log
+WVPASS grep -vE '^missing ' validate.log
+WVPASS bup validate-refs 2>&1 | tee validate.log
+WVPASS grep -vE '^missing ' validate.log
+# Now replace src:.bupm with the abridged one and test
+bupm_1_2_ent="$(WVPASS git ls-tree src~ | WVPASS grep -E $'\t\.bupm$')"
+broken_tree="$(WVPASS git ls-tree src | WVPASS sed -Ee "1s/.*/$bupm_1_2_ent/")"
+broken_tree_oid="$(echo "$broken_tree" | WVPASS git mktree)"
+broken_save=$(WVPASS git commit-tree "$broken_tree_oid" -p src -m 'abridged bupm')
+WVPASS git branch -f src "$broken_save"
+WVPASS bup validate-refs --links 2>&1 | tee validate.log
+WVPASS grep -vE '^missing ' validate.log
+for args in --bupm ''; do
+ WVEXPRC 1 eval 'bup validate-refs $args 2> >(tee validate.log)'
+ WVPASSEQ 1 "$(grep -cE "abridged-bupm refs/heads/src [0-9a-f]{40}:\.bupm" validate.log)"
+done
+
+
+WVPASS cd "$top"
+WVPASS rm -rf "$tmpdir"
diff --git a/wvtest.sh b/wvtest.sh
index 270714ff..e1b83dfd 100644
--- a/wvtest.sh
+++ b/wvtest.sh
@@ -50,6 +50,37 @@ _wvcheck()
}


+WVEXPRC()
+{
+ if test $# -lt 2; then
+ echo 'Usage: WVEXPRC RC_CASE_PATTERN COMMAND [ARG ...]'
+ exit 2
+ fi
+ local exp="$1"
+ shift
+ local TEXT="$*"
+ _wvpushcall "$@"
+
+ _wvfind_caller
+ case "$-" in
+ *e*) set +e; "$@"; rc=$? ;;
+ *) "$@"; rc=$? ;;
+ esac
+ case "$rc" in
+ $exp)
+ _wvcheck 0 "\$?=$rc matches $exp for $TEXT"
+ _wvpopcall
+ return 0
+ ;;
+ *)
+ _wvcheck 1 "\$?=$rc matches $exp for $TEXT"
+ # NOTREACHED
+ return 1
+ ;;
+ esac
+}
+
+

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
The previous information could be completely misleading when a subset
of refs was given since the progress was always/only based on the
total number of objects in the repository.

Expand the progress information to make it clear that the object count
is all objects, and include information about how far we've gotten through
the refs that are being scanned, e.g.

scanned 9 of 742 refs (28.32% of all objects)

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/validate_refs.py | 4 ++--
lib/bup/gc.py | 14 ++++++++++----
2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/lib/bup/cmd/validate_refs.py b/lib/bup/cmd/validate_refs.py
index a754b0f5..b238b0c2 100644
--- a/lib/bup/cmd/validate_refs.py
+++ b/lib/bup/cmd/validate_refs.py
@@ -2,7 +2,7 @@
from bup import git, options, vfs
from bup.compat import argv_bytes
from bup.gc import count_objects, find_live_objects
-from bup.helpers import EXIT_FALSE, EXIT_SUCCESS, EXIT_TRUE, log
+from bup.helpers import EXIT_FALSE, EXIT_SUCCESS, EXIT_TRUE, log, progress
from bup.io import path_msg
from bup.repo import LocalRepo

@@ -52,7 +52,7 @@ def main(argv):
if not extra or (extra and ref_info):
existing_count = count_objects(git.repo(b'objects/pack'), verbosity)
if verbosity:
- log(f'found {existing_count} objects\n')
+ progress(f'found {existing_count} objects\r')

if existing_count:
with git.PackIdxList(git.repo(b'objects/pack')) as idxl:
diff --git a/lib/bup/gc.py b/lib/bup/gc.py
index e4684b94..b1c6b249 100644
--- a/lib/bup/gc.py
+++ b/lib/bup/gc.py
@@ -98,7 +98,13 @@ def find_live_objects(existing_count, cat_pipe, idx_list, refs=None,
oid_exists = (lambda oid: idx_list.exists(oid)) if idx_list else None
approx_live_count = 0
missing = 0
- for ref_name, ref_id in refs if refs else git.list_refs():
+ scan_refs = refs if refs else list(git.list_refs())
+ ref_n = len(scan_refs)
+ def progress_msg():
+ return 'scanned %s of %s ref%s (%02.2f%% of all objects)' \
+ % (ref_i, ref_n, 's' if ref_n > 1 else '',
+ approx_live_count * 100.0 / existing_count)
+ for ref_i, (ref_name, ref_id) in enumerate(scan_refs):
for item_path in walk_object(cat_pipe.get, hexlify(ref_id),
stop_at=stop_at, include_data=None,
oid_exists=oid_exists):
@@ -113,17 +119,17 @@ def find_live_objects(existing_count, cat_pipe, idx_list, refs=None,
else:
raise MissingObject(item.oid)
# FIXME: batch ids
- elif verbosity:
- qprogress('scanned %02.2f%%\r'
- % (approx_live_count * 100.0 / existing_count))
if item.type != b'blob':
if verbosity and not item.oid in live_trees:
approx_live_count += 1
+ qprogress(progress_msg() + '\r')
live_trees.add(item.oid)
else:
if verbosity and not live_blobs.exists(item.oid):
approx_live_count += 1
+ qprogress(progress_msg() + '\r')
live_blobs.add(item.oid)
+ log(progress_msg() + '\n')
maybe_close_bloom.pop_all()
if count_missing:
return live_blobs, live_trees, missing
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index f76465e9..972a8e0c 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -880,11 +880,12 @@ def get_everything(opt):
dest_repo.update_ref(ref_name, new_ref, orig_ref)
if opt.verbose:
new_hex = hexlify(new_ref).decode('ascii')
+ ref_msg = path_msg(ref_name)
if orig_ref:
orig_hex = hexlify(orig_ref).decode('ascii')
- log('updated %r (%s -> %s)\n' % (ref_name, orig_hex, new_hex))
+ log(f'updated {ref_msg} ({orig_hex} -> {new_hex})\n')
else:
- log('updated %r (%s)\n' % (ref_name, new_hex))
+ log(f'updated {ref_msg} ({new_hex})\n')
except (git.GitError, client.ClientError) as ex:
note_error('unable to update ref %r: %s\n' % (ref_name, ex))

--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:34 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 7 +++++++
lib/bup/rewrite.py | 21 +++++++++++++++++----
2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 2fa593d3..1200d3b2 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -853,6 +853,13 @@ def get_everything(opt):
# before creating any database via the Rewriter.
target_items = resolve_targets(opt.target_specs, src_repo, dest_repo)

+ # The current arrangement relies on the assumption that any
+ # (sub)tree created by a --rewrite is exactly the same tree
+ # that --repair would create. If we ever need to change that,
+ # then we'll need to drop the relevant dirs from the rewriter
+ # (db) when switching from a --rewrite to a
+ # --repair. (Regarding the converse, --repairs just never
+ # record directory trees in the database.)
with (Rewriter(split_cfg=dest_split_cfg) if rewrite else nullctx) \
as rewriter:

diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 2594cd49..d4057386 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -28,9 +28,20 @@ from bup.vfs import \
Item, MissingObject, default_exec_mode, default_file_mode, render_path


-# Currently only handles replacing entire vfs-level trees if any
-# consituent object is missing, entire files, and symlinks.
-
+# The current arrangement relies on a number of assumptions:
+#
+# - repairs (when repairs.destructive is true) never remember
+# replacements nor trees representing directories (i.e. not
+# chunked files) because their content can vary (e.g. changing
+# repair-id).
+#
+# - all rewrite created trees (when repairs.destructive is false)
+# are identical to the one --repair would have created, which
+# allows --rewrite to enter those trees into the db and subsequent
+# --repair(s) to re-use them.
+#
+# Rewrites currently only handle replacing entire vfs-level trees if
+# any constituent object is missing, entire files, and symlinks.

def _prep_mapping_table(db, split_cfg):
# This currently only needs to track items that may be split,
@@ -46,7 +57,7 @@ def _prep_mapping_table(db, split_cfg):
db.execute(f'create table if not exists {table_id}'
' (src blob primary key,'
' dst blob not null,'
- ' chunked integer,' # is this a chunked file
+ ' chunked integer,' # chunked file? (0, 1, or NULL (dir))
' size integer)' # only for files
' without rowid')
return table_id
@@ -429,6 +440,8 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
assert item.meta.size == item_size, (item.meta.size, item_size)
chunked = 1 if S_ISDIR(git_mode) else 0

+ # Isn't and must not be dir or replacement (since we must not
+ # remember those).
_remember_rewrite(item.oid, oid, chunked, item_size, wdbc, mapping)
git_mode = _maybe_exec_mode(git_mode, item.meta)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
For use by forthcoming repairs of abridged bupm files.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/repair.py | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/lib/bup/repair.py b/lib/bup/repair.py
index e57ce3bc..c20b1883 100644
--- a/lib/bup/repair.py
+++ b/lib/bup/repair.py
@@ -12,11 +12,10 @@ def valid_repair_id(s):
return False
return True

-
class Repairs:
# Used, for example, to track all repairs in a bup get process
__slots__ = ('id', 'destructive', 'command', '_others', '_repaired_save',
- '_replacements')
+ '_replaced_files', '_replaced_meta')
def __init__(self, id, destructive, command):
assert valid_repair_id(id)
self.id = id
@@ -24,8 +23,11 @@ class Repairs:
self.command = command
self._others = 0
self._repaired_save = {} # requires 3.7+ dict ordering
- self._replacements = []
- def repair_count(self): return len(self._replacements) + self._others
+ self._replaced_files = []
+ self._replaced_meta = []
+ def repair_count(self):
+ return len(self._replaced_files) + len(self._replaced_meta) \
+ + self._others
def note_incidental_repair(self):
# "Safe" repairs that don't involve the repair id.
self._others += 1
@@ -37,11 +39,16 @@ class Repairs:
existing = self._repaired_save.setdefault(path, commit[1].coid)
if existing:
assert existing == commit[1].coid, (existing, revlist, commit)
+ def meta_replaced(self, path):
+ if self.repair_count() == 0:
+ log(b'repairs needed, repair-id: %s\n' % self.id)
+ self._remember_save(path)
+ self._replaced_meta.append(render_path(path[3:]))
def path_replaced(self, path, oid, new_oid):
if self.repair_count() == 0:
log(b'repairs needed, repair-id: %s\n' % self.id)
self._remember_save(path)
- self._replacements.append((render_path(path[3:]), oid, new_oid))
+ self._replaced_files.append((render_path(path[3:]), oid, new_oid))
def repair_trailers(self, repair_id):
assert valid_repair_id(repair_id)
if not self.repair_count():
@@ -50,7 +57,9 @@ class Repairs:
for save_path, coid in self._repaired_save.items():
trailers.append(b'Bup-Repaired-Save: %s %s'
% (hexlify(coid), enc_sh(save_path)))
- for path, oid, new_oid in self._replacements:
+ for path, oid, new_oid in self._replaced_files:
trailers.append(b'Bup-Replaced: %s %s'
% (hexlify(new_oid), enc_sh(path)))
+ for path in self._replaced_meta:
+ trailers.append(b'Bup-Lost-Meta: %s' % enc_sh(path))
return trailers
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Switch from vfs.join to get_ref so that we can detect missing objects
and raise MissingObject instead of producing a more generic
GitError. Likely also a bit more efficient.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/vfs.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 0987a132..86f3dec4 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -469,7 +469,12 @@ def _find_treeish_oid_metadata(repo, oid):
return None

def _readlink(repo, oid):
- return b''.join(repo.join(hexlify(oid)))
+ # symlink blobs are never split
+ _, kind, _, it = get_ref(repo, hexlify(oid))
+ if not kind:
+ raise MissingObject(oid)
+ assert kind == b'blob', kind
+ return b''.join(it)

def readlink(repo, item):
"""Return the link target of item, which must be a symlink. Reads the
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Make sure we only use EOFError for unexpected, erroneous EOFs, and
provide some other way to detect EOF in other cases. Add many missing
EOF checks.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/client.py | 10 ++-
lib/bup/cmd/validate_refs.py | 5 +-
lib/bup/metadata.py | 127 +++++++++++++++++++++++------------
lib/bup/protocol.py | 95 +++++++++++++++++---------
lib/bup/tree.py | 14 ++--
lib/bup/vfs.py | 30 +++++----
lib/bup/vint.py | 39 +++++++----
test/int/test_vint.py | 17 +++--
test/lib/buptest/vfs.py | 15 +++--
9 files changed, 230 insertions(+), 122 deletions(-)

diff --git a/lib/bup/client.py b/lib/bup/client.py
index 6f9d5ca1..d6358a0d 100644
--- a/lib/bup/client.py
+++ b/lib/bup/client.py
@@ -588,6 +588,8 @@ class Client:
with self._call('config-get'):
vint.send(conn, 'ss', name, opttype.encode('ascii') if opttype else b'')
kind = read_vuint(conn)
+ if kind is None:
+ raise EOFError('EOF while reading config value type')
if kind == 0:
return None
elif kind == 1:
@@ -595,9 +597,13 @@ class Client:
elif kind == 2:
return False
elif kind == 3:
- return read_vint(conn)
+ val = read_vint(conn)
+ if val is None: raise EOFError('EOF while reading vint')
+ return val
elif kind == 4:
- return read_bvec(conn)
+ val = read_bvec(conn)
+ if val is None: raise EOFError('EOF while reading bvec')
+ return val
elif kind == 5:
raise PermissionError(f'config-get does not allow remote access to {name}')
else:
diff --git a/lib/bup/cmd/validate_refs.py b/lib/bup/cmd/validate_refs.py
index 03dd2214..e26672b9 100644
--- a/lib/bup/cmd/validate_refs.py
+++ b/lib/bup/cmd/validate_refs.py
@@ -101,11 +101,8 @@ def main(argv):
bupm_n = 0
with tree_data_reader(repo, item.oid) as bupm:
try:
- while True:
- Metadata.read(bupm)
+ while Metadata.read(bupm):
bupm_n += 1
- except EOFError:
- pass
except MissingObject:
return True # bupm sub-item, will be handled by later for_item
except Exception:
diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index 565defe1..93d464d5 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -208,6 +208,9 @@ class ApplyError(Exception):
# Thrown when unable to apply any given bit of metadata to a path.
pass

+# Because we want to use a Metadata instance as a default for
+# Metadata.read() staticmethod arg.
+_use_empty_metadata = object()

class Metadata:
# Metadata is stored as a sequence of tagged binary records. Each
@@ -302,6 +305,8 @@ class Metadata:
else:
raise Exception('unexpected common_rec version %d' % version)
data = vint.read_bvec(port)
+ if data is None:
+ raise EOFError('EOF while reading metadata common records')
values = vint.unpack(unpack_fmt, data)
if version == 3:
(self.mode, self.uid, self.user, self.gid, self.group,
@@ -483,7 +488,10 @@ class Metadata:
return None

def _load_path_rec(self, port):
- self.path = vint.unpack('s', vint.read_bvec(port))[0]
+ data = vint.read_bvec(port)
+ if data is None:
+ raise EOFError('EOF while reading metadata path record')
+ self.path = vint.unpack('s', data)[0]


## Symlink targets
@@ -503,6 +511,8 @@ class Metadata:

def _load_symlink_target_rec(self, port):
target = vint.read_bvec(port)
+ if target is None:
+ raise EOFError('EOF while reading metadata symlink target')
self.symlink_target = target
if self.size is None:
self.size = len(target)
@@ -523,7 +533,10 @@ class Metadata:
return self.hardlink_target

def _load_hardlink_target_rec(self, port):
- self.hardlink_target = vint.read_bvec(port)
+ target = vint.read_bvec(port)
+ if target is None:
+ raise EOFError('EOF while reading metadata hardlink target')
+ self.hardlink_target = target


## POSIX1e ACL records
@@ -580,7 +593,10 @@ class Metadata:

def _load_posix1e_acl_rec(self, port, *, version):
assert version in (1, 2)
- acl_rep = vint.unpack('ssss', vint.read_bvec(port))
+ acl_data = vint.read_bvec(port)
+ if acl_data is None:
+ raise EOFError('EOF while reading POSIX1e ACL metadata')
+ acl_rep = vint.unpack('ssss', acl_data)
if acl_rep[2] == b'':
acl_rep = acl_rep[:2]
if version == 1:
@@ -654,6 +670,8 @@ class Metadata:

def _load_linux_attr_rec(self, port):
data = vint.read_bvec(port)
+ if data is None:
+ raise EOFError('EOF while reading Linux attr metadata')
self.linux_attr = vint.unpack('V', data)[0]

def _apply_linux_attr_rec(self, path, restore_numeric_ids=False):
@@ -702,11 +720,20 @@ class Metadata:

def _load_linux_xattr_rec(self, file):
data = vint.read_bvec(file)
+ if data is None:
+ raise EOFError('EOF while reading Linux xattr metadata')
memfile = BytesIO(data)
result = []
- for i in range(vint.read_vuint(memfile)):
+ xattr_n = vint.read_vuint(memfile)
+ if xattr_n is None:
+ raise EOFError('EOF while reading number of Linux xattrs')
+ for i in range(xattr_n):
key = vint.read_bvec(memfile)
+ if key is None:
+ raise EOFError('EOF while reading Linux xattr metadata key')
value = vint.read_bvec(memfile)
+ if value is None:
+ raise EOFError('EOF while reading Linux xattr metadata value')
result.append((key, value))
self.linux_xattr = result

@@ -885,45 +912,52 @@ class Metadata:
return copy.deepcopy(self).thaw()

@staticmethod
- def read(port):
- # This method should either return a valid Metadata object,
- # return None if there was no information at all (just a
- # _rec_tag_end), throw EOFError if there was nothing at all to
- # read, or throw an Exception if a valid object could not be
- # read completely.
+ def read(port, empty=_use_empty_metadata):
+ """Read an encoded Metadata instance from port, returning None on EOF.
+
+ Return either a valid Metadata object, None on EOF, or empty
+ (defaulting to metadata.empty_metadata) if there was no
+ information at all (just a _rec_tag_end). Throw an Exception
+ if a valid object could not be read completely.
+
+ """
+ if empty is _use_empty_metadata:
+ empty = empty_metadata
tag = vint.read_vuint(port)
- if tag == _rec_tag_end:
+ if tag is None:
return None
- try: # From here on, EOF is an error.
- result = Metadata()
- while True: # only exit is error (exception) or _rec_tag_end
- if tag == _rec_tag_path:
- result._load_path_rec(port)
- elif tag == _rec_tag_common_v3:
- result._load_common_rec(port, version=3)
- elif tag == _rec_tag_common_v2:
- result._load_common_rec(port, version=2)
- elif tag == _rec_tag_symlink_target:
- result._load_symlink_target_rec(port)
- elif tag == _rec_tag_hardlink_target:
- result._load_hardlink_target_rec(port)
- elif tag == _rec_tag_posix1e_acl_v2:
- result._load_posix1e_acl_rec(port, version=2)
- elif tag == _rec_tag_posix1e_acl_v1:
- result._load_posix1e_acl_rec(port, version=1)
- elif tag == _rec_tag_linux_attr:
- result._load_linux_attr_rec(port)
- elif tag == _rec_tag_linux_xattr:
- result._load_linux_xattr_rec(port)
- elif tag == _rec_tag_end:
- return result
- elif tag == _rec_tag_common_v1: # Should be very rare.
- result._load_common_rec(port, version=1)
- else: # unknown record
- vint.skip_bvec(port)
- tag = vint.read_vuint(port)
- except EOFError:
- raise Exception("EOF while reading Metadata")
+ if tag == _rec_tag_end:
+ return empty
+ # From here on, EOF is an error.
+ result = Metadata()
+ while True: # only exit is error (exception) or _rec_tag_end
+ if tag == _rec_tag_path:
+ result._load_path_rec(port)
+ elif tag == _rec_tag_common_v3:
+ result._load_common_rec(port, version=3)
+ elif tag == _rec_tag_common_v2:
+ result._load_common_rec(port, version=2)
+ elif tag == _rec_tag_symlink_target:
+ result._load_symlink_target_rec(port)
+ elif tag == _rec_tag_hardlink_target:
+ result._load_hardlink_target_rec(port)
+ elif tag == _rec_tag_posix1e_acl_v2:
+ result._load_posix1e_acl_rec(port, version=2)
+ elif tag == _rec_tag_posix1e_acl_v1:
+ result._load_posix1e_acl_rec(port, version=1)
+ elif tag == _rec_tag_linux_attr:
+ result._load_linux_attr_rec(port)
+ elif tag == _rec_tag_linux_xattr:
+ result._load_linux_xattr_rec(port)
+ elif tag == _rec_tag_end:
+ return result
+ elif tag == _rec_tag_common_v1: # Should be very rare.
+ result._load_common_rec(port, version=1)
+ else: # unknown record
+ vint.skip_bvec(port)
+ tag = vint.read_vuint(port)
+ if tag is None:
+ raise EOFError('EOF within Metadata entry')

def isdir(self):
return stat.S_ISDIR(self.mode)
@@ -963,6 +997,9 @@ class Metadata:
and self._same_linux_xattr(other)


+empty_metadata = Metadata(frozen=True)
+
+
def from_path(path, statinfo=None, archive_path=None,
save_symlinks=True, hardlink_target=None,
normalized=False, after_stat=None):
@@ -1166,10 +1203,12 @@ def detailed_bytes(meta, fields = None):

class _ArchiveIterator:
def __next__(self):
- try:
- return Metadata.read(self._file)
- except EOFError:
+ m = Metadata.read(self._file)
+ if m is empty_metadata:
+ return None
+ if m is None:
raise StopIteration()
+ return m

next = __next__

diff --git a/lib/bup/protocol.py b/lib/bup/protocol.py
index 8434dae4..65eeb575 100644
--- a/lib/bup/protocol.py
+++ b/lib/bup/protocol.py
@@ -14,31 +14,45 @@ from bup.metadata import Metadata


def read_item(port):
- def read_m(port, has_meta):
+ """Read an encoded VFS item from port. Throw EOFError for EOF."""
+ def read_oid(port, kind):
+ bv = read_bvec(port)
+ if bv is None:
+ raise EOFError(f'EOF while reading {kind} OID')
+ return bv
+ def read_m(port, kind, has_meta):
if has_meta:
m = Metadata.read(port)
- return m
- return read_vuint(port)
+ if m is None:
+ raise EOFError(f'EOF while reading {kind} metadata')
+ else:
+ m = read_vuint(port)
+ if m is None:
+ raise EOFError(f'EOF while reading {kind} integer mode')
+ return m
kind, has_meta = vint.recv(port, 'sV')
if kind == b'Item':
- oid, meta = read_bvec(port), read_m(port, has_meta)
+ oid, meta = read_oid(port, kind), read_m(port, kind, has_meta)
return Item(oid=oid, meta=meta)
if kind == b'Chunky':
- oid, meta = read_bvec(port), read_m(port, has_meta)
+ oid, meta = read_oid(port, kind), read_m(port, kind, has_meta)
return Chunky(oid=oid, meta=meta)
if kind == b'RevList':
- oid, meta = read_bvec(port), read_m(port, has_meta)
+ oid, meta = read_oid(port, kind), read_m(port, kind, has_meta)
return RevList(oid=oid, meta=meta)
if kind == b'Root':
- return Root(meta=read_m(port, has_meta))
+ return Root(meta=read_m(port, kind, has_meta))
if kind == b'Tags':
- return Tags(meta=read_m(port, has_meta))
+ return Tags(meta=read_m(port, kind, has_meta))
if kind == b'Commit':
oid, coid = vint.recv(port, 'ss')
- meta = read_m(port, has_meta)
+ meta = read_m(port, kind, has_meta)
return Commit(oid=oid, coid=coid, meta=meta)
if kind == b'FakeLink':
- target, meta = read_bvec(port), read_m(port, has_meta)
+ target = read_bvec(port)
+ if target is None:
+ raise EOFError(f'EOF while reading {kind} target')
+ meta = read_m(port, kind, has_meta)
return FakeLink(target=target, meta=meta)
assert False

@@ -78,26 +92,6 @@ def write_item(port, item):
else:
assert False

-def write_ioerror(port, ex):
- assert isinstance(ex, vfs.IOError)
- write_vuint(port,
- (1 if ex.errno is not None else 0)
- | (2 if ex.strerror is not None else 0)
- | (4 if ex.terminus is not None else 0))
- if ex.errno is not None:
- write_vint(port, ex.errno)
- if ex.strerror is not None:
- write_bvec(port, ex.strerror.encode('utf-8'))
- if ex.terminus is not None:
- write_resolution(port, ex.terminus)
-
-def read_ioerror(port):
- mask = read_vuint(port)
- no = read_vint(port) if 1 & mask else None
- msg = read_bvec(port).decode('utf-8') if 2 & mask else None
- term = read_resolution(port) if 4 & mask else None
- return vfs.IOError(errno=no, message=msg, terminus=term)
-
def write_resolution(port, resolution):
write_vuint(port, len(resolution))
for name, item in resolution:
@@ -109,16 +103,55 @@ def write_resolution(port, resolution):
port.write(b'\x00')

def read_resolution(port):
+ # resolution must exist; raise EOFError if not
n = read_vuint(port)
+ if n is None:
+ raise EOFError('EOF while reading VFS resolve path length')
result = []
for i in range(n):
name = read_bvec(port)
+ if name is None:
+ raise EOFError(f'EOF while reading VFS resolve path name')
have_item = ord(port.read(1))
+ if have_item is None:
+ raise EOFError(f'EOF while reading VFS resolve path item indicator')
assert have_item in (0, 1)
item = read_item(port) if have_item else None
result.append((name, item))
return tuple(result)

+def write_ioerror(port, ex):
+ assert isinstance(ex, vfs.IOError)
+ write_vuint(port,
+ (1 if ex.errno is not None else 0)
+ | (2 if ex.strerror is not None else 0)
+ | (4 if ex.terminus is not None else 0))
+ if ex.errno is not None:
+ write_vint(port, ex.errno)
+ if ex.strerror is not None:
+ write_bvec(port, ex.strerror.encode('utf-8'))
+ if ex.terminus is not None:
+ write_resolution(port, ex.terminus)
+
+def read_ioerror(port):
+ mask = read_vuint(port)
+ if mask is None:
+ raise EOFError(f'EOF while reading IOError mask')
+ no, msg, term = None, None, None
+ if 1 & mask:
+ no = read_vint(port)
+ if no is None:
+ raise EOFError(f'EOF while reading IOError errno')
+ if 2 & mask:
+ msg = read_bvec(port).decode('utf-8')
+ if msg is None:
+ raise EOFError(f'EOF while reading IOError message')
+ if 4 & mask:
+ term = read_resolution(port)
+ if msg is None:
+ raise EOFError(f'EOF while reading IOError terminus')
+ return vfs.IOError(errno=no, message=msg, terminus=term)
+
def _command(fn):
fn.bup_server_command = True
return fn
@@ -364,6 +397,8 @@ class Server:
have_parent = bool(flags & 4)
parent = read_resolution(self.conn) if have_parent else None
path = read_bvec(self.conn)
+ if path is None:
+ raise EOFError('EOF while reading resolved VFS path length')
if not len(path):
raise Exception('Empty resolve path')
try:
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index 15f4b273..594f60fa 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -12,12 +12,10 @@ from bup.hashsplit import \
from bup.git import shalist_item_sort_key, mangle_name
from bup.helpers import add_error
from bup.io import path_msg
-from bup.metadata import Metadata
+from bup.metadata import Metadata, empty_metadata
from bup.vfs import LostMetadata


-_empty_metadata = Metadata(frozen=True)
-
class TreeItem:
__slots__ = 'name', 'mode', 'gitmode', 'oid', 'meta'
def __init__(self, name, mode, gitmode, oid, meta):
@@ -30,7 +28,7 @@ class TreeItem:
self.mode = mode
self.gitmode = gitmode
self.oid = oid
- self.meta = meta or _empty_metadata
+ self.meta = meta or empty_metadata
def __repr__(self):
cls = self.__class__
return f'<{cls.__module__}.{cls.__name__} object at {hex(id(self))}' \
@@ -129,11 +127,11 @@ def _dir_metadata(dir_meta, items, repair):
# (e.g. tree_items_except_dot).
any_real_meta = False
if isinstance(dir_meta, (int, type(None))):
- meta_ents = [(b'', _empty_metadata)]
+ meta_ents = [(b'', empty_metadata)]
elif isinstance(dir_meta, LostMetadata):
if not repair:
raise Exception(f'LostMetadata for ".", but not repairing {dir_meta!r}')
- meta_ents = [(b'', _empty_metadata)]
+ meta_ents = [(b'', empty_metadata)]
elif isinstance(dir_meta, Metadata):
any_real_meta = True
meta_ents = [(b'', dir_meta)]
@@ -144,12 +142,12 @@ def _dir_metadata(dir_meta, items, repair):
continue
if isinstance(entry.meta, (int, type(None))):
ml = (shalist_item_sort_key((entry.mode, entry.name, None)),
- _empty_metadata)
+ empty_metadata)
elif isinstance(entry.meta, LostMetadata):
if not repair:
raise Exception(f'LostMetadata, but not repairing {entry!r}')
ml = (shalist_item_sort_key((entry.mode, entry.name, None)),
- _empty_metadata)
+ empty_metadata)
elif isinstance(entry.meta, Metadata):
any_real_meta = True
ml = (shalist_item_sort_key((entry.mode, entry.name, None)),
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 8c22578d..902da582 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -105,7 +105,7 @@ from bup.git import \
tree_iter)
from bup.helpers import EXIT_FAILURE, debug2
from bup.io import path_msg
-from bup.metadata import Metadata
+from bup.metadata import Metadata, empty_metadata

py_IOError = IOError

@@ -419,12 +419,13 @@ def item_mode(item):
return m

def _read_dir_meta(bupm):
- # This is because save writes unmodified Metadata() entries for
- # fake parents -- test-save-strip-graft.sh demonstrates.
- m = Metadata.read(bupm)
- if not m:
- return default_dir_mode
- assert m.mode is not None
+ # May be empty because save writes unmodified Metadata() entries
+ # for fake parents -- test-save-strip-graft.sh demonstrates.
+ m = Metadata.read(bupm, empty=default_dir_mode)
+ if m is None:
+ raise EOFError('EOF while reading directory metadata')
+ if isinstance(m, Metadata):
+ assert m.mode is not None
return m

def _treeish_tree_data(repo, oid):
@@ -626,10 +627,10 @@ def _validated_meta_ents(oid, tree_ents, bupm, repair):
if not bupm:
return None
meta_entries = []
- try:
- while True: meta_entries.append(Metadata.read(bupm))
- except EOFError:
- pass
+ m = Metadata.read(bupm)
+ while m:
+ meta_entries.append(None if m is empty_metadata else m)
+ m = Metadata.read(bupm)
exp_meta_n = 0
for ent in tree_ents:
if ent[1] != b'.bupm' and (ent[2] == BUP_CHUNKED or not S_ISDIR(ent[3])):
@@ -747,7 +748,9 @@ def _split_subtree_items(repo, level, oid, entries, names, want_meta, root=True)
yield from _tree_items_except_dot(oid, entries, names)
else:
with _FileReader(repo, bupm_oid) as bupm:
- Metadata.read(bupm) # skip dummy entry provided for older bups
+ # skip dummy entry provided for older bups
+ if not Metadata.read(bupm):
+ raise EOFError('EOF instead of split tree placeholder metadata')
yield from _tree_items_except_dot(oid, entries, names, bupm)
else:
for _, mangled_name, sub_oid in entries:
@@ -815,7 +818,8 @@ def tree_items(repo, oid, tree_data, names, *, want_meta=True, repair=False):
if depth is None:
with _FileReader(repo, bupm_oid) as bupm:
if not dot_requested: # skip it
- Metadata.read(bupm)
+ if not Metadata.read(bupm):
+ raise EOFError('EOF while skipping directory metadata')
else:
yield b'.', Item(oid=oid, meta=_read_dir_meta(bupm))
yield from _tree_items_except_dot(oid, entries, names, bupm,
diff --git a/lib/bup/vint.py b/lib/bup/vint.py
index 03ad4b18..5f8f8a37 100644
--- a/lib/bup/vint.py
+++ b/lib/bup/vint.py
@@ -35,10 +35,11 @@ def encode_vuint(x):
break
return ret

+
def read_vuint(port):
c = port.read(1)
if not c:
- raise EOFError('encountered EOF while reading vuint')
+ return None
assert isinstance(c, bytes)
if ord(c) == 0:
return 0
@@ -82,7 +83,7 @@ def encode_vint(x):
def read_vint(port):
c = port.read(1)
if not c:
- raise EOFError('encountered EOF while reading vint')
+ return None
assert isinstance(c, bytes)
negative = False
result = 0
@@ -95,6 +96,8 @@ def read_vint(port):
if b & 0x80:
offset += 6
c = port.read(1)
+ if not c:
+ raise EOFError('encountered EOF while reading vint')
elif negative:
return -result
else:
@@ -123,15 +126,25 @@ def write_bvec(port, x):

def read_bvec(port):
n = read_vuint(port)
- return port.read(n)
-
+ if n is None:
+ return None
+ val = port.read(n)
+ if len(val) != n: # e.g. EOF when n != 0
+ raise EOFError('EOF while reading bvec bytes')
+ return val

def encode_bvec(x):
return _helpers.vuint_encode(len(x)) + x


def skip_bvec(port):
- port.read(read_vuint(port))
+ n = read_vuint(port)
+ if n is None:
+ raise EOFError('encountered EOF while skipping bvec')
+ val = port.read(n)
+ if not val:
+ raise EOFError('encountered EOF while skipping bvec')
+

def send(port, types, *args):
if len(types) != len(args):
@@ -149,14 +162,14 @@ def send(port, types, *args):
def recv(port, types):
result = []
for type in types:
- if type == 'V':
- result.append(read_vuint(port))
- elif type == 'v':
- result.append(read_vint(port))
- elif type == 's':
- result.append(read_bvec(port))
- else:
- raise Exception('unknown xunpack format string item "' + type + '"')
+ if type == 'V': decode = read_vuint
+ elif type == 'v': decode = read_vint
+ elif type == 's': decode = read_bvec
+ else: raise Exception(f'unknown xunpack format string item {type!r}')
+ x = decode(port)
+ if x is None:
+ raise EOFError(f'EOF while reading xunpack type {type!r}')
+ result.append(x)
return result

def pack(types, *args):
diff --git a/test/int/test_vint.py b/test/int/test_vint.py
index 8c596f08..f8120896 100644
--- a/test/int/test_vint.py
+++ b/test/int/test_vint.py
@@ -17,7 +17,9 @@ def test_vuint():
for x in (0, 1, 42, 128, 10**16, 10**100):
WVPASSEQ(encode_and_decode_vuint(x), x)
WVEXCEPT(Exception, vint.write_vuint, BytesIO(), -1)
- WVEXCEPT(EOFError, vint.read_vuint, BytesIO())
+ assert vint.read_vuint(BytesIO()) is None
+ WVEXCEPT(EOFError, vint.read_vuint, BytesIO(b'\x80'))
+ WVEXCEPT(EOFError, vint.read_vuint, BytesIO(b'\x80\x80'))


def encode_and_decode_vint(x):
@@ -32,8 +34,9 @@ def test_vint():
WVPASSEQ(encode_and_decode_vint(x), x)
for x in [-x for x in values]:
WVPASSEQ(encode_and_decode_vint(x), x)
- WVEXCEPT(EOFError, vint.read_vint, BytesIO())
- WVEXCEPT(EOFError, vint.read_vint, BytesIO(b"\x80\x80"))
+ assert vint.read_vint(BytesIO()) is None
+ WVEXCEPT(EOFError, vint.read_vint, BytesIO(b'\x80'))
+ WVEXCEPT(EOFError, vint.read_vint, BytesIO(b'\x80\x80'))


def encode_and_decode_bvec(x):
@@ -46,7 +49,10 @@ def test_bvec():
values = (b'', b'x', b'foo', b'\0', b'\0foo', b'foo\0bar\0')
for x in values:
WVPASSEQ(encode_and_decode_bvec(x), x)
- WVEXCEPT(EOFError, vint.read_bvec, BytesIO())
+ assert vint.read_bvec(BytesIO()) is None
+ assert b'' == vint.read_bvec(BytesIO(b'\x00'))
+ WVEXCEPT(EOFError, vint.read_bvec, BytesIO(b'\x80'))
+ WVEXCEPT(EOFError, vint.read_bvec, BytesIO(b'\x01'))
outf = BytesIO()
for x in (b'foo', b'bar', b'baz', b'bax'):
vint.write_bvec(outf, x)
@@ -55,6 +61,9 @@ def test_bvec():
WVPASSEQ(vint.read_bvec(inf), b'bar')
vint.skip_bvec(inf)
WVPASSEQ(vint.read_bvec(inf), b'bax')
+ WVEXCEPT(EOFError, vint.skip_bvec, BytesIO(b''))
+ WVEXCEPT(EOFError, vint.skip_bvec, BytesIO(b'\x80'))
+ WVEXCEPT(EOFError, vint.skip_bvec, BytesIO(b'\x01'))


def pack_and_unpack(types, *values):
diff --git a/test/lib/buptest/vfs.py b/test/lib/buptest/vfs.py
index dbd3c609..8c2989c8 100644
--- a/test/lib/buptest/vfs.py
+++ b/test/lib/buptest/vfs.py
@@ -19,8 +19,14 @@ def tree_items(repo, oid):
tree_data, bupm_oid = vfs.tree_data_and_bupm(repo, oid)
bupm = vfs._FileReader(repo, bupm_oid) if bupm_oid else None
try:
- maybe_meta = lambda : Metadata.read(bupm) if bupm else None
- m = maybe_meta()
+ def maybe_meta(name):
+ if not bupm:
+ return None
+ m = Metadata.read(bupm)
+ if not m:
+ raise EOFError(f'EOF while reading metadata entry for {name}')
+ return m
+ m = maybe_meta(b'.')
yield TreeDictValue(name=b'.', oid=oid, meta=m)
tree_ents = vfs.ordered_tree_entries(tree_entries(tree_data), bupm=True)
for name, mangled_name, kind, gitmode, sub_oid in tree_ents:
@@ -30,12 +36,13 @@ def tree_items(repo, oid):
if S_ISDIR(gitmode):
if kind == BUP_CHUNKED:
yield TreeDictValue(name=name, oid=sub_oid,
- meta=maybe_meta())
+ meta=maybe_meta(name))
else:
yield TreeDictValue(name=name, oid=sub_oid,
meta=vfs.default_dir_mode)
else:
- yield TreeDictValue(name=name, oid=sub_oid, meta=maybe_meta())
+ yield TreeDictValue(name=name, oid=sub_oid,
+ meta=maybe_meta(name))
finally:
if bupm:
bupm.close()
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/repair.py | 19 ++++++++++++++--
lib/bup/rewrite.py | 17 +++++---------
lib/bup/vfs.py | 5 +++++
test/ext/test-get-rewrite-missing | 37 ++++++++++++++++++-------------
4 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/lib/bup/repair.py b/lib/bup/repair.py
index d1860686..e57ce3bc 100644
--- a/lib/bup/repair.py
+++ b/lib/bup/repair.py
@@ -2,6 +2,7 @@
from binascii import hexlify

from bup.io import enc_sh, log
+from bup.vfs import Commit, RevList, render_path


def valid_repair_id(s):
@@ -14,27 +15,41 @@ def valid_repair_id(s):

class Repairs:
# Used, for example, to track all repairs in a bup get process
- __slots__ = ('id', 'destructive', 'command', '_others', '_replacements')
+ __slots__ = ('id', 'destructive', 'command', '_others', '_repaired_save',
+ '_replacements')
def __init__(self, id, destructive, command):
assert valid_repair_id(id)
self.id = id
self.destructive = destructive
self.command = command
self._others = 0
+ self._repaired_save = {} # requires 3.7+ dict ordering
self._replacements = []
def repair_count(self): return len(self._replacements) + self._others
def note_incidental_repair(self):
# "Safe" repairs that don't involve the repair id.
self._others += 1
+ def _remember_save(self, path):
+ revlist, commit = path[1:3]
+ assert isinstance(revlist[1], RevList), path
+ assert isinstance(commit[1], Commit), path
+ path = b'%s/%s' % (revlist[0], commit[0])
+ existing = self._repaired_save.setdefault(path, commit[1].coid)
+ if existing:
+ assert existing == commit[1].coid, (existing, revlist, commit)
def path_replaced(self, path, oid, new_oid):
if self.repair_count() == 0:
log(b'repairs needed, repair-id: %s\n' % self.id)
- self._replacements.append((path, oid, new_oid))
+ self._remember_save(path)
+ self._replacements.append((render_path(path[3:]), oid, new_oid))
def repair_trailers(self, repair_id):
assert valid_repair_id(repair_id)
if not self.repair_count():
return []
trailers = [b'Bup-Repair-ID: ' + repair_id]
+ for save_path, coid in self._repaired_save.items():
+ trailers.append(b'Bup-Repaired-Save: %s %s'
+ % (hexlify(coid), enc_sh(save_path)))
for path, oid, new_oid in self._replacements:
trailers.append(b'Bup-Replaced: %s %s'
% (hexlify(new_oid), enc_sh(path)))
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 02f2abf0..2594cd49 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -24,19 +24,14 @@ from bup.metadata import Metadata
from bup.path import xdg_cache
from bup.pwdgrp import userfullname, username
from bup.tree import Stack
-from bup.vfs import Item, MissingObject, default_exec_mode, default_file_mode
+from bup.vfs import \
+ Item, MissingObject, default_exec_mode, default_file_mode, render_path


# Currently only handles replacing entire vfs-level trees if any
# consituent object is missing, entire files, and symlinks.


-def _fs_path_from_vfs(path):
- fs = b'/'.join(x[0] for x in path)
- if not S_ISDIR(vfs.item_mode(path[-1][1])):
- return fs
- return fs + b'/'
-
def _prep_mapping_table(db, split_cfg):
# This currently only needs to track items that may be split,
# depending on the current repo settings (e.g. files and
@@ -96,8 +91,8 @@ def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
return item, dst, GIT_MODE_TREE if chunked else GIT_MODE_FILE

def _path_repaired(path, oid, replacement_oid, missing_oid, repairs):
- fs_path = _fs_path_from_vfs(path)
- repairs.path_replaced(fs_path, oid, replacement_oid)
+ fs_path = render_path(path)
+ repairs.path_replaced(path, oid, replacement_oid)
ep = path_msg(fs_path)
log(f'warning: missing object {missing_oid.hex()} for {ep}\n')
log(f'repaired {ep} {oid.hex()} -> {replacement_oid.hex()}\n')
@@ -169,7 +164,7 @@ def _vfs_walk_dir_recursively(srcrepo, dstrepo, path, excludes, db, mapping,
item = path[-1][1]
assert len(path) >= 3
# drop branch/DATE
- fs_path_in_save = _fs_path_from_vfs((path[0],) + path[3:])
+ fs_path_in_save = render_path((path[0],) + path[3:])

if not repairs.destructive:
entries = vfs.contents(srcrepo, item)
@@ -239,7 +234,7 @@ def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repairs):
except MissingObject as ex:
if have_meta and item.symlink_target is not None:
repairs.note_indidental_repair()
- pm = path_msg(_fs_path_from_vfs(path))
+ pm = path_msg(render_path(path))
log(f'warning: symlink data replaced from metadata for {pm}\n')
target = item.symlink_target
else:
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index f4333643..b246790a 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -1263,3 +1263,8 @@ def join(repo, ref):
if not got[0]:
raise GitError(f'ref {ref} does not exist') # eventually some ENOENT?
yield from _join(*got, [ref])
+
+def render_path(path):
+ if not S_ISDIR(item_mode(path[-1][1])):
+ return b'/'.join(x[0] for x in path)
+ return b'/'.join(x[0] for x in path) + b'/'
diff --git a/test/ext/test-get-rewrite-missing b/test/ext/test-get-rewrite-missing
index b20b72c0..9d41edb5 100755
--- a/test/ext/test-get-rewrite-missing
+++ b/test/ext/test-get-rewrite-missing
@@ -50,6 +50,7 @@ WVPASS bup save --strip -n src src
WVPASS readarray -t saves < <(bup ls src)
save_date="${saves[0]}"
src_oid="$(git rev-parse src)"
+unset saves

WVPASS bup -d dest-repo init
WVPASS git --git-dir dest-repo config bup.split.trees true
@@ -137,10 +138,7 @@ repair-to-dest()
rm -rf dest-repo
WVPASS bup -d dest-repo init
WVPASS git --git-dir dest-repo config bup.split.trees true
- bup -d dest-repo get -s bup --repair --append src 2> repair.log
- rc=$?
- display-file repair.log
- WVPASSEQ 3 "$rc"
+ WVEXPRC 3 eval 'bup -d dest-repo get -s bup --repair --append src 2> >(tee repair.log)'
}

set-repair-id()
@@ -156,10 +154,10 @@ repair-to-dest
set-repair-id

oid_rx='[0-9a-fA-F]{40}'
-missing_file="/src/$save_date/a/missing-file"
-missing_dir="/src/$save_date/missing-dir/"
-missing_partial="/src/$save_date/partial-file"
-missing_split="/src/$save_date/split-tree/"
+missing_file="a/missing-file"
+missing_dir="missing-dir/"
+missing_partial="partial-file"
+missing_split="split-tree/"

WVPASS git --git-dir dest-repo ls-tree src \
| WVPASS grep -E $'\tmissing-dir$' | btl-ent-oid > dir-replacement-oid
@@ -182,18 +180,25 @@ wv-match-rx "${trailers[1]}" '^Bup-Argv: [^ ]+/bup.* save '
wv-match-rx "${trailers[2]}" "^Bup-Version: ${bup_ver//+/\\+}$"
wv-match-rx "${trailers[3]}" '^Bup-Argv: [^ ]+/bup.* get .* --repair '
wv-match-rx "${trailers[4]}" "^Bup-Repair-ID: $repair_id$"
-wv-match-rx "${trailers[5]}" "^Bup-Replaced: $(< blob-replacement-oid) $missing_file$"
-wv-match-rx "${trailers[6]}" "^Bup-Replaced: $(< dir-replacement-oid) $missing_dir$"
-wv-match-rx "${trailers[7]}" \
+wv-match-rx "${trailers[5]}" "^Bup-Repaired-Save: $src_oid src/$save_date$"
+wv-match-rx "${trailers[6]}" "^Bup-Replaced: $(< blob-replacement-oid) $missing_file$"
+wv-match-rx "${trailers[7]}" "^Bup-Replaced: $(< dir-replacement-oid) $missing_dir$"
+wv-match-rx "${trailers[8]}" \
"^Bup-Replaced: $(< partial-file-replacement-oid) $missing_partial$"
-WVPASSEQ "" "${trailers[8]}" # end-of-line
+WVPASSEQ "" "${trailers[9]}" # end-of-line
unset trailers

+src_missing_file="/src/$save_date/a/missing-file"
+src_missing_dir="/src/$save_date/missing-dir/"
+src_missing_partial="/src/$save_date/partial-file"
+src_missing_split="/src/$save_date/split-tree/"
+
# A missing dir .bupm in a non-split repo is indistinguishable from a
# git created tree.

+
WVSTART 'missing blobs are rewritten'
-WVPASS grep -E "^repaired $missing_file $oid_rx -> $oid_rx\$" repair.log
+WVPASS grep -E "^repaired $src_missing_file $oid_rx -> $oid_rx\$" repair.log
WVPASS git --git-dir dest-repo show src:a/missing-file > blob-replacement
display-file blob-replacement
WVPASS grep -E '^This is a replacement for a file' blob-replacement
@@ -203,7 +208,7 @@ WVPASS grep -E "^Missing: $(< file-oid)" blob-replacement


WVSTART 'missing trees are rewritten'
-WVPASS grep -E "^repaired $missing_dir $oid_rx -> $oid_rx\$" repair.log
+WVPASS grep -E "^repaired $src_missing_dir $oid_rx -> $oid_rx\$" repair.log
WVPASS git --git-dir dest-repo show src:missing-dir > tree-replacement
display-file tree-replacement
WVPASS grep -E '^This is a replacement for a tree' tree-replacement
@@ -213,7 +218,7 @@ WVPASS grep -E "^Missing: $(< dir-oid)" tree-replacement


WVSTART 'incomplete chunked files are rewritten'
-WVPASS grep -E "^repaired $missing_partial $oid_rx -> $oid_rx\$" repair.log
+WVPASS grep -E "^repaired $src_missing_partial $oid_rx -> $oid_rx\$" repair.log
WVPASS git --git-dir dest-repo show src:partial-file > partial-file-replacement
display-file partial-file-replacement
WVPASS grep -E '^This is a replacement for a file' partial-file-replacement
@@ -232,7 +237,7 @@ WVSTART 'incomplete split tree (missing top-level sub-tree)'
WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" < split-tree-l1-oid
repair-to-dest
set-repair-id
-WVPASS grep -E "^repaired $missing_split $oid_rx -> $oid_rx\$" repair.log
+WVPASS grep -E "^repaired $src_missing_split $oid_rx -> $oid_rx\$" repair.log
WVPASS git --git-dir dest-repo show src:split-tree > split-tree-replacement
display-file split-tree-replacement
WVPASS grep -E '^This is a replacement for a tree' split-tree-replacement
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/restore.py | 4 ++--
lib/bup/ls.py | 4 ++--
lib/bup/metadata.py | 51 +++++++++++++++++++++++++++---------------
lib/bup/rewrite.py | 3 ++-
lib/bup/tree.py | 4 ++--
lib/bup/vfs.py | 3 ++-
6 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/lib/bup/cmd/restore.py b/lib/bup/cmd/restore.py
index 6363d781..90ad66b0 100644
--- a/lib/bup/cmd/restore.py
+++ b/lib/bup/cmd/restore.py
@@ -1,6 +1,6 @@

from stat import S_ISDIR
-import copy, errno, os, re, stat, sys
+import errno, os, re, stat, sys

from bup import options, vfs
from bup._helpers import write_sparsely
@@ -70,7 +70,7 @@ def parse_owner_mappings(type, options, fatal):
return owner_map

def apply_metadata(meta, name, restore_numeric_ids, owner_map):
- m = copy.deepcopy(meta)
+ m = meta.copy(frozen=False)
m.user = owner_map['user'].get(m.user, m.user)
m.group = owner_map['group'].get(m.group, m.group)
m.uid = owner_map['uid'].get(m.uid, m.uid)
diff --git a/lib/bup/ls.py b/lib/bup/ls.py
index 0fe2412c..ba950a3e 100644
--- a/lib/bup/ls.py
+++ b/lib/bup/ls.py
@@ -42,10 +42,10 @@ def item_info(item, name,
else:
result.append(b'0000000000000000000000000000000000000000 ')
if long_fmt:
- meta = item.meta.copy()
+ meta = item.meta.copy(frozen=False)
meta.path = name
# FIXME: need some way to track fake vs real meta items?
- result.append(metadata.summary_bytes(meta,
+ result.append(metadata.summary_bytes(meta.freeze(),
numeric_ids=numeric_ids,
classification=classification,
human_readable=human_readable))
diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index 91d34bca..7a561627 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -5,11 +5,10 @@
# This code is covered under the terms of the GNU Library General
# Public License as described in the bup LICENSE file.

-from copy import deepcopy
from errno import EACCES, EINVAL, ENOTTY, ENOSYS, EOPNOTSUPP
from io import BytesIO
from time import gmtime, strftime
-import errno, os, sys, stat, socket, struct
+import copy, errno, os, sys, stat, socket, struct

from bup import vint, xstat
from bup.drecurse import recursive_dirlist
@@ -745,12 +744,13 @@ class Metadata:
else:
raise

- __slots__ = ('mode', 'uid', 'gid', 'user', 'group', 'rdev',
+ __slots__ = ('_frozen',
+ 'mode', 'uid', 'gid', 'user', 'group', 'rdev',
'atime', 'mtime', 'ctime', 'path',
'size', 'symlink_target', 'hardlink_target',
'linux_attr', 'linux_xattr', 'posix1e_acl')

- def __init__(self):
+ def __init__(self, *, frozen=False):
self.mode = self.uid = self.gid = self.user = self.group = None
self.rdev = None
self.atime = self.mtime = self.ctime = None
@@ -762,6 +762,29 @@ class Metadata:
self.linux_attr = None
self.linux_xattr = None
self.posix1e_acl = None
+ self._frozen = frozen
+
+ def freeze(self): self._frozen = True; return self
+ def thaw(self): self._frozen = False; return self
+ def __setattr__(self, k, v):
+ if k == '_frozen':
+ return super().__setattr__(k, v)
+ if getattr(self, '_frozen', False):
+ raise AttributeError(f'Cannot change frozen instance attribute {k}',
+ name=k, obj=self)
+ return super().__setattr__(k, v)
+ def __copy__(self):
+ result = self.__new__(self.__class__)
+ for k in [x for x in self.__slots__ if x != '_frozen']:
+ setattr(result, k, copy.copy(getattr(self, k)))
+ result._frozen = self._frozen
+ return result
+ def __deepcopy__(self, memo):
+ result = self.__new__(self.__class__)
+ for k in [x for x in self.__slots__ if x != '_frozen']:
+ setattr(result, k, copy.deepcopy(getattr(self, k), memo))
+ result._frozen = self._frozen
+ return result

def __eq__(self, other):
if not isinstance(other, Metadata): return False
@@ -853,8 +876,12 @@ class Metadata:
ret.append(vint.encode_vuint(_rec_tag_end))
return b''.join(ret)

- def copy(self):
- return deepcopy(self)
+ def copy(self, frozen=None):
+ if frozen is None:
+ return copy.deepcopy(self)
+ if frozen:
+ return copy.deepcopy(self).freeze()
+ return copy.deepcopy(self).thaw()

@staticmethod
def read(port):
@@ -935,18 +962,6 @@ class Metadata:
and self._same_linux_xattr(other)


-class MetadataRO(Metadata):
- __slots__ = '_frozen',
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self._frozen = True
- def __setattr__(self, k, v):
- if getattr(self, '_frozen', None) and hasattr(self, k):
- raise AttributeError(f'Cannot modify read-only instance attribute {k}',
- name=k, obj=self)
- return super().__setattr__(k, v)
-
-
def from_path(path, statinfo=None, archive_path=None,
save_symlinks=True, hardlink_target=None,
normalized=False, after_stat=None):
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index d4057386..a900fca4 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -95,7 +95,8 @@ def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
assert item.meta.size == size
else: # must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
- item.meta.size = size
+ item.meta.thaw().size = size
+ item.meta.freeze()
# it's in the DB and in the destination repo
if chunked is None: # dir, not file
return item, dst, None
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index 99da13fa..b8bd7094 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -9,13 +9,13 @@ from bup.hashsplit import \
GIT_MODE_FILE,
split_to_blob_or_tree)
from bup.helpers import add_error
-from bup.metadata import Metadata, MetadataRO
+from bup.metadata import Metadata
from bup.io import path_msg
from bup.git import shalist_item_sort_key, mangle_name
from bup._helpers import RecordHashSplitter


-_empty_metadata = MetadataRO()
+_empty_metadata = Metadata(frozen=True)

class TreeItem:
__slots__ = 'name', 'mode', 'gitmode', 'oid', 'meta'
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index b246790a..15928f59 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -1194,8 +1194,9 @@ def augment_item_meta(repo, item, *, include_size=False, public=False):
m = item.meta
if isinstance(m, Metadata):
if include_size and m.size is None:
+ m = m.copy(frozen=False)
m.size = maybe_public(m.mode, _compute_item_size(repo, item))
- return item._replace(meta=m)
+ return item._replace(meta=m.freeze())
return item
# m is mode
meta = Metadata()
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/tree.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index 27ae97ea..58369ab4 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -222,8 +222,9 @@ class Stack:
def _write_split_tree(self, dir_meta, items, level=0):
"""Write a (possibly split) tree representing items.

- Write items as either a a single git tree object, or as a "split
- subtree" See DESIGN for additional information.
+ Write items as either a single git tree object, or as a "split
+ subtree" See DESIGN for additional information.
+
"""
assert level >= 0
if not items:
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Return "empty" from detailed_bytes() for empty entries (for, say, "bup
meta -tvv ...") instead of crashing.

Handle empty metadata (None via _ArchiveIterator) in display_archive()
when checking for an invalid request.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/metadata.py | 15 +++++++++------
test/ext/test-meta | 16 ++++++++++++++++
2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index fec5af98..dc808fbc 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -218,11 +218,11 @@ class Metadata:
# record will have some subset of add, encode, load, create, and
# apply methods, i.e. _add_foo...

- # We do allow an "empty" object as a special case, i.e. no
- # records. One can be created by trying to write Metadata(), and
- # for such an object, read() will return None. This is used by
- # "bup save", for example, as a placeholder in cases where
- # from_path() fails.
+ # We do allow an "empty" object as a special case, i.e. via just
+ # Metadata(), and .bupm files may include these (see the
+ # Repository Taxonomy in DESIGN). The current code assumes all
+ # in-memory instances will be the metadata.empty_metadata object,
+ # which Metadata.read() will return by default when appropriate.

# NOTE: if any relevant fields are added or removed, be sure to
# update same_file() below.
@@ -1147,6 +1147,8 @@ def summary_bytes(meta, numeric_ids = False, classification = None,
def detailed_bytes(meta, fields = None):
# FIXME: should optional fields be omitted, or empty i.e. "rdev:
# 0", "link-target:", etc.
+ if not meta:
+ return b'empty'
if not fields:
fields = all_fields

@@ -1196,6 +1198,7 @@ def detailed_bytes(meta, fields = None):


class _ArchiveIterator:
+ """Yields the metadata instances in file, or None for empty metadata."""
def __next__(self):
m = Metadata.read(self._file)
if m is empty_metadata:
@@ -1228,7 +1231,7 @@ def display_archive(file, out):
out.write(b'\n')
elif verbose == 0:
for meta in _ArchiveIterator(file):
- if not meta.path:
+ if not (meta and meta.path):
log('bup: no metadata path, but asked to only display path'
' (increase verbosity?)\n')
sys.exit(EXIT_FAILURE)
diff --git a/test/ext/test-meta b/test/ext/test-meta
index 9e9a35bb..8155858e 100755
--- a/test/ext/test-meta
+++ b/test/ext/test-meta
@@ -432,6 +432,22 @@ src/foo/3"
WVPASS rm -r "$tmpdir"
) || exit $?

+WVSTART 'handling of empty metadata'
+(
+ tmpdir="$(WVPASS wvmktempdir)" || exit $?
+ WVPASS cd "$tmpdir"
+
+ WVPASS bup-python -c "
+from bup.metadata import Metadata
+with open('bupm', 'wb') as f: Metadata().write(f)
+"
+ WVEXPRC "$bup_exit_failure" bup meta -tf bupm
+ WVPASS bup meta -tvf bupm
+ WVPASSEQ "empty" "$(WVPASS bup meta -tvvf bupm)"
+
+ WVPASS rm -r "$tmpdir"
+) || exit $?
+
# Test ownership restoration (when not root or fakeroot).
(
if [ "$root_status" != none ]; then
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
When an exclude applies, it changes the tree, and since the rewrite db
is shared across all of a get's targets we must not store entries for
any such trees. Otherwise a rewrite --copy following a --rewrite might
find and use a tree inserted by the --rewrite that had exclusions.

Since an exclusion also affects all of the tree's parents, for now
just treat this case like --repair, and don't remember *any* trees.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/rewrite.py | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 950581cd..f200f091 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -41,6 +41,10 @@ from bup.vfs import \
# chunked files) because their content can vary (e.g. changing
# repair-id).
#
+# - repairs never remember trees when there are any excludes since
+# different excludes can produce different trees from the same
+# original tree.
+#
# - all rewrite created trees (when repairs.destructive is false)
# are identical to the one --repair would have created, which
# allows --rewrite to enter those trees into the db and subsequent
@@ -284,7 +288,7 @@ def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repairs):
assert item.meta.size == len(item.meta.symlink_target)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

-def _remember_rewrite(from_oid, to_oid, chunked, size, wdbc, mapping):
+def _remember_file_rewrite(from_oid, to_oid, chunked, size, wdbc, mapping):
assert len(from_oid) == 20, from_oid
assert len(to_oid) == 20, to_oid
wdbc.execute(f'select src, dst, chunked, size from {mapping} where src = ?',
@@ -309,7 +313,7 @@ def _maybe_exec_mode(git_mode, meta):
return git_mode

def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
- split_cfg, stack, wdbc, mapping, repairs):
+ split_cfg, stack, wdbc, mapping, excludes, repairs):
"""Returns either None, or, if a directory was missing, the
directory path components.

@@ -402,7 +406,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
# has missing objects when it encounters it a second time (for
# say the second of two saves during an --append), which will
# omit the logging, repair trailers, etc.
- if not repairs.destructive:
+ if not (repairs.destructive or excludes):
wdbc.execute(f'insert into {mapping} (src, dst) values (?, ?)',
(item.oid, newtree))
return
@@ -463,7 +467,7 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,

# Isn't and must not be dir or replacement (since we must not
# remember those).
- _remember_rewrite(item.oid, oid, chunked, item_size, wdbc, mapping)
+ _remember_file_rewrite(item.oid, oid, chunked, item_size, wdbc, mapping)
git_mode = _maybe_exec_mode(git_mode, item.meta)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

@@ -544,7 +548,7 @@ class Rewriter:
_rewrite_save_item(save_path, path, replacement_dir,
srcrepo, dstrepo,
self._split_cfg, stack, dbc,
- self._mapping, repairs)
+ self._mapping, excludes, repairs)

while len(stack) > 1: # pop all parts above root folder
stack.pop()
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Change Metadata to be frozen by default, and alter other interactions
with metadata to accommodate that, and to keep instances read-only as
much as possible.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/index.py | 4 ++++
lib/bup/cmd/meta.py | 3 ++-
lib/bup/cmd/save.py | 15 +++++++++------
lib/bup/index.py | 6 ++++--
lib/bup/metadata.py | 18 +++++++++++-------
lib/bup/rewrite.py | 15 ++++++++++-----
lib/bup/vfs.py | 8 ++++----
test/ext/test-rewrite | 6 ++++--
test/int/test_index.py | 2 +-
test/int/test_vfs.py | 6 ++++--
10 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/lib/bup/cmd/index.py b/lib/bup/cmd/index.py
index 18c5e540..813537a9 100644
--- a/lib/bup/cmd/index.py
+++ b/lib/bup/cmd/index.py
@@ -148,7 +148,9 @@ def update_index(top, excluded_paths, exclude_rxs, fsindex,
# read/used from the index if hashvalid is true. (3)
# "faked" entries will be stale(), and so we'll invalidate
# them below.
+ meta.thaw()
meta.ctime = meta.mtime = meta.atime = 0
+ meta.freeze()
meta_ofs = msw.store(meta)
rig.cur.update_from_stat(pst, meta_ofs)
rig.cur.invalidate()
@@ -172,7 +174,9 @@ def update_index(top, excluded_paths, exclude_rxs, fsindex,
add_error(e)
continue
# See same assignment to 0, above, for rationale.
+ meta.thaw()
meta.atime = meta.mtime = meta.ctime = 0
+ meta.freeze()
meta_ofs = msw.store(meta)
wi.add(path, pst, meta_ofs, hashgen=fake_hash)
if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1:
diff --git a/lib/bup/cmd/meta.py b/lib/bup/cmd/meta.py
index a5611e8b..4b6f1bb4 100644
--- a/lib/bup/cmd/meta.py
+++ b/lib/bup/cmd/meta.py
@@ -128,6 +128,7 @@ def main(argv):
f = open(argv_bytes(path), 'rb')
try:
for m in metadata._ArchiveIterator(f):
+ m.thaw()
if opt.set_uid is not None:
try:
m.uid = int(opt.set_uid)
@@ -150,6 +151,6 @@ def main(argv):
elif opt.set_group is not None:
m.group = argv_bytes(opt.set_group)

- m.write(output_file)
+ m.freeze().write(output_file)
finally:
f.close()
diff --git a/lib/bup/cmd/save.py b/lib/bup/cmd/save.py
index 0d2c4ee6..21b96d09 100644
--- a/lib/bup/cmd/save.py
+++ b/lib/bup/cmd/save.py
@@ -30,6 +30,7 @@ from bup.helpers import \
stripped_path_components,
valid_save_name)
from bup.io import byte_stream, path_msg
+from bup.metadata import empty_metadata
from bup.path import default_fsindex, flat_fsindex
from bup.pwdgrp import userfullname, username
from bup.tree import Stack
@@ -309,11 +310,11 @@ def save_tree(opt, reader, hlink_db, msr, repo, split_cfg):
# Not indexed, so just grab the FS metadata or use empty metadata.
try:
meta = metadata.from_path(fs_path, normalized=True) \
- if fs_path else metadata.Metadata()
+ if fs_path else empty_metadata
except (OSError, IOError) as e:
add_error(e)
lastskip_name = dir_name
- meta = metadata.Metadata()
+ meta = empty_metadata
stack.push(dir_name, meta)

if not file:
@@ -334,10 +335,11 @@ def save_tree(opt, reader, hlink_db, msr, repo, split_cfg):

# it's not a directory
if already_saved_oid:
- meta = msr.metadata_at(ent.meta_ofs)
+ meta = msr.metadata_at(ent.meta_ofs).thaw()
meta.hardlink_target = find_hardlink_target(hlink_db, ent)
# Restore the times that were cleared to 0 in the metastore.
- (meta.atime, meta.mtime, meta.ctime) = (ent.atime, ent.mtime, ent.ctime)
+ meta.atime, meta.mtime, meta.ctime = ent.atime, ent.mtime, ent.ctime
+ meta.freeze()
stack.append_to_current(file, ent.mode, ent.gitmode, ent.sha, meta)
else:
id = None
@@ -374,7 +376,7 @@ def save_tree(opt, reader, hlink_db, msr, repo, split_cfg):
# if the other stat() data might be slightly older than the file
# content (which we can't fix, this is inherently racy, but we
# can prevent the size mismatch.)
- meta.size = 0
+ meta.thaw().size = 0
def write_data(data):
meta.size += len(data)
return repo.write_data(data)
@@ -383,6 +385,7 @@ def save_tree(opt, reader, hlink_db, msr, repo, split_cfg):
mode, id = \
split_to_blob_or_tree(write_data, repo.write_tree,
hashsplit.from_config([f], split_cfg))
+ meta.freeze()
except (IOError, OSError) as e:
add_error('%s: %s' % (ent.name, e))
lastskip_name = ent.name
@@ -417,7 +420,7 @@ def save_tree(opt, reader, hlink_db, msr, repo, split_cfg):

# Finish the root directory.
# When there's a collision, use empty metadata for the root.
- root_meta = metadata.Metadata() if root_collision else None
+ root_meta = empty_metadata if root_collision else None
tree = stack.pop(override_meta=root_meta)

return tree
diff --git a/lib/bup/index.py b/lib/bup/index.py
index 904e6eb8..da58b27e 100644
--- a/lib/bup/index.py
+++ b/lib/bup/index.py
@@ -16,6 +16,7 @@ from bup.helpers import \
qprogress,
resolve_parent,
slashappend)
+from bup.metadata import empty_metadata


EMPTY_SHA = b'\0' * 20
@@ -73,6 +74,7 @@ class MetaStoreReader:
def __exit__(self, type, value, traceback): self.close()

def metadata_at(self, ofs):
+ # Must return a new instance (callers are allowed to modify it)
self._file.seek(ofs)
return metadata.Metadata.read(self._file)

@@ -155,7 +157,7 @@ class Level:
def _golevel(level, f, ename, newentry, metastore, tmax):
# close nodes back up the tree
assert(level)
- default_meta_ofs = metastore.store(metadata.Metadata())
+ default_meta_ofs = metastore.store(empty_metadata)
while ename[:len(level.ename)] != level.ename:
n = BlankNewEntry(level.ename[-1], default_meta_ofs, tmax)
n.flags |= IX_EXISTS
@@ -616,7 +618,7 @@ class Writer:
meta_ofs, 0, 0)
else:
assert(endswith)
- meta_ofs = self.metastore.store(metadata.Metadata())
+ meta_ofs = self.metastore.store(empty_metadata)
e = BlankNewEntry(basename, meta_ofs, self.tmax)
e.gitmode = gitmode
e.sha = sha
diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index 77e3615a..fec5af98 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -5,6 +5,7 @@
# This code is covered under the terms of the GNU Library General
# Public License as described in the bup LICENSE file.

+from copy import deepcopy
from errno import EACCES, EINVAL, ENOTTY, ENOSYS, EOPNOTSUPP
from io import BytesIO
from time import gmtime, strftime
@@ -778,7 +779,7 @@ class Metadata:
'size', 'symlink_target', 'hardlink_target',
'linux_attr', 'linux_xattr', 'posix1e_acl')

- def __init__(self, *, frozen=False):
+ def __init__(self, *, frozen=True):
self.mode = self.uid = self.gid = self.user = self.group = None
self.rdev = None
self.atime = self.mtime = self.ctime = None
@@ -922,7 +923,7 @@ class Metadata:
if tag == _rec_tag_end:
return empty
# From here on, EOF is an error.
- result = Metadata()
+ result = Metadata(frozen=False)
while True: # only exit is error (exception) or _rec_tag_end
if tag == _rec_tag_path:
result._load_path_rec(port)
@@ -943,7 +944,7 @@ class Metadata:
elif tag == _rec_tag_linux_xattr:
result._load_linux_xattr_rec(port)
elif tag == _rec_tag_end:
- return result
+ return result.freeze()
elif tag == _rec_tag_common_v1: # Should be very rare.
result._load_common_rec(port, version=1)
else: # unknown record
@@ -990,7 +991,7 @@ class Metadata:
and self._same_linux_xattr(other)


-empty_metadata = Metadata(frozen=True)
+empty_metadata = Metadata()


def from_path(path, statinfo=None, archive_path=None,
@@ -1000,7 +1001,7 @@ def from_path(path, statinfo=None, archive_path=None,
"""Return the metadata associated with the path. When normalized is
true, return the metadata appropriate for a typical save, which
may or may not be all of it."""
- result = Metadata()
+ result = Metadata(frozen=False)
result.path = archive_path
st = statinfo or xstat.lstat(path)
if after_stat:
@@ -1016,7 +1017,7 @@ def from_path(path, statinfo=None, archive_path=None,
# Only store sizes for regular files and symlinks for now.
if not (stat.S_ISREG(result.mode) or stat.S_ISLNK(result.mode)):
result.size = None
- return result
+ return result.freeze()


def save_tree(output_file, paths,
@@ -1246,8 +1247,9 @@ def start_extract(file, create_symlinks=True):
add_error(Exception('skipping risky path "%s"'
% path_msg(meta.path)))
else:
+ meta = deepcopy(meta).thaw()
meta.path = xpath
- _set_up_path(meta, create_symlinks=create_symlinks)
+ _set_up_path(meta.freeze(), create_symlinks=create_symlinks)


def finish_extract(file, restore_numeric_ids=False):
@@ -1288,7 +1290,9 @@ def extract(file, restore_numeric_ids=False, create_symlinks=True):
add_error(Exception('skipping risky path "%s"'
% path_msg(meta.path)))
else:
+ meta = deepcopy(meta).thaw()
meta.path = xpath
+ meta.freeze()
if verbose:
print('+', path_msg(meta.path), file=sys.stderr)
_set_up_path(meta, create_symlinks=create_symlinks)
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 282c06a8..950581cd 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -101,7 +101,8 @@ def _previous_conversion(dstrepo, item, vfs_dir, db, mapping):
assert item.meta.size == size
else: # must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
- item.meta.thaw().size = size
+ item.meta.thaw()
+ item.meta.size = size
item.meta.freeze()
# it's in the DB and in the destination repo
if chunked is None: # dir, not file
@@ -124,7 +125,7 @@ def _blob_replacement(repo, meta, content):
# REVIEW: does all this seem reasonable?
now = time.time()
oid = repo.write_data(content)
- rm = Metadata()
+ rm = Metadata(frozen=False)
rm.mode = default_file_mode
rm.rdev = 0
rm.atime = rm.mtime = rm.ctime = now
@@ -137,7 +138,7 @@ def _blob_replacement(repo, meta, content):
else:
rm.uid = rm.gid = 0
rm.user = rm.group = b''
- return Item(oid=oid, meta=rm)
+ return Item(oid=oid, meta=rm.freeze())

def _replacement_item(repo, item, kind, kind_msg, repair_id, missing_oid):
# Currently assumes any trailer manipulations will preserve
@@ -273,10 +274,12 @@ def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repairs):

git_mode, oid = GIT_MODE_SYMLINK, dstrepo.write_symlink(target)
if have_meta:
+ # FIXME: not tested?
if item.meta.size is None:
- # must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
+ item.meta.thaw()
item.meta.size = len(item.meta.symlink_target)
+ item.meta.freeze()
else:
assert item.meta.size == len(item.meta.symlink_target)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)
@@ -448,10 +451,12 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
return

if isinstance(item.meta, metadata.Metadata):
+ # FIXME: not tested?
if item.meta.size is None:
- # must not modify vfs results (see vfs docs)
item = vfs.copy_item(item)
+ item.meta.thaw()
item.meta.size = item_size
+ item.meta.freeze()
else:
assert item.meta.size == item_size, (item.meta.size, item_size)
chunked = 1 if S_ISDIR(git_mode) else 0
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index fa3d375f..b82db022 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -598,7 +598,7 @@ class LostMetadata(Metadata):
"""Representation for metadata that's been lost, e.g. due to a bug
like the one that dropped bupm entries."""
def __init__(self, mode):
- super().__init__()
+ super().__init__(frozen=False)
self.mode = mode
self.freeze()

@@ -1284,8 +1284,8 @@ def augment_item_meta(repo, item, *, include_size=False, public=False):
m.size = maybe_public(m.mode, _compute_item_size(repo, item))
return item._replace(meta=m.freeze())
return item
- # m is mode
- meta = Metadata()
+ assert isinstance(m, int), item
+ meta = Metadata(frozen=False)
meta.mode = m
if S_ISLNK(m):
if isinstance(item, FakeLink):
@@ -1296,7 +1296,7 @@ def augment_item_meta(repo, item, *, include_size=False, public=False):
meta.size = len(target)
elif include_size:
meta.size = maybe_public(m, _compute_item_size(repo, item))
- return item._replace(meta=meta)
+ return item._replace(meta=meta.freeze())

def fill_in_metadata_if_dir(repo, item):
"""If item is a directory and item.meta is not a Metadata instance,
diff --git a/test/ext/test-rewrite b/test/ext/test-rewrite
index d9d40104..36394393 100755
--- a/test/ext/test-rewrite
+++ b/test/ext/test-rewrite
@@ -73,13 +73,15 @@ WVSTART rewrite after size not stored
# now do a hack to save without saving the size in metadata ...
WVPASS mkdir -p "$tmpdir/mod"
cat > "$tmpdir/mod/metadata_encode_no_size.py" << EOF
+from copy import deepcopy
from bup import metadata, vfs

_orig_encode_common = metadata.Metadata._encode_common
def _new_encode_common(self):
- self.size = None
+ m = deepcopy(self).thaw()
+ m.size = None
print("encoding common with self.size None")
- return _orig_encode_common(self)
+ return _orig_encode_common(m)
metadata.Metadata._encode_common = _new_encode_common

vfs._compute_item_size = lambda repo, item: -1122334455
diff --git a/test/int/test_index.py b/test/int/test_index.py
index a8ea4240..030a986b 100644
--- a/test/int/test_index.py
+++ b/test/int/test_index.py
@@ -85,7 +85,7 @@ def test_index_dirty(tmpdir):
orig_cwd = os.getcwd()
try:
os.chdir(tmpdir)
- default_meta = metadata.Metadata()
+ default_meta = metadata.empty_metadata

with index.MetaStoreWriter(b'index.meta.tmp') as ms1, \
index.MetaStoreWriter(b'index2.meta.tmp') as ms2, \
diff --git a/test/int/test_vfs.py b/test/int/test_vfs.py
index 90f9328f..b1f4eb91 100644
--- a/test/int/test_vfs.py
+++ b/test/int/test_vfs.py
@@ -80,6 +80,8 @@ def run_augment_item_meta_tests(repo,
wvpass(isinstance(file_item.meta, Metadata))
wvpass(isinstance(link_item.meta, Metadata))
# Note: normally, modifying item.meta values is forbidden
+ file_item.meta.thaw()
+ link_item.meta.thaw()
file_item.meta.size = file_item.meta.size or vfs.item_size(repo, file_item)
link_item.meta.size = link_item.meta.size or vfs.item_size(repo, link_item)

@@ -181,8 +183,8 @@ def test_misc(tmpdir):
wvpasseq(4, vfs.item_size(repo, link_item))
wvpasseq(7, vfs.item_size(repo, file_item))
meta = metadata.from_path(fsencode(__file__))
- meta.size = 42
- fake_item = file_item._replace(meta=meta)
+ meta.thaw().size = 42
+ fake_item = file_item._replace(meta=meta.freeze())
wvpasseq(42, vfs.item_size(repo, fake_item))

_, fakelink_item = vfs.resolve(repo, b'/test/latest', follow=False)[-1]
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
So that the commands will show up in the test output, etc.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
test/ext/test-comparative-split-join | 8 ++------
test/ext/test-fsck | 13 ++++---------
test/ext/test-init | 9 ++++-----
test/ext/test-main | 8 +++-----
test/ext/test-split-files-config | 4 +---
test/ext/test-validate-object-links | 8 ++------
test/ext/test-validate-ref-links | 13 +++++++------
7 files changed, 23 insertions(+), 40 deletions(-)

diff --git a/test/ext/test-comparative-split-join b/test/ext/test-comparative-split-join
index f8995295..dbc6554d 100755
--- a/test/ext/test-comparative-split-join
+++ b/test/ext/test-comparative-split-join
@@ -113,12 +113,8 @@ test-split-join()
-o "ref: refs/heads/master" = "$(< this-bup/HEAD)"
WVPASS rm {other,this}-bup/HEAD
# Exit should be 0 or 5 depending on whether that bup inits repo ids.
- git config --file other-bup/config --unset bup.repo.id;
- rc=$?
- WVPASS test "$rc" = 0 -o "$rc" = 5
- WVPASS git config --file this-bup/config --unset bup.repo.id
- rc=$?
- WVPASS test "$rc" = 0 -o "$rc" = 5
+ WVEXPRC '[05]' git config --file other-bup/config --unset bup.repo.id;
+ WVEXPRC '[05]' git config --file this-bup/config --unset bup.repo.id
WVPASS "$top/dev/compare-trees" --no-times other-bup/ this-bup/

WVPASS cd "$orig_dir"
diff --git a/test/ext/test-fsck b/test/ext/test-fsck
index 4234f00e..c290759a 100755
--- a/test/ext/test-fsck
+++ b/test/ext/test-fsck
@@ -61,8 +61,10 @@ WVPASS bup damage "$BUP_DIR"/objects/pack/*.pack -n10 -s1024 --percent 0.4 -S0
WVFAIL bup fsck --quick

if ! bup fsck --par2-ok; then
+ set -x
bup fsck --quick -rvv -j9
rc=$?
+ set +x
WVPASSNE 0 "$rc"
WVPASSNE 1 "$rc"
else
@@ -113,15 +115,8 @@ else
WVPASS bup damage "$BUP_DIR"/objects/pack/*.pack -n600 -s1 --equal -S0
WVFAIL bup fsck

- bup fsck -rvv # too many errors to be repairable
- rc=$?
- WVPASSNE 0 "$rc"
- WVPASSNE 1 "$rc"
-
- bup fsck -r # too many errors to be repairable
- rc=$?
- WVPASSNE 0 "$rc"
- WVPASSNE 1 "$rc"
+ WVEXPRC '[!01]' bup fsck -rvv # too many errors to be repairable
+ WVEXPRC '[!01]' bup fsck -r # too many errors to be repairable
fi


diff --git a/test/ext/test-init b/test/ext/test-init
index 228e4855..92474f1b 100755
--- a/test/ext/test-init
+++ b/test/ext/test-init
@@ -11,11 +11,10 @@ bup() { "$top/bup" "$@"; }

WVPASS cd "$tmpdir"

+
WVSTART 'command without init fails'
WVPASS mkdir foo
-bup -d nope save -t foo
-rc=$?
-WVPASSEQ 2 "$rc"
+WVEXPRC 2 bup -d nope save -t foo
WVPASS rmdir foo

WVSTART '-d repo argument'
@@ -52,8 +51,8 @@ WVPASS test -d repo/refs/heads
WVPASS test -d repo/objects/pack
WVPASS rm -rf repo

-bup init /dev/null || rc=$?
-WVPASSEQ "$rc" "$bup_exit_failure"
+WVEXPRC "$bup_exit_failure" bup init /dev/null
+

WVPASS cd "$top"
WVPASS rm -rf "$tmpdir"
diff --git a/test/ext/test-main b/test/ext/test-main
index 23590972..ddf1ddb8 100755
--- a/test/ext/test-main
+++ b/test/ext/test-main
@@ -12,14 +12,12 @@ bup() { "$top/bup" "$@"; }

WVPASS cd "$tmpdir"

-WVSTART 'main'
-
-bup
-rc=$?
-WVPASSEQ "$rc" 2

+WVSTART 'main'
+WVEXPRC 2 bup
# Check --x=y handling
WVPASS bup --bup-dir=repo init

+
WVPASS cd "$top"
WVPASS rm -r "$tmpdir"
diff --git a/test/ext/test-split-files-config b/test/ext/test-split-files-config
index ab89a184..b88768ee 100755
--- a/test/ext/test-split-files-config
+++ b/test/ext/test-split-files-config
@@ -20,9 +20,7 @@ WVPASS bup random --seed "$RANDOM" "$size" > data

WVSTART "split with no setting is the same as legacy:13"
WVPASS bup init
-git config -f "$BUP_DIR/config" --unset bup.split.files
-rc=$?
-WVPASSEQ 5 "$rc"
+WVEXPRC 5 git config -f "$BUP_DIR/config" --unset bup.split.files
tree1="$(WVPASS bup split -t data)" || exit $?
WVPASS rm -r bup
WVPASS bup init
diff --git a/test/ext/test-validate-object-links b/test/ext/test-validate-object-links
index d8e7638f..0898eb04 100755
--- a/test/ext/test-validate-object-links
+++ b/test/ext/test-validate-object-links
@@ -29,12 +29,8 @@ bupm_oid="$(WVPIPE git ls-tree src:a | WVPASS head -1 | WVPASS btl-ent-oid)" \
|| exit $?
echo "$bupm_oid" | WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR"

-set -x
-bup validate-object-links > validate-out
-rc=$?
-set +x
-cat validate-out
-WVPASSEQ 1 "$rc"
+WVEXPRC 1 bup validate-object-links > validate-out
+btl-display-file validate-out

src_a_oid="$(git rev-parse src:a)"
WVPASS grep -E "^no $bupm_oid for $src_a_oid" validate-out
diff --git a/test/ext/test-validate-ref-links b/test/ext/test-validate-ref-links
index fc2c6ac7..efdd7d9a 100755
--- a/test/ext/test-validate-ref-links
+++ b/test/ext/test-validate-ref-links
@@ -14,12 +14,13 @@ bup() { "$top/bup" "$@"; }

expect-one-src-missing()
{
- set -x
- bup validate-ref-links "$@" > validate.log 2>&1
- rc=$?
- set +x
- cat validate.log
- WVPASSEQ 1 "$rc"
+ if test $# -eq 0; then
+ args=''
+ else
+ args="$(printf " %q" "$@")"
+ fi
+ WVEXPRC 1 eval "bup validate-ref-links $args > validate.log 2>&1"
+ btl-display-file validate.log
WVPASSEQ 1 "$(grep -cE '^missing ' validate.log)"
}

--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Handle more cases with respect to missing or incorrect symlink
information. Previously, we only handled cases where there was no
information at all.

When there's no information at all, continue replacing that with a
repair file and requiring --repair.

Fix the previous code to "restore" a symlink blob from the
metadata (when we have the latter but not the former). Previously it
was unreachable because vfs.readlink() doesn't throw
MissingObject. When we do "restore", treat it as a repair, reporting a
Bup-Repair-ID id, adding a Bup-Restored-Link-Blob trailer, and exiting with
status 3.

Since it's straightforward, detect the presumably rare situation where
the symlink target in the metadata doesn't match the target in the
blob. Refuse to fix it without --repair (i.e. repairs.destructive)
because we can't change that tree while we're including trees in the
rewrite db --- just mention --repair and EXIT_FAILURE for now. When
repairing, replace the path's blob with the metadata's target and add
a Bup-Fixed-Link-Blob trailer.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Documentation/bup-get.1.md | 12 +++-
lib/bup/repair.py | 26 ++++++-
lib/bup/rewrite.py | 88 ++++++++++++++++-------
lib/bup/vfs.py | 1 +
test/ext/test-get-repair-symlinks | 112 ++++++++++++++++++++++++++++++
test/ext/test-get-rewrite-missing | 2 +
6 files changed, 213 insertions(+), 28 deletions(-)
create mode 100755 test/ext/test-get-repair-symlinks

diff --git a/Documentation/bup-get.1.md b/Documentation/bup-get.1.md
index 3ef5da5c..3c81cc0b 100644
--- a/Documentation/bup-get.1.md
+++ b/Documentation/bup-get.1.md
@@ -142,7 +142,9 @@ used to help test before/after results.)
\--rewrite
: rewrite the data according to the destination repository
configuration, e.g. its `bup.split.files`, and `bup.split.trees`
- values. Currently, `--rewrite`, `---repair`, or `--copy` must be
+ values. Some incidental repairs may be performed during the
+ transfer when they do not materially alter the result (see REPAIRS
+ below). Currently, `--rewrite`, `--repair`, or `--copy` must be
specified whenever the source and destination repository
configurations differ in a relevant way, and so far, `--rewrite`
is only supported for appends and picks. This option is also
@@ -229,7 +231,8 @@ METHODs excludes differ from those for the previous METHOD.
# REPAIRS

`bup get` can fix (or mitigate) a number of known issues during the
-transfer when `--repair` is requested.
+transfer when `--repair` is requested, and a subset of "incidental"
+repairs may also be performed during a `--rewrite`.

* Versions of `bup` at or after 0.25 and before 0.30.1 might rarely
drop metadata entries for non-directories (which can be detected by
@@ -255,6 +258,11 @@ transfer when `--repair` is requested.
for additional information.

+"Incidental" repairs may also be performed --- repairs that do not
+functionally alter the result. For example, bup records symlink
+targets in two places, but generally only refers to one of them. If
+the other one is missing, it can and will be restored from the first.
+
# EXAMPLES

# Update or copy the archives branch in src-repo to the local repository.
diff --git a/lib/bup/repair.py b/lib/bup/repair.py
index c20b1883..ba86cd60 100644
--- a/lib/bup/repair.py
+++ b/lib/bup/repair.py
@@ -15,7 +15,8 @@ def valid_repair_id(s):
class Repairs:
# Used, for example, to track all repairs in a bup get process
__slots__ = ('id', 'destructive', 'command', '_others', '_repaired_save',
- '_replaced_files', '_replaced_meta')
+ '_replaced_files', '_replaced_meta', '_restored_symlink_blobs',
+ '_fixed_symlink_blobs')
def __init__(self, id, destructive, command):
assert valid_repair_id(id)
self.id = id
@@ -25,8 +26,13 @@ class Repairs:
self._repaired_save = {} # requires 3.7+ dict ordering
self._replaced_files = []
self._replaced_meta = []
+ self._restored_symlink_blobs = []
+ self._fixed_symlink_blobs = []
def repair_count(self):
- return len(self._replaced_files) + len(self._replaced_meta) \
+ return len(self._replaced_files) \
+ + len(self._replaced_meta) \
+ + len(self._restored_symlink_blobs) \
+ + len(self._fixed_symlink_blobs) \
+ self._others
def note_incidental_repair(self):
# "Safe" repairs that don't involve the repair id.
@@ -49,6 +55,16 @@ class Repairs:
log(b'repairs needed, repair-id: %s\n' % self.id)
self._remember_save(path)
self._replaced_files.append((render_path(path[3:]), oid, new_oid))
+ def link_blob_restored(self, path, oid):
+ if self.repair_count() == 0:
+ log(b'repairs needed, repair-id: %s\n' % self.id)
+ self._remember_save(path)
+ self._restored_symlink_blobs.append((render_path(path[3:]), oid))
+ def link_blob_fixed(self, path, prev_blob):
+ if self.repair_count() == 0:
+ log(b'repairs needed, repair-id: %s\n' % self.id)
+ self._remember_save(path)
+ self._fixed_symlink_blobs.append((render_path(path[3:]), prev_blob))
def repair_trailers(self, repair_id):
assert valid_repair_id(repair_id)
if not self.repair_count():
@@ -60,6 +76,12 @@ class Repairs:
for path, oid, new_oid in self._replaced_files:
trailers.append(b'Bup-Replaced: %s %s'
% (hexlify(new_oid), enc_sh(path)))
+ for path, oid in self._restored_symlink_blobs:
+ trailers.append(b'Bup-Restored-Link-Blob: %s %s'
+ % (hexlify(oid), enc_sh(path)))
+ for path, prev_blob in self._fixed_symlink_blobs:
+ trailers.append(b'Bup-Fixed-Link-Blob: was %s for %s'
+ % (enc_sh(prev_blob), enc_sh(path)))
for path in self._replaced_meta:
trailers.append(b'Bup-Lost-Meta: %s' % enc_sh(path))
return trailers
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index f200f091..59cacfba 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -6,7 +6,7 @@ from os.path import join as joinp
from re import Pattern
from stat import S_ISDIR, S_ISLNK, S_IRWXG, S_IRWXO, S_ISREG
from typing import Any, Sequence
-import sqlite3, time
+import sqlite3, sys, time

from bup import hashsplit, metadata, vfs
from bup.commit import commit_message
@@ -18,7 +18,8 @@ from bup.hashsplit import \
GIT_MODE_SYMLINK,
GIT_MODE_TREE,
split_to_blob_or_tree)
-from bup.helpers import hostname, log, mkdirp, should_rx_exclude_path, temp_dir
+from bup.helpers import \
+ EXIT_FAILURE, hostname, log, mkdirp, should_rx_exclude_path, temp_dir
from bup.io import path_msg, qsql_id
from bup.metadata import Metadata
from bup.path import xdg_cache
@@ -26,7 +27,8 @@ from bup.pwdgrp import userfullname, username
from bup.repair import Repairs
from bup.tree import Stack
from bup.vfs import \
- (Item,
+ (FakeLink,
+ Item,
LostMetadata,
MissingObject,
default_exec_mode,
@@ -118,6 +120,19 @@ def _meta_replaced(path, repairs):
fs_path = render_path(path[1:])
log(f'warning: metadata lost for {path_msg(fs_path)}\n')

+def _restored_link_blob(path, oid, repairs):
+ fs_path = render_path(path)
+ repairs.link_blob_restored(path, oid)
+ ep = path_msg(fs_path)
+ log(f'warning: restored missing symlink blob {oid.hex()} for {ep}\n')
+
+def _fixed_link_blob(path, oid, meta_link, blob_link, repairs):
+ fs_path = render_path(path)
+ repairs.link_blob_fixed(path, blob_link)
+ ep = path_msg(fs_path)
+ log(f'warning: symlink {ep} ({oid.hex()}) blob {blob_link} != {meta_link}\n')
+ log(f'set {ep} symlink blob to {oid.hex()} -> {meta_link}\n')
+
def _path_repaired(path, oid, replacement_oid, missing_oid, repairs):
fs_path = render_path(path)
repairs.path_replaced(path, oid, replacement_oid)
@@ -256,27 +271,6 @@ def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repairs):
name, item = path[-1]
assert isinstance(name, bytes)
have_meta = isinstance(item.meta, metadata.Metadata)
-
- try:
- target = vfs.readlink(srcrepo, item)
- except MissingObject as ex:
- if have_meta and item.symlink_target is not None:
- repairs.note_indidental_repair()
- pm = path_msg(render_path(path))
- log(f'warning: symlink data replaced from metadata for {pm}\n')
- target = item.symlink_target
- else:
- if not repairs.destructive:
- raise ex
- replacement = _replacement_symlink_item(dstrepo, item,
- repairs.id, ex.oid)
- _path_repaired(path, item.oid, replacement.oid, ex.oid, repairs)
- assert replacement.meta.mode == default_file_mode
- stack.append_to_current(name, default_file_mode, default_file_mode,
- replacement.oid, replacement.meta)
- return
-
- git_mode, oid = GIT_MODE_SYMLINK, dstrepo.write_symlink(target)
if have_meta:
# FIXME: not tested?
if item.meta.size is None:
@@ -286,6 +280,52 @@ def _rewrite_link(path, item_mode, srcrepo, dstrepo, stack, repairs):
item.meta.freeze()
else:
assert item.meta.size == len(item.meta.symlink_target)
+
+ if isinstance(item, FakeLink):
+ target = item.target
+ elif isinstance(item.meta, Metadata):
+ target = item.meta.symlink_target
+ else:
+ target = None
+
+ change = None # restore/replace/fix
+ target_blob = None
+ if not isinstance(item, FakeLink):
+ _, kind, _, it = vfs.get_ref(srcrepo, hexlify(item.oid))
+ if not kind:
+ change = 'restore' if target else 'replace'
+ else:
+ assert kind == b'blob', kind
+ target_blob = b''.join(it)
+ if target_blob != target: # very unlikely, but easy to handle
+ if not repairs.destructive:
+ ep = path_msg(render_path(path))
+ log(f'Symlink with mismatched targets (can --repair): {ep}\n')
+ sys.exit(EXIT_FAILURE)
+ # This defers to target since bup (cf. vfs.readlink())
+ # always prefers it.
+ change = 'fix'
+
+ if change == 'replace':
+ # No metadata symlink info, and blob was missing (git or pre-meta bup)
+ if not repairs.destructive:
+ raise MissingObject(item.oid)
+ replacement = \
+ _replacement_symlink_item(dstrepo, item, repairs.id, item.oid)
+ _path_repaired(path, item.oid, replacement.oid, item.oid, repairs)
+ assert replacement.meta.mode == default_file_mode
+ stack.append_to_current(name, default_file_mode, default_file_mode,
+ replacement.oid, replacement.meta)
+ return
+
+ git_mode, oid = GIT_MODE_SYMLINK, dstrepo.write_symlink(target)
+ # REVIEW: ok if oid != item.oid?
+ if change == 'restore':
+ _restored_link_blob(path, item.oid, repairs)
+ elif change == 'fix':
+ _fixed_link_blob(path, item.oid, target, target_blob, repairs)
+ else:
+ assert change is None, change
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

def _remember_file_rewrite(from_oid, to_oid, chunked, size, wdbc, mapping):
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 86f3dec4..c83f3342 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -479,6 +479,7 @@ def _readlink(repo, oid):
def readlink(repo, item):
"""Return the link target of item, which must be a symlink. Reads the
target from the repository if necessary."""
+ # Consider rewrite._rewrite_link when making changes here
assert repo
assert S_ISLNK(item_mode(item))
if isinstance(item, FakeLink):
diff --git a/test/ext/test-get-repair-symlinks b/test/ext/test-get-repair-symlinks
new file mode 100755
index 00000000..faa179a2
--- /dev/null
+++ b/test/ext/test-get-repair-symlinks
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+. ./wvtest-bup.sh
+. ./test/lib/btl.sh
+
+set -o pipefail
+
+top="$(WVPASS pwd)" || exit $?
+tmpdir="$(WVPASS wvmktempdir)" || exit $?
+
+export BUP_DIR="$tmpdir/bup"
+export GIT_DIR="$tmpdir/bup"
+
+bup() { "$top/bup" "$@"; }
+
+
+WVPASS cd "$tmpdir"
+WVPASS bup init
+
+
+### symlinks with metadata (.bupm entry)
+
+WVPASS mkdir -p src
+WVPASS echo 1 > src/1
+WVPASS ln -s 1 src/2
+WVPASS bup index src
+WVPASS bup save --strip -n src src
+
+two_oid="$(WVPASS git ls-tree src: | grep -E '2$' | btl-ent-oid)"
+echo "$two_oid" | WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" || exit $?
+
+WVEXPRC 2 bup join src: > /dev/null
+
+
+WVSTART 'symlinks with metadata but no blob are noticed'
+WVPASS bup init dst
+WVEXPRC 2 eval 'bup -d dst get -s bup --append src 2> >(tee err.log)'
+WVPASS grep -E "object .*${two_oid}.* is missing" err.log
+WVPASS rm -rf dst
+
+
+for method in --rewrite --repair; do
+ WVSTART "symlinks with metadata but no blob are implicitly repaired by $method"
+ WVPASS bup init dst
+ WVEXPRC 3 eval "bup -d dst get -s bup $method --append src 2> >(tee err.log)"
+ WVPASS bup -d dst join src: > /dev/null
+ WVPASS git --git-dir dst log -n1 src | tee commit-msg
+ WVPASS grep -E "^[ ]+Bup-Restored-Link-Blob: $two_oid 2\$" commit-msg
+ WVPASS rm -rf dst
+done
+
+
+## symlinks with mismatched metadata and blob (very unlikely)
+
+# Change src:2 symlink to point to a "not-1" blob (2 is third line in tree)
+not_1="$(echo -n not-1 | WVPASS git hash-object -w --stdin)" || exit $?
+not_1_ent="120000 blob $not_1 2"
+not_1_tree="$(WVPASS git ls-tree src | WVPASS sed -Ee "3s/.*/$not_1_ent/")" || exit $?
+not_1_tree_oid="$(echo "$not_1_tree" | WVPASS git mktree)" || exit $?
+not_1_save=$(WVPASS git commit-tree "$not_1_tree_oid" -p src -m 'not 1') || exit $?
+WVPASS git branch -f src "$not_1_save"
+
+WVSTART "symlinks with mismatched targets are noticed by --rewrite"
+WVPASS bup init dst
+WVEXPRC 2 eval 'bup -d dst get -s bup --rewrite --pick src/latest 2> >(tee err.log)'
+WVPASS grep -qF "Symlink with mismatched targets (can --repair)" err.log
+WVPASS rm -rf dst
+
+WVSTART "symlinks with mismatched targets are repaired by --repair"
+WVPASS bup init dst
+WVEXPRC 3 eval "bup -d dst get -s bup --repair --pick src/latest 2> >(tee err.log)"
+WVPASS bup -d dst join src: > /dev/null
+WVPASS git --git-dir dst log -n1 src | tee commit-msg
+WVPASS grep -E "^[ ]+Bup-Fixed-Link-Blob: was not-1 for 2\$" commit-msg
+WVPASS rm -rf dst
+
+WVPASS rm -rf bup
+
+
+### symlinks with no metadata (e.g. git or older bup)
+
+WVPASS git init --bare -b src bup
+WVPASS git --work-tree src add .
+WVPASS git --work-tree src commit -am commit
+WVPASS git gc --aggressive # ensure no loose objects (for perforate)
+
+two_oid="$(WVPASS git ls-tree src: | grep -E '2$' | btl-ent-oid)"
+echo "$two_oid" | WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" || exit $?
+
+WVEXPRC 2 bup join src: > /dev/null
+
+
+WVSTART 'symlinks completely missing are noticed by --copy/--rewrite'
+WVPASS bup init dst
+WVEXPRC 2 eval 'bup -d dst get -s bup --append src 2> >(tee err.log)'
+WVPASS grep -E "object .*${two_oid}.* is missing" err.log
+WVEXPRC 2 eval 'bup -d dst get -s bup --rewrite --append src 2> >(tee err.log)'
+WVPASS grep -E "object .*${two_oid}.* is missing" err.log
+WVPASS rm -rf dst
+
+
+WVSTART "symlinks completely missing are repaired by --repair"
+WVPASS bup init dst
+WVEXPRC 3 eval "bup -d dst get -s bup --repair --append src 2> >(tee err.log)"
+WVPASS bup -d dst join src: > /dev/null
+two_rep_oid="$(WVPASS git --git-dir dst ls-tree src | grep -E '2$' | btl-ent-oid)"
+WVPASS git --git-dir dst log -n1 src | tee commit-msg
+WVPASS grep -E "^[ ]+Bup-Replaced: $two_rep_oid 2\$" commit-msg
+WVPASS rm -rf dst
+
+
+WVPASS cd "$top"
+WVPASS rm -rf "$tmpdir"
diff --git a/test/ext/test-get-rewrite-missing b/test/ext/test-get-rewrite-missing
index d7295039..66572078 100755
--- a/test/ext/test-get-rewrite-missing
+++ b/test/ext/test-get-rewrite-missing
@@ -15,6 +15,8 @@ bup() { "$top/bup" "$@"; }
# FIXME: consider checking expected compare-trees differences.


+# Missing symlinks are tested in test-get-repair-symlinks
+
WVPASS cd "$tmpdir"
WVPASS bup init
WVPASS git config bup.split.trees true
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Previously, for something like "get --repair --append ..." we were
accumulating the repair trailers across all of the saves
(commits). Now, include only the repair trailers that apply to the
save being committed in that save's commit message.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/get.py | 109 ++++++++++++++++--------------
lib/bup/repair.py | 5 +-
lib/bup/rewrite.py | 18 ++++-
test/ext/test-get-rewrite-missing | 53 +++++++++++++--
4 files changed, 122 insertions(+), 63 deletions(-)

diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index 1200d3b2..d7179c0c 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -29,9 +29,9 @@ from bup.helpers import \
tty_width)
from bup.io import path_msg
from bup.pwdgrp import userfullname, username
-from bup.repair import Repairs, valid_repair_id
+from bup.repair import valid_repair_id
from bup.repo import LocalRepo, make_repo
-from bup.rewrite import Rewriter
+from bup.rewrite import RepairInfo, Rewriter


argspec = (
@@ -137,12 +137,12 @@ class Spec:
src: bytes
dest: bytes
ignore_missing: bool
- repairs: Optional[Repairs] = None
+ repair_info: Optional[RepairInfo] = None
excludes: Optional[list[Pattern]] = None
rewriter: Optional[Union[bool, Rewriter]] = None
def __post_init__(self):
- assert not (self.ignore_missing and self.repairs), \
- (self.ignore_missing, self.repairs)
+ assert not (self.ignore_missing and self.repair_info), \
+ (self.ignore_missing, self.repair_info)

def spec_msg(s):
if not s.dest:
@@ -193,7 +193,8 @@ def parse_args(args):
misuse(f'--{method} cannot {mode} (only picks and appends)')
if repair_id is None:
repair_id = str(uuid4()).encode('ascii')
- rc = Repairs(repair_id, mode == 'repair', get_argvb())
+ rc = RepairInfo(id=repair_id, destructive=(mode == 'repair'),
+ command=get_argvb())
rw = True
elif mode == 'copy': # explicitly specified no rewrite/repair
rw = False
@@ -202,7 +203,7 @@ def parse_args(args):
else:
raise Exception(f'invalid get target mode {mode!r}')
return Spec(method=method, src=src, dest=dest, excludes=excludes,
- rewriter=rw, ignore_missing=ignore_missing, repairs=rc)
+ rewriter=rw, ignore_missing=ignore_missing, repair_info=rc)

pending_method_context = {} # dict to preserve insertion order
remaining = args[1:] # Skip argv[0]
@@ -332,6 +333,13 @@ def get_random_item(name, hash, src_repo, dest_repo, ignore_missing):
dest_repo.just_write(item.oid, item.type, item.data)


+@dataclass(slots=True, frozen=True)
+class GetResult:
+ oid: Optional[bytes] = None
+ tree: Optional[bytes] = None
+ repairs: int = 0
+
+
def transfer_commit(name, hash, parent, src_repo, dest_repo, ignore_missing):
now = time.time()
items = parse_commit(get_cat_data(src_repo.cat(hash), b'commit'))
@@ -344,11 +352,11 @@ def transfer_commit(name, hash, parent, src_repo, dest_repo, ignore_missing):
author, items.author_sec, items.author_offset,
committer, now, None,
items.message)
- return c, tree
+ return GetResult(c, tree)


def append_commit(src_loc, parent, src_repo, dest_repo, rewriter, excludes,
- repairs, ignore_missing):
+ repair_info, ignore_missing):
if not rewriter:
assert isinstance(src_loc, (bytes, Loc)), src_loc
oidx = src_loc if isinstance(src_loc, bytes) else hexlify(src_loc.hash)
@@ -363,21 +371,25 @@ def append_commit(src_loc, parent, src_repo, dest_repo, rewriter, excludes,
root, ref, save = path
assert isinstance(save[1], (vfs.Commit, vfs.FakeLink)), path
assert isinstance(ref[1], vfs.RevList), path
- return rewriter.append_save(path, parent, src_repo, dest_repo, excludes,
- repairs)
+ save_oid, tree_oid, repairs = \
+ rewriter.append_save(path, parent, src_repo, dest_repo, excludes,
+ repair_info)
+ return GetResult(save_oid, tree_oid, repairs.repair_count())

def append_commits(src_loc, dest_hash, src_repo, dest_repo, rewriter, excludes,
- repairs, ignore_missing):
+ repair_info, ignore_missing):
if not rewriter:
commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
commits.reverse()
last_c, tree = dest_hash, None
for commit in commits:
- last_c, tree = append_commit(commit, last_c, src_repo, dest_repo,
- rewriter, excludes, repairs,
- ignore_missing)
+ res = append_commit(commit, last_c, src_repo, dest_repo, rewriter,
+ excludes, repair_info, ignore_missing)
+ last_c = res.oid
+ tree = res.tree
+ assert res.repairs == 0
assert tree is not None
- return last_c, tree
+ return GetResult(last_c, tree)

# Friendlier checking was done during resolve_*
assert isinstance(src_loc, Loc), src_loc
@@ -398,14 +410,15 @@ def append_commits(src_loc, dest_hash, src_repo, dest_repo, rewriter, excludes,
commits = list(src_repo.rev_list(hexlify(src_loc.hash)))
commits.reverse()

- last_c, tree = dest_hash, None
+ last_c, tree, repair_count = dest_hash, None, 0
for commit in commits:
coid = unhexlify(commit)
- last_c, tree = rewriter.append_save(path + (entry_for_coid[coid],),
- last_c, src_repo, dest_repo,
- excludes, repairs)
+ last_c, tree, repairs = \
+ rewriter.append_save(path + (entry_for_coid[coid],), last_c,
+ src_repo, dest_repo, excludes, repair_info)
+ repair_count += repairs.repair_count()
assert tree is not None
- return last_c, tree
+ return GetResult(last_c, tree, repair_count)


GitLoc = namedtuple('GitLoc', ('ref', 'hash', 'type'))
@@ -585,7 +598,7 @@ def handle_ff(item, src_repo, dest_repo):
get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
item.spec.ignore_missing)
commit_items = parse_commit(get_cat_data(src_repo.cat(src_oidx), b'commit'))
- return item.src.hash, unhexlify(commit_items.tree)
+ return GetResult(item.src.hash, unhexlify(commit_items.tree))
misuse('destination is not an ancestor of source for %s'
% spec_msg(item.spec))
# misuse() doesn't return
@@ -631,12 +644,12 @@ def handle_append(item, src_repo, dest_repo):
commit = dest_repo.write_commit(item.src.hash, parent,
userline, now, None,
userline, now, None, msg)
- return commit, item.src.hash
+ return GetResult(commit, item.src.hash)
if item.dest.hash:
assert item.dest.type in ('branch', 'commit', 'save'), item.dest
return append_commits(item.src, item.dest.hash, src_repo, dest_repo,
item.spec.rewriter, item.spec.excludes,
- item.spec.repairs, item.spec.ignore_missing)
+ item.spec.repair_info, item.spec.ignore_missing)


def resolve_pick(spec, src_repo, dest_repo):
@@ -681,13 +694,13 @@ def handle_pick(item, src_repo, dest_repo):
if item.dest.type in ('branch', 'commit', 'save'):
return append_commit(item.src, item.dest.hash, src_repo, dest_repo,
item.spec.rewriter, item.spec.excludes,
- item.spec.repairs, item.spec.ignore_missing)
+ item.spec.repair_info, item.spec.ignore_missing)
assert item.dest.path.startswith(b'/.tag/'), item.dest
# no parent; either dest is a non-commit tag and we should clobber
# it, or dest doesn't exist.
return append_commit(item.src, None, src_repo, dest_repo,
item.spec.rewriter, item.spec.excludes,
- item.spec.repairs, item.spec.ignore_missing)
+ item.spec.repair_info, item.spec.ignore_missing)


def resolve_new_tag(spec, src_repo, dest_repo):
@@ -715,7 +728,7 @@ def handle_new_tag(item, src_repo, dest_repo):
assert item.dest.path.startswith(b'/.tag/')
get_random_item(item.spec.src, hexlify(item.src.hash),
src_repo, dest_repo, item.spec.ignore_missing)
- return (item.src.hash,)
+ return GetResult(item.src.hash)


def resolve_replace(spec, src_repo, dest_repo):
@@ -747,13 +760,13 @@ def handle_replace(item, src_repo, dest_repo):
if item.dest.path.startswith(b'/.tag/'):
get_random_item(item.spec.src, hexlify(item.src.hash),
src_repo, dest_repo, item.spec.ignore_missing)
- return (item.src.hash,)
+ return GetResult(item.src.hash)
assert(item.dest.type == 'branch' or not item.dest.type)
src_oidx = hexlify(item.src.hash)
get_random_item(item.spec.src, src_oidx, src_repo, dest_repo,
item.spec.ignore_missing)
commit_items = parse_commit(get_cat_data(src_repo.cat(src_oidx), b'commit'))
- return item.src.hash, unhexlify(commit_items.tree)
+ return GetResult(item.src.hash, unhexlify(commit_items.tree))


def resolve_unnamed(spec, src_repo, dest_repo):
@@ -769,7 +782,7 @@ def resolve_unnamed(spec, src_repo, dest_repo):
def handle_unnamed(item, src_repo, dest_repo):
get_random_item(item.spec.src, hexlify(item.src.hash),
src_repo, dest_repo, item.spec.ignore_missing)
- return (None,)
+ return GetResult()


def resolve_targets(specs, src_repo, dest_repo):
@@ -830,6 +843,7 @@ def log_item(name, type, opt, tree=None, commit=None, tag=None):


def get_everything(opt):
+ repair_count = 0
with LocalRepo(repo_dir=opt.source) as src_repo, \
make_repo(derive_repo_addr(remote=opt.remote, die=misuse),
compression_level=opt.compress) as dest_repo:
@@ -897,21 +911,19 @@ def get_everything(opt):
cur_ref = cur_ref or dest_hash

handler = handlers[item.spec.method]
- item_result = handler(item, src_repo, dest_repo)
- if len(item_result) > 1:
- new_id, tree = item_result
- else:
- new_id = item_result[0]
+ get_res = handler(item, src_repo, dest_repo)
+ repair_count += get_res.repairs

if not dest_ref:
log_item(item.spec.src, item.src.type, opt)
else:
- updated_refs[dest_ref] = (orig_ref, new_id)
+ updated_refs[dest_ref] = (orig_ref, get_res.oid)
if dest_ref.startswith(b'refs/tags/'):
- log_item(item.spec.src, item.src.type, opt, tag=new_id)
+ log_item(item.spec.src, item.src.type, opt,
+ tag=get_res.oid)
else:
log_item(item.spec.src, item.src.type, opt,
- tree=tree, commit=new_id)
+ tree=get_res.tree, commit=get_res.oid)

# Only update the refs at the very end, once the destination repo
# finished writing, so that if something goes wrong above, the old
@@ -931,6 +943,14 @@ def get_everything(opt):
except (git.GitError, client.ClientError) as ex:
note_error('unable to update ref %r: %s\n' % (ref_name, ex))

+ if repair_count and not saved_errors:
+ msg = ('Repairs were needed and successful; see above. Additional'
+ ' information may be found in the git log. Search for '
+ ' "Repair-ID:" in "git --git-dir REPO log ..." for the related'
+ ' references.\n')
+ log(f'\n{fill(msg, width=tty_width(), break_on_hyphens=False)}\n')
+ return EXIT_RECOVERED
+ return 0

def main(argv):
opt = parse_args(argv)
@@ -942,15 +962,4 @@ def main(argv):
if not opt.target_specs:
misuse('no methods specified')

- get_everything(opt)
-
- if any(spec.repairs and spec.repairs.repair_count() \
- for spec in opt.target_specs) \
- and not saved_errors:
- msg = ('Repairs were needed and successful; see above. Additional'
- ' information may be found in the git log. Search for '
- ' "Repair-ID:" in "git --git-dir REPO log ..." for the related'
- ' references.\n')
- log(f'\n{fill(msg, width=tty_width(), break_on_hyphens=False)}\n')
- return EXIT_RECOVERED
- return 0
+ return get_everything(opt)
diff --git a/lib/bup/repair.py b/lib/bup/repair.py
index ba86cd60..f64f8a56 100644
--- a/lib/bup/repair.py
+++ b/lib/bup/repair.py
@@ -14,14 +14,13 @@ def valid_repair_id(s):

class Repairs:
# Used, for example, to track all repairs in a bup get process
- __slots__ = ('id', 'destructive', 'command', '_others', '_repaired_save',
+ __slots__ = ('id', 'destructive', '_others', '_repaired_save',
'_replaced_files', '_replaced_meta', '_restored_symlink_blobs',
'_fixed_symlink_blobs')
- def __init__(self, id, destructive, command):
+ def __init__(self, id, destructive):
assert valid_repair_id(id)
self.id = id
self.destructive = destructive
- self.command = command
self._others = 0
self._repaired_save = {} # requires 3.7+ dict ordering
self._replaced_files = []
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index 59cacfba..2126694b 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -511,6 +511,12 @@ def _rewrite_save_item(save_path, path, replacement_dir, srcrepo, dstrepo,
git_mode = _maybe_exec_mode(git_mode, item.meta)
stack.append_to_current(name, item_mode, git_mode, oid, item.meta)

+@dataclass(slots=True, frozen=True)
+class RepairInfo:
+ id: bytes
+ destructive: bool
+ command: Sequence[bytes]
+
class Rewriter:
def __init__(self, *, split_cfg, db=None):
assert isinstance(db, (bytes, type(None)))
@@ -543,12 +549,17 @@ class Rewriter:
pass

def append_save(self, save_path, parent, srcrepo, dstrepo, excludes,
- repairs):
+ repair_info):
+ """Create a new save from save_path with the given parent.
+ Return (save_oid, tree_oid, repairs).
+
+ """
# Strict for now
assert isinstance(parent, (bytes, type(None))), parent
if parent:
assert len(parent) == 20, parent
assert all(isinstance(x, Pattern) for x in excludes)
+ assert isinstance(repair_info, RepairInfo), repair_info
assert len(save_path) == 3, (len(save_path), save_path)
assert isinstance(save_path[1][1], vfs.RevList)
leaf_name, leaf_item = save_path[2]
@@ -563,6 +574,7 @@ class Rewriter:
# Currently, the workdb must always be ready to commit (see finally below)
with closing(self._db_conn.cursor()) as dbc:
try:
+ repairs = Repairs(repair_info.id, repair_info.destructive)
# Maintain a stack of information representing the current
# location in the archive being constructed.
stack = \
@@ -599,13 +611,13 @@ class Rewriter:
author = ci.author_name + b' <' + ci.author_mail + b'>'
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
trailers = repairs.repair_trailers(repairs.id)
- msg = commit_message(ci.message, repairs.command,
+ msg = commit_message(ci.message, repair_info.command,
trailers)
return (dstrepo.write_commit(tree, parent,
author,
ci.author_sec, ci.author_offset,
committer, time.time(), None,
msg),
- tree)
+ tree, repairs)
finally:
self._db_conn.commit() # the workdb is always ready for commit
diff --git a/test/ext/test-get-rewrite-missing b/test/ext/test-get-rewrite-missing
index 66572078..9d855298 100755
--- a/test/ext/test-get-rewrite-missing
+++ b/test/ext/test-get-rewrite-missing
@@ -284,17 +284,23 @@ WVPASS echo 2 > src/a/2
WVPASS echo 3 > src/a/3
WVPASS bup index src
WVPASS bup save --strip -n src src
+# Create a src-two-saves for "accumulation test" later
+WVPASS git branch src-two-saves src
+WVPASS echo 4 > src/a/4
+WVPASS bup index src
+WVPASS bup save --strip -n src-two-saves src
+
WVPASS readarray -t saves < <(bup ls src)
save_date="${saves[0]}"
src_oid="$(git rev-parse src)"
unset saves
two_oid="$(WVPASS git ls-tree src:a | grep -E '2$' | btl-ent-oid)"
echo "$two_oid" | WVPASS "$top/dev/perforate-repo" --drop-oids "$BUP_DIR" || exit $?
-WVPASS mv bup bup-123
+WVPASS mv bup bup-1234


WVSTART '--rewrite (not --repair) on broken save after --repair still fails'
-WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVPASS rm -rf bup && WVPASS cp -pPR bup-1234 bup
# Check that non-rewrite doesn't notice the missing object (b/c same repo)
WVEXPRC 3 bup get --repair --append: src r1 --copy --append: src r2
# Then that a second *rewrite* can't handle the missing file
@@ -306,7 +312,7 @@ WVPASS grep -E 'raise MissingObject' get.log


WVSTART '--ignore-missing on broken save after --repair'
-WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVPASS rm -rf bup && WVPASS cp -pPR bup-1234 bup
WVPASS bup init bup-dest
WVEXPRC 2 eval 'bup -d bup-dest get -s bup' \
'--repair --append: src r1' \
@@ -321,14 +327,14 @@ WVPASS grep -E '^skipping missing source object' get.log


WVSTART '--copy --rewrite --repair'
-WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVPASS rm -rf bup && WVPASS cp -pPR bup-1234 bup
WVEXPRC 3 eval 'bup get --rewrite --repair --repair-id r1 --append: src r1' \
'2>&1 | tee get.log'
WVPASSEQ 1 "$(git show r1:a/2 | grep -cE '^Bup-Replacement-Info: r1$')"
-WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVPASS rm -rf bup && WVPASS cp -pPR bup-1234 bup
WVEXPRC 2 eval 'bup get --repair --rewrite --append: src r1 2>&1 | tee get.log'
WVPASS grep -E 'raise MissingObject' get.log
-WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVPASS rm -rf bup && WVPASS cp -pPR bup-1234 bup
WVPASS rm -rf bup-dest && bup init bup-dest
WVEXPRC 2 eval 'bup -d bup-dest get -s bup --repair --copy --append: src r1' \
'2>&1 | tee get.log'
@@ -337,7 +343,7 @@ WVPASS rm -rf bup bup-dest


WVSTART 'multiple gets with differing repair-ids'
-WVPASS rm -rf bup && WVPASS cp -pPR bup-123 bup
+WVPASS rm -rf bup && WVPASS cp -pPR bup-1234 bup
WVEXPRC 3 eval 'bup get --repair' \
'--repair-id repair-1 --append: src r1' \
'--repair-id repair-2 --append: src r2' \
@@ -383,5 +389,38 @@ WVPASSEQ 1 "$(WVPASS grep -cE '^Bup-Replacement-Info: repair-1$' replacement-1)"
WVPASSEQ 1 "$(WVPASS grep -cE '^Bup-Replacement-Info: repair-2$' replacement-2)"


+WVSTART "trailers don't accumulate across repaired saves"
+WVPASS rm -rf bup && WVPASS cp -pPR bup-1234 bup
+WVEXPRC 3 eval 'bup get --repair' \
+ '--repair-id repair-id --append: src-two-saves r1' \
+ '2>&1 | tee get.log'
+WVPASS git show -s --pretty=email r1^ | tee repair-msg-1
+WVPASS git show -s --pretty=email r1 | tee repair-msg-2
+WVPASS git interpret-trailers --parse < repair-msg-1 | tee repair-trailers-1
+WVPASS git interpret-trailers --parse < repair-msg-2 | tee repair-trailers-2
+btl-display-file repair-trailers-1
+readarray -t trailers < repair-trailers-1
+wv-match-rx "${trailers[0]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[1]}" '^Bup-Argv: [^ ]+/bup.* save --strip -n src src'
+wv-match-rx "${trailers[2]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[3]}" '^Bup-Argv: [^ ]+/bup.* get --repair .* src-two-saves r1'
+wv-match-rx "${trailers[4]}" "^Bup-Repair-ID: repair-id$"
+wv-match-rx "${trailers[5]}" "^Bup-Repaired-Save: [0-9a-f]{40} src-two-saves/"
+wv-match-rx "${trailers[6]}" "^Bup-Replaced: [0-9a-f]{40} a/2$"
+WVPASSEQ '' "${trailers[7]}" # end-of-line
+unset trailers
+btl-display-file repair-trailers-2
+readarray -t trailers < repair-trailers-2
+wv-match-rx "${trailers[0]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[1]}" '^Bup-Argv: [^ ]+/bup.* save --strip -n src-two-saves src'
+wv-match-rx "${trailers[2]}" "^Bup-Version: ${bup_ver//+/\\+}$"
+wv-match-rx "${trailers[3]}" '^Bup-Argv: [^ ]+/bup.* get --repair .* src-two-saves r1'
+wv-match-rx "${trailers[4]}" "^Bup-Repair-ID: repair-id$"
+wv-match-rx "${trailers[5]}" "^Bup-Repaired-Save: [0-9a-f]{40} src-two-saves/"
+wv-match-rx "${trailers[6]}" "^Bup-Replaced: [0-9a-f]{40} a/2$"
+WVPASSEQ '' "${trailers[7]}" # end-of-line
+unset trailers
+
+
WVPASS cd "$top"
WVPASS rm -rf "$tmpdir"
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
note/main.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/note/main.md b/note/main.md
index 30143352..797c3979 100644
--- a/note/main.md
+++ b/note/main.md
@@ -155,9 +155,9 @@ General
(and then our use of git) didn't allow otherwise.

* The commit message format has changed to place the command in a
- POSIX quoted `Bup-Argv` trailer (git-interpret-trailers(1)) and the
- version in a `Bup-Version` trailer, but note that the format is not
- settled, i.e. may continue to change. The command quoting avoids
+ POSIX quoted `Bup-Argv` trailer (`git-interpret-trailers`(1)) and
+ the version in a `Bup-Version` trailer, but note that the format is
+ not settled, i.e. may continue to change. The command quoting avoids
quoting arguments when possible, single quotes when there's no
single quote or newline, and falls back to `$'...'` quoting
otherwise.
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/join.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/bup/cmd/join.py b/lib/bup/cmd/join.py
index 88bce059..1d5a6c0f 100644
--- a/lib/bup/cmd/join.py
+++ b/lib/bup/cmd/join.py
@@ -5,7 +5,7 @@ import sys
from bup import options
from bup.compat import argv_bytes
from bup.config import derive_repo_addr
-from bup.helpers import linereader, log
+from bup.helpers import EXIT_FAILURE, linereader, log
from bup.io import byte_stream
from bup.repo import make_repo

@@ -42,6 +42,6 @@ def main(argv):
except KeyError as e:
outfile.flush()
log('error: %s\n' % e)
- ret = 1
+ ret = EXIT_FAILURE

sys.exit(ret)
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Drop the intermediate treefile variable in vfs join, since we don't need it.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/vfs.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index e5fd1467..8fa9e2aa 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -1336,8 +1336,7 @@ def join(repo, ref):
if typ == b'blob':
yield from it
elif typ == b'tree':
- treefile = b''.join(it)
- for ent_mode, ent_name, ent_oid in tree_iter(treefile):
+ for ent_mode, ent_name, ent_oid in tree_iter(b''.join(it)):
yield from _join(*get_oidx(repo, hexlify(ent_oid)), path + [ent_name])
elif typ == b'commit':
treeline = b''.join(it).split(b'\n')[0]
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/cmd/rewrite.py | 2 ++
1 file changed, 2 insertions(+)

diff --git a/lib/bup/cmd/rewrite.py b/lib/bup/cmd/rewrite.py
index c9912603..6951cbee 100755
--- a/lib/bup/cmd/rewrite.py
+++ b/lib/bup/cmd/rewrite.py
@@ -190,6 +190,8 @@ def rewrite_branch(srcrepo, src, dstrepo, dst, excludes, workdb, fatal):

vfs_branch = vfs.resolve(srcrepo, src)
item = vfs_branch[-1][1]
+ if not item:
+ fatal(f'cannot access {path_msg(src)} in source\n')
commit_oid_name = {
c[1].coid: c[0]
for c in vfs.contents(srcrepo, item)
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Consider the git "kind" (BUP_CHUNKED, BUP_NORMAL) when picking the
default mode, and centralize the selection in
_default_mode_for_gitinfo (renamed from _default_mode_for_gitmode).

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/vfs.py | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 793b3e6f..f4333643 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -89,6 +89,7 @@ import re
from bup import git
from bup.git import \
(BUP_CHUNKED,
+ BUP_NORMAL,
MissingObject,
GitError,
find_tree_entry,
@@ -117,12 +118,15 @@ default_exec_mode = S_IFREG | _exec_perms
default_dir_mode = S_IFDIR | _exec_perms
default_symlink_mode = S_IFLNK | _exec_perms

-def _default_mode_for_gitmode(gitmode):
+def _default_mode_for_gitinfo(gitmode, kind):
+ assert kind in (BUP_CHUNKED, BUP_NORMAL)
if S_ISREG(gitmode):
if gitmode & S_IXUSR:
return default_exec_mode
return default_file_mode
if S_ISDIR(gitmode):
+ if kind == BUP_CHUNKED:
+ return default_file_mode
return default_dir_mode
if S_ISLNK(gitmode):
return default_symlink_mode
@@ -602,14 +606,15 @@ def _tree_items_except_dot(oid, entries, names=None, bupm=None):
def tree_item(ent_oid, kind, gitmode):
if kind == BUP_CHUNKED:
assert S_ISDIR(gitmode), (ent_oid, kind, gitmode)
- meta = read_nondir_meta(bupm, default_file_mode)
+ meta = read_nondir_meta(bupm, _default_mode_for_gitinfo(gitmode, kind))
return Chunky(oid=ent_oid, meta=meta)

if S_ISDIR(gitmode):
# No metadata here (accessable via '.' inside ent_oid).
- return Item(meta=default_dir_mode, oid=ent_oid)
+ return Item(meta=_default_mode_for_gitinfo(gitmode, kind),
+ oid=ent_oid)

- meta = read_nondir_meta(bupm, _default_mode_for_gitmode(gitmode))
+ meta = read_nondir_meta(bupm, _default_mode_for_gitinfo(gitmode, kind))
return Item(oid=ent_oid, meta=meta)

tree_ents = ordered_tree_entries(entries, bupm)
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:35 PM (3 days ago) Dec 10
to bup-...@googlegroups.com
Change errors to start with "error: " and to put the path at the end
after a colon. Switch from add_error to note_error to avoid
truncation.

Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
lib/bup/metadata.py | 119 ++++++++++++++++++++------------------
test/int/test_metadata.py | 8 +--
2 files changed, 66 insertions(+), 61 deletions(-)

diff --git a/lib/bup/metadata.py b/lib/bup/metadata.py
index dc808fbc..9ee5fe65 100644
--- a/lib/bup/metadata.py
+++ b/lib/bup/metadata.py
@@ -15,12 +15,12 @@ from bup import vint, xstat
from bup.drecurse import recursive_dirlist
from bup.helpers import \
(EXIT_FAILURE,
- add_error,
- mkdirp,
- log,
- is_superuser,
format_filesize,
- getgroups)
+ getgroups,
+ is_superuser,
+ log,
+ mkdirp,
+ note_error)
from bup.io import path_msg
from bup.pwdgrp import pwd_from_uid, pwd_from_name, grp_from_gid, grp_from_name
from bup.xstat import utime, lutime
@@ -400,8 +400,8 @@ class Metadata:
# FIXME: S_ISDOOR, S_IFMPB, S_IFCMP, S_IFNWK, ... see stat(2).
else:
assert(not self._recognized_file_type())
- add_error('not creating "%s" with unrecognized mode "0x%x"\n'
- % (path_msg(path), self.mode))
+ note_error(f'error: unrecognized mode 0{self.mode:o},'
+ f' not creating: {path_msg(path)}\n')

def _apply_common_rec(self, path, restore_numeric_ids=False):
if not self.mode:
@@ -414,7 +414,8 @@ class Metadata:
lutime(path, (self.atime or 0, self.mtime or 0))
except OSError as e:
if e.errno == errno.EACCES:
- raise ApplyError('lutime: %s' % e)
+ erm = e.strerror or e.errno
+ raise ApplyError(f'lutime ({erm}): {path_msg(path)}')
else:
raise
else:
@@ -422,7 +423,8 @@ class Metadata:
utime(path, (self.atime or 0, self.mtime or 0))
except OSError as e:
if e.errno == errno.EACCES:
- raise ApplyError('utime: %s' % e)
+ erm = e.strerror or e.errno
+ raise ApplyError(f'utime ({erm}): {path_msg(path)}')
else:
raise

@@ -458,11 +460,12 @@ class Metadata:
os.lchown(path, uid, gid)
except OSError as e:
if e.errno == errno.EPERM:
- add_error('lchown: %s' % e)
+ erm = e.strerror or e.errno
+ note_error(f'error: lchown ({erm}): {path_msg(path)}\n')
elif sys.platform.startswith('cygwin') \
- and e.errno == errno.EINVAL:
- add_error('lchown: unknown uid/gid (%d/%d) for %s'
- % (uid, gid, path_msg(path)))
+ and e.errno == errno.EINVAL:
+ note_error('error: lchown: unknown uid/gid (%d/%d): %s\n'
+ % (uid, gid, path_msg(path)))
else:
raise

@@ -505,7 +508,8 @@ class Metadata:
# one that was in place when we did stat()
self.size = len(self.symlink_target)
except OSError as e:
- add_error('readlink: %s' % e)
+ erm = e.strerror or e.errno
+ note_error(f'error: readlink ({erm}): {path_msg(path)}\n')

def _encode_symlink_target(self):
return self.symlink_target
@@ -583,11 +587,11 @@ class Metadata:
acl = acls[i]
if b',' in acl:
if path:
- msg = f'Unexpected comma in ACL entry; ignoring {acl!r}' \
- f' for {path_msg(path)}\n'
+ msg = 'error: unexpected comma in ACL entry;' \
+ f' ignoring {acl!r}: {path_msg(path)}\n'
else:
- msg = f'Unexpected comma in ACL entry; ignoring {acl!r}\n'
- add_error(msg)
+ msg = f'error: unexpected comma in ACL entry; ignoring {acl!r}\n'
+ note_error(msg)
return None
acls[i] = acl.replace(b'\n', b',')
return acls
@@ -609,8 +613,8 @@ class Metadata:
return

if not apply_acl:
- add_error("%s: can't restore ACLs; posix1e support missing.\n"
- % path_msg(path))
+ note_error("error: can't restore POSIX1e ACL (no support): %s\n"
+ % path_msg(path))
return

try:
@@ -621,15 +625,13 @@ class Metadata:
else:
apply_acl(path, acls[offs])
except IOError as e:
- if e.errno == errno.EINVAL:
- # libacl returns with errno set to EINVAL if a user
- # (or group) doesn't exist
- raise ApplyError("POSIX1e ACL: can't create %r for %r"
- % (acls, path_msg(path)))
- elif e.errno in (errno.EPERM, errno.EOPNOTSUPP):
- raise ApplyError('POSIX1e ACL applyto: %s' % e)
- else:
+ # libacl returns with errno set to EINVAL if a user (or
+ # group) doesn't exist
+ if e.errno not in (errno.EINVAL, errno.EPERM, errno.EOPNOTSUPP):
raise
+ erm = e.strerror or e.errno
+ msg = f'POSIX1e ACL apply failed ({erm}): {path_msg(path)}'
+ raise ApplyError(msg)


## Linux attributes (lsattr(1), chattr(1))
@@ -644,16 +646,17 @@ class Metadata:
self.linux_attr = attr
except OSError as e:
if e.errno == errno.EACCES:
- add_error('read Linux attr: %s' % e)
+ erm = e.strerror or e.errno
+ note_error(f'error: attr read failed ({erm}): {path_msg(path)}\n')
elif e.errno in (ENOTTY, ENOSYS, EOPNOTSUPP):
# Assume filesystem doesn't support attrs.
return
elif e.errno == EINVAL:
global _warned_about_attr_einval
if not _warned_about_attr_einval:
- log("Ignoring attr EINVAL;"
- + " if you're not using ntfs-3g, please report: "
- + path_msg(path) + '\n')
+ log('Ignoring attr EINVAL;'
+ " if you're not using ntfs-3g, please report:"
+ f' {path_msg(path)}\n')
_warned_about_attr_einval = True
return
else:
@@ -679,19 +682,23 @@ class Metadata:
if self.linux_attr:
check_linux_file_attr_api()
if not set_linux_file_attr:
- add_error("%s: can't restore linuxattrs: "
- "linuxattr support missing.\n" % path_msg(path))
+ note_error("error: can't restore linuxattrs (no support): %s\n"
+ % path_msg(path))
return
try:
set_linux_file_attr(path, self.linux_attr)
except OSError as e:
if e.errno in (EACCES, ENOTTY, EOPNOTSUPP, ENOSYS):
- raise ApplyError('Linux chattr: %s (0x%s)'
- % (e, hex(self.linux_attr)))
+ raise ApplyError('chattr(0x%s) failed (%s): %s'
+ % (hex(self.linux_attr),
+ e.strerror or e.errno,
+ path_msg(path)))
elif e.errno == EINVAL:
- msg = "if you're not using ntfs-3g, please report"
- raise ApplyError('Linux chattr: %s (0x%s) (%s)'
- % (e, hex(self.linux_attr), msg))
+ raise ApplyError('chattr(0x%s) failed (%s),'
+ ' please report if this is not ntfs-3g: %s'
+ % (hex(self.linux_attr),
+ e.strerror or e.errno,
+ path_msg(path)))
else:
raise

@@ -741,8 +748,8 @@ class Metadata:
def _apply_linux_xattr_rec(self, path, restore_numeric_ids=False):
if not xattr:
if self.linux_xattr:
- add_error("%s: can't restore xattr; xattr support missing.\n"
- % path_msg(path))
+ note_error("error: can't restore xattrs (no support): %s\n"
+ % path_msg(path))
return
if not self.linux_xattr:
return
@@ -750,7 +757,8 @@ class Metadata:
existing_xattrs = set(xattr.list(path, nofollow=True))
except IOError as e:
if e.errno == errno.EACCES:
- raise ApplyError('xattr.set %r: %s' % (path_msg(path), e))
+ erm = e.strerror or e.errno
+ raise ApplyError(f'xattr.set ({erm}): {path_msg(path)}')
else:
raise
for k, v in self.linux_xattr:
@@ -759,19 +767,19 @@ class Metadata:
try:
xattr.set(path, k, v, nofollow=True)
except IOError as e:
- if e.errno in (errno.EPERM, errno.EOPNOTSUPP):
- raise ApplyError('xattr.set %r: %s' % (path_msg(path), e))
- else:
+ if e.errno not in (errno.EPERM, errno.EOPNOTSUPP):
raise
+ erm = e.strerror or e.errno
+ raise ApplyError(f'xattr.set ({erm}): {path_msg(path)}')
existing_xattrs -= frozenset([k])
for k in existing_xattrs:
try:
xattr.remove(path, k, nofollow=True)
except IOError as e:
- if e.errno in (errno.EPERM, errno.EACCES):
- raise ApplyError('xattr.remove %r: %s' % (path_msg(path), e))
- else:
+ if e.errno not in (errno.EPERM, errno.EACCES):
raise
+ erm = e.strerror or e.errno
+ raise ApplyError(f'xattr.remove ({erm}): {path_msg(path)}')

__slots__ = ('_frozen',
'mode', 'uid', 'gid', 'user', 'group', 'rdev',
@@ -966,8 +974,8 @@ class Metadata:
if not path:
raise Exception('Metadata.apply_to_path() called with no path')
if not self._recognized_file_type():
- add_error('not applying metadata to "%s"' % path_msg(path)
- + ' with unrecognized mode "0x%x"\n' % self.mode)
+ note_error(f'error: unrecognized mode {self.mode:o},'
+ f' not applying metadata: {path_msg(path)}\n')
return
num_ids = restore_numeric_ids
for apply_metadata in (self._apply_common_rec,
@@ -977,7 +985,7 @@ class Metadata:
try:
apply_metadata(path, restore_numeric_ids=num_ids)
except ApplyError as e:
- add_error(e)
+ note_error(f'error: {e}\n')

def same_file(self, other):
"""Compare this to other for equivalency. Return true if
@@ -1247,8 +1255,7 @@ def start_extract(file, create_symlinks=True):
print(path_msg(meta.path), file=sys.stderr)
xpath = _clean_up_extract_path(meta.path)
if not xpath:
- add_error(Exception('skipping risky path "%s"'
- % path_msg(meta.path)))
+ note_error(f'error: skipping risky path: {path_msg(meta.path)}\n')
else:
meta = deepcopy(meta).thaw()
meta.path = xpath
@@ -1262,8 +1269,7 @@ def finish_extract(file, restore_numeric_ids=False):
break
xpath = _clean_up_extract_path(meta.path)
if not xpath:
- add_error(Exception('skipping risky path "%s"'
- % path_msg(meta.path)))
+ note_error(f'error: skipping risky path: {path_msg(meta.path)}\n')
else:
if os.path.isdir(meta.path):
all_dirs.append(meta)
@@ -1290,8 +1296,7 @@ def extract(file, restore_numeric_ids=False, create_symlinks=True):
break
xpath = _clean_up_extract_path(meta.path)
if not xpath:
- add_error(Exception('skipping risky path "%s"'
- % path_msg(meta.path)))
+ note_error(f'error: skipping risky path: {path_msg(meta.path)}\n')
else:
meta = deepcopy(meta).thaw()
meta.path = xpath
diff --git a/test/int/test_metadata.py b/test/int/test_metadata.py
index a3cea90a..3c7a28fa 100644
--- a/test/int/test_metadata.py
+++ b/test/int/test_metadata.py
@@ -168,7 +168,7 @@ def test_from_path_error(tmpdir):
print('saved_errors:', helpers.saved_errors, file=sys.stderr)
WVPASS(len(helpers.saved_errors) == 1)
errmsg = _first_err()
- WVPASS(errmsg.startswith('read Linux attr'))
+ WVPASS(errmsg.startswith('error: attr read failed '))
clear_errors()


@@ -209,11 +209,11 @@ def test_apply_to_path_restricted_access(tmpdir):
os.chmod(parent, 0o000)
m.apply_to_path(path)
print(b'saved_errors:', helpers.saved_errors, file=sys.stderr)
- expected_errors = ['utime: ']
+ expected_errors = ['error: utime ']
if m.linux_attr and _linux_attr_supported(tmpdir):
- expected_errors.append('Linux chattr: ')
+ expected_errors.append('error: chattr(')
if metadata.xattr and m.linux_xattr:
- expected_errors.append("xattr.set ")
+ expected_errors.append('error: xattr.set ')
WVPASS(len(helpers.saved_errors) == len(expected_errors))
for i in range(len(expected_errors)):
assert str(helpers.saved_errors[i]).startswith(expected_errors[i])
--
2.47.3

Rob Browning

unread,
Dec 10, 2025, 1:19:36 PM (3 days ago) Dec 10
to bup-...@googlegroups.com, Johannes Berg
From: Johannes Berg <joha...@sipsolutions.net>

Rather than denote a split tree by a top-level .bupd.DEPTH.bupd marker
file, encode the information in a suffix to every split subtree name,
e.g.

dir/j..5.bupd/jo..4.bupd/...
...
dir/s..5.bupd/st..4.bupd/...
...

Compression should make the storage impact negligible, and this makes
it possible to understand the structure of a path from the path alone,
i.e. without needing access to the repo in order to search trees for
the .bupd.DEPTH.bupd files.

Signed-off-by: Johannes Berg <joha...@sipsolutions.net>
Reviewed-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
DESIGN.md | 48 ++--
lib/bup/tree.py | 16 +-
lib/bup/vfs.py | 31 +--
test/ext/test-get-rewrite-missing | 5 +-
test/ext/test-treesplit | 67 +++--
test/int/test_treesplit.py | 390 +++++++++++++++---------------
test/int/test_vfs.py | 18 +-
7 files changed, 286 insertions(+), 289 deletions(-)

diff --git a/DESIGN.md b/DESIGN.md
index c55c1997..d1bd3561 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -435,29 +435,31 @@ When a tree is split, a large directory like

might become

- dir/.b/{.bupm,aa,...,ii}
- dir/.bupd.1.bupd
- dir/j/{jj,...,rr}
- dir/s/{ss,...,zz}
-
-where the ".bupd.1.bupd" inside dir/ indicates that the tree for dir/
-was split, and the number (here "1") describes the number of levels
-that were created (just one in this case). The names in an
-intermediate level (inside dir/, but not the leaves -- in this
-example, ".b", "j", "s", etc.) are derived from the first filename
-contained within each subtree, abbreviated to the shortest valid
-unique prefix. At any level, the names contained in a subtree will
-always be greater than or equal to the name of the subtree itself and
-less than the name of the next subtree at that level. This makes it
-possible to know which split subtree to read at every level when
-looking for a given filename.
-
-When parsing the split tree depth info file name, i.e. `.bupd.1.bupd`
-in the example above, any extra bytes in the name after the
-`.bupd.DEPTH` prefix, and before the final `.bupd` suffix must be
-ignored. The `DEPTH` will always be a sequence of `[0-9]+`, and any
-extra bytes will begin with `.`, e.g. `.bupd.3.SOMETHING.NEW.bupd`.
-This allows future extensions.
+ dir/.b..1.bupd/{.bupm,aa,...,ii}
+ dir/j..1.bupd/{jj,...,rr}
+ dir/s..1.bupd/{ss,...,zz}
+
+where the "..1.bupd" suffix indicates that the tree for dir/ was
+split, and the number (here "1") describes the number of levels that
+were created (just one in this case). The names in an intermediate
+level (inside dir/, but not the leaves -- in this example, ".b", "j",
+"s", etc.) are derived from the first filename contained within each
+subtree, abbreviated to the shortest valid unique prefix. At any
+level, the names contained in a subtree will always be greater than or
+equal to the name of the subtree itself and less than the name of the
+next subtree at that level. This makes it possible to know which
+split subtree to read at every level when looking for a given
+filename.
+
+Every name inside the split tree will be of the format
+`PREFIX..DEPTH.bupd` where `DEPTH` is the number of levels remaining
+in the split tree before reaching the "leaves". When parsing the name,
+e.g. `PREFIX..1.bupd`, the `..` is a separator between the `PREFIX`
+and any supplemental information. Supplemental information like the
+`DEPTH` will always be added just before existing information (right
+to left), and will never contain `..`. This allows future extensions,
+e.g. `PREFIX..SOMETHING_NEW_WITHOUT_DOTS.3.bupd`. The `DEPTH` will
+always be a sequence of `[0-9]+`.

Detailed Metadata
-----------------
diff --git a/lib/bup/tree.py b/lib/bup/tree.py
index 594f60fa..27ae97ea 100644
--- a/lib/bup/tree.py
+++ b/lib/bup/tree.py
@@ -7,7 +7,6 @@ from bup._helpers import RecordHashSplitter
from bup.hashsplit import \
(BUP_TREE_BLOBBITS,
GIT_MODE_TREE,
- GIT_MODE_FILE,
split_to_blob_or_tree)
from bup.git import shalist_item_sort_key, mangle_name
from bup.helpers import add_error
@@ -90,9 +89,10 @@ def _abbreviate_tree_names(names):
outnames.append(out)
return outnames

-def _abbreviate_item_names(items):
+def _abbreviate_item_names(items, level):
"""Set each item's name to an abbreviation that's still unique
with respect to the other items."""
+ assert isinstance(level, int), level
names = []
for item in items:
names.append(item.first_full_name)
@@ -100,7 +100,7 @@ def _abbreviate_item_names(items):
names.append(item.last_full_name)
abbrevnames = _abbreviate_tree_names(names)
for abbrev_name, item in zip(abbrevnames, items):
- item.name = abbrev_name
+ item.name = abbrev_name + (b'..%d.bupd' % level)


class StackDir:
@@ -266,15 +266,11 @@ class Stack:

if len(splits) == 1:
# If the level is 0, this is an unsplit tree, otherwise it's
- # the top of a split tree, so add the .bupd marker.
+ # the top of a split tree
if level > 0:
assert len(items) == len(splits[0])
assert all(lambda x, y: x is y for x, y in zip(items, splits[0]))
- _abbreviate_item_names(items)
- sentinel_sha = self._repo.write_data(b'')
- items.append(RawTreeItem(b'.bupd.%d.bupd' % level,
- GIT_MODE_FILE, GIT_MODE_FILE,
- sentinel_sha, None))
+ _abbreviate_item_names(items, level)
return self._write_tree(dir_meta, items)

# This tree level was split
@@ -287,7 +283,7 @@ class Stack:
split_items[-1].name))
else: # "inner" nodes (not top, not leaf), abbreviate names
for split_items in splits:
- _abbreviate_item_names(split_items)
+ _abbreviate_item_names(split_items, level)
# "internal" (not top, not leaf) trees don't have a .bupm
newtree.append(SplitTreeItem(split_items[0].name,
self._write_tree(None, split_items,
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 8fa9e2aa..0987a132 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -756,27 +756,23 @@ def _split_subtree_items(repo, level, oid, entries, names, want_meta, root=True)
raise EOFError('EOF instead of split tree placeholder metadata')
yield from _tree_items_except_dot(oid, entries, names, bupm)
else:
- for _, mangled_name, sub_oid in entries:
+ validate = b'.%d.bupd' % level
+ for _, name, sub_oid in entries:
if root:
- if mangled_name == b'.bupm':
+ if name == b'.bupm':
continue
- if mangled_name.endswith(b'.bupd'):
- continue
- assert not mangled_name.endswith(b'.bup'), \
- f'found {path_msg(mangled_name)} in split subtree'
- if not mangled_name.endswith(b'.bupl'):
- assert mangled_name[-5:-1] != b'.bup', \
- f'found {path_msg(mangled_name)} in split subtree'
+ assert name.endswith(validate), \
+ f'found {path_msg(name)} in split subtree but should end with {validate}'
yield from _split_subtree_items(repo, level - 1, sub_oid,
tree_entries(_get_tree_object(repo, sub_oid)),
names, want_meta, False)

-_tree_depth_rx = re.compile(br'\.bupd\.([0-9]+)(?:\..*)?\.bupd')
+_tree_depth_rx = re.compile(br'.*\.([0-9]+)\.bupd')

def _parse_tree_depth(mangled_name):
- """Return the tree DEPTH from a mangled_name like
- .bupd.DEPTH.bupd, but leave open the possibility of future
- .bupd.DEPTH.*.bupd extensions.
+ """Return the tree DEPTH from a mangled_name like foo..DEPTH.bupd,
+ but leave open the possibility of future foo..*.DEPTH.bupd
+ extensions.

"""
m = _tree_depth_rx.fullmatch(mangled_name)
@@ -801,14 +797,19 @@ def tree_items(repo, oid, tree_data, names, *, want_meta=True, repair=False):
entries = tree_entries(tree_data)
depth = None
bupm_oid = None
+ split_or_not_known = False
for _, mangled_name, sub_oid in entries:
if mangled_name.endswith(b'.bupd'):
depth = _parse_tree_depth(mangled_name)
if not dot_requested: # all other metadata in "leaf" .bupm files
break
- if mangled_name == b'.bupm':
+ split_or_not_known = True
+ elif mangled_name == b'.bupm':
bupm_oid = sub_oid
- break
+ if split_or_not_known or depth:
+ break
+ else:
+ split_or_not_known = True
if mangled_name > b'.bupm':
break

diff --git a/test/ext/test-get-rewrite-missing b/test/ext/test-get-rewrite-missing
index 75aad701..d7295039 100755
--- a/test/ext/test-get-rewrite-missing
+++ b/test/ext/test-get-rewrite-missing
@@ -59,7 +59,7 @@ WVPASS git ls-tree src | WVPASS grep -E $'\tpartial-file\.bup$' \
WVPASS git ls-tree -r src:partial-file.bup | WVPASS sed -n 7p \
| btl-ent-oid > partial-file-hole

-WVPASS git ls-tree src:split-tree | WVPASS grep -E $'\t\.bupd.[0-9]+\.bupd$' \
+WVPASS git ls-tree src:split-tree | WVPASS grep -E $'\.\.[0-9]+\.bupd$' \
| btl-ent-oid > bupd-oid
WVPASSEQ 41 $(wc -c < bupd-oid)

@@ -222,8 +222,7 @@ WVPASS grep -E "^Missing: $(< partial-file-hole)" partial-file-replacement

# FIXME: still need tests for missing split-tree leaf .bupms, missing
# split-tree mid-level or leaf tree (which would require three
-# levels). No missing .bupd test since that'd just make the split-tree
-# a normal tree.
+# levels).

# FIXME: test trailers for incomplete split trees as above

diff --git a/test/ext/test-treesplit b/test/ext/test-treesplit
index 99d9ceb8..0921542a 100755
--- a/test/ext/test-treesplit
+++ b/test/ext/test-treesplit
@@ -76,40 +76,39 @@ bup ls -lan /src/latest/ | WVFAIL grep 1970-01-01

# Collapse any split .bupm/* files into a single .bupm
WVPASSEQ "$(git ls-tree -r --name-only src | sed -E 's%(^|/)\.bupm/.*%\1\.bupm%' | uniq)" \
-".bupd.2.bupd
-.bupm
-000/0001/.bupm
-000/0001/00010001
-000/0001/00020002
-000/0001/00030003
-000/0001/00040004
-000/0001/00050005
-000/0006/.bupm
-000/0006/00060006
-000/0006/00070007
-000/0006/00080008
-000/0006/00090009
-000/0006/00100010
-000/0011/.bupm
-000/0011/00110011
-000/0011/00120012
-000/0011/00130013
-000/0011/00140014
-000/0011/00150015
-000/0016/.bupm
-000/0016/00160016
-000/0016/00170017
-000/0016/00180018
-000/0016/00190019
-000/0016/00200020
-000/0021/.bupm
-000/0021/00210021
-000/0021/00220022
-000/0021/00230023
-000/0021/00240024
-000/0021/00250025
-0026/0/.bupm
-0026/0/00260026"
+".bupm
+000..2.bupd/0001..1.bupd/.bupm
+000..2.bupd/0001..1.bupd/00010001
+000..2.bupd/0001..1.bupd/00020002
+000..2.bupd/0001..1.bupd/00030003
+000..2.bupd/0001..1.bupd/00040004
+000..2.bupd/0001..1.bupd/00050005
+000..2.bupd/0006..1.bupd/.bupm
+000..2.bupd/0006..1.bupd/00060006
+000..2.bupd/0006..1.bupd/00070007
+000..2.bupd/0006..1.bupd/00080008
+000..2.bupd/0006..1.bupd/00090009
+000..2.bupd/0006..1.bupd/00100010
+000..2.bupd/0011..1.bupd/.bupm
+000..2.bupd/0011..1.bupd/00110011
+000..2.bupd/0011..1.bupd/00120012
+000..2.bupd/0011..1.bupd/00130013
+000..2.bupd/0011..1.bupd/00140014
+000..2.bupd/0011..1.bupd/00150015
+000..2.bupd/0016..1.bupd/.bupm
+000..2.bupd/0016..1.bupd/00160016
+000..2.bupd/0016..1.bupd/00170017
+000..2.bupd/0016..1.bupd/00180018
+000..2.bupd/0016..1.bupd/00190019
+000..2.bupd/0016..1.bupd/00200020
+000..2.bupd/0021..1.bupd/.bupm
+000..2.bupd/0021..1.bupd/00210021
+000..2.bupd/0021..1.bupd/00220022
+000..2.bupd/0021..1.bupd/00230023
+000..2.bupd/0021..1.bupd/00240024
+000..2.bupd/0021..1.bupd/00250025
+0026..2.bupd/0..1.bupd/.bupm
+0026..2.bupd/0..1.bupd/00260026"

WVPASS cd "$top"
WVPASS rm -rf "$tmpdir"
diff --git a/test/int/test_treesplit.py b/test/int/test_treesplit.py
index 64f7c344..5a461840 100644
--- a/test/int/test_treesplit.py
+++ b/test/int/test_treesplit.py
@@ -107,203 +107,201 @@ split_src = ['00055f95-8cf7-4a01-8819-f6423c731b1a',
'66d48a1b-0bdc-47b9-b471-55aaeb5d6062',
'66d6571f-a65c-483a-9c44-905baac6ca1c']

-split_1 = ['.bupd.1.bupd',
- '.bupm',
- '00/.bupm',
- '00/00055f95-8cf7-4a01-8819-f6423c731b1a',
- '00/01bf344a-deaf-4ffb-8cc6-ad86b03c63e2',
- '00/01cc9d94-7006-461e-aace-c5919e1ceb9a',
- '00/01cf2c47-43ff-4427-865e-01788a3bb910',
- '01e/.bupm',
- '01e/01e1e4bd-6950-4694-a259-f7d66600e776',
- '01e/0c2e3b73-2a44-487c-9aaa-f7428dc3d015',
- '01e/0c3028cc-90f2-4f46-ad3b-94a21498e2ce',
- '01e/0c34aa4a-a479-437a-82b7-e19208a46be8',
- '0c3a/.bupm',
- '0c3a/0c3a4773-6c7f-4efd-9edc-5628698b65bc',
- '0c3a/191b396b-fec7-47f6-a5ba-089ce9ca2956',
- '0c3a/192697c4-b855-4c93-9f3d-2e66a4879c6b',
- '0c3a/192ba072-38f1-4aaa-8515-a334febaeb34',
- '192c/.bupm',
- '192c/192cf67d-d6ee-4e76-8f7c-da7546812bb1',
- '192c/1f1d9ecc-5ad0-4c70-8b7c-9b4eab55d271',
- '192c/1f1d9f04-9d74-4cee-bb4d-b7eae08d7f50',
- '192c/1f21d232-42f0-49d2-bbfd-3561bdb2cd78',
- '1f29/.bupm',
- '1f29/1f2936a1-0ece-4335-90e4-cc1883b9dd93',
- '1f29/1f98f6db-dc1e-4ea7-99e4-7f385c9aa363',
- '1f29/1f9ce8b4-b0b2-463a-872d-bdc902427b26',
- '1f29/1f9d4cf1-57a9-4897-9572-5044e2bda6a8',
- '1fa/.bupm',
- '1fa/1fa53c9a-d8f0-4b93-9a96-90ad6cbf0295',
- '1fa/24ad9373-5d28-49c5-9649-5bbd29e52c7b',
- '1fa/24af5545-e51c-4b26-b72a-ea758bdec9ae',
- '1fa/24af964f-d47d-44ab-a593-9cb213c89869',
- '24b/.bupm',
- '24b/24bc10a8-ac57-4859-9a29-509fc0ff7dc0',
- '24b/265195da-ef80-47e0-8df8-a134c57af25d',
- '24b/26534c1b-a7ba-4737-9612-14b24e729006',
- '24b/26572cc0-b704-4352-a5a3-d7d4e7f571a6',
- '266/.bupm',
- '266/26640125-4b33-4bde-ba4e-8751d15894c5',
- '266/27ae178a-fba2-4b80-a37e-6d14b610ca1f',
- '266/27b40c6f-c5a2-4d90-90a9-6a76612d9935',
- '266/27b75972-28a3-40cf-a73b-1b7e554d2e84',
- '27b8/.bupm',
- '27b8/27b8eb00-0faa-4b8f-9d1a-19a83a9eec07',
- '27b8/2c55f6cb-7ce4-49df-a260-ed8850b3b055',
- '27b8/2c56fa06-72f4-4a94-b5f3-0e6199f88c36',
- '27b8/2c58a92d-172e-45a8-837e-36dba98b95cb',
- '2c6/.bupm',
- '2c6/2c61e2a2-a45f-48d0-b1d7-aa31ae9344b9',
- '2c6/3921bf33-75ca-4507-8fdf-a881ff106484',
- '2c6/392295d4-4df0-4066-a223-a4dec17849a8',
- '2c6/3922ed17-3726-4640-88c7-30fc1b181893',
- '392f/.bupm',
- '392f/392fce08-523d-4ae2-b860-3de201252edb',
- '392f/477066dd-44cb-4a24-bcb7-414a23edaa9f',
- '392f/4774c87c-9a9c-45a8-89a0-fdca65694a97',
- '392f/47773c76-3611-4d32-a10a-a02a7b903e2b',
- '477d/.bupm',
- '477d/477d130b-d0ed-4b93-9da7-a83a1f775257',
- '477d/480214dc-e63c-4089-b5b2-713419a20681',
- '477d/4806c496-7650-4647-934c-2c6c22c15209',
- '477d/480abcd9-330e-45b5-81ad-a4fb9accef84',
- '480c/.bupm',
- '480c/480ca556-ecc9-4b75-9dfb-4951af51df53',
- '480c/48c45a41-11b0-4bc0-a2a4-9d4c53a9a8ba',
- '480c/48c5e1d1-1f94-4ac5-b35e-d74f3e2de569',
- '480c/48ca9b38-a77c-49b0-a8af-a40b876498b5',
- '48e/.bupm',
- '48e/48ef2c71-dfac-4160-a152-8f46bb78cc24',
- '48e/50935f33-3c48-4547-99b1-41843bd8fcbb',
- '48e/50960add-9801-43df-9b77-1108046d9190',
- '48e/509ad170-9107-47bb-bc02-9839290f48d9',
- '50a/.bupm',
- '50a/50a173fc-c6bb-4d4e-a5fd-3c26268145b8',
- '50a/52399901-eb4f-433f-8942-9bb8bbc020f1',
- '50a/523b3f3d-2d89-4a4c-b6a5-a9401684b1be',
- '50a/5243f3b8-dba5-430a-b204-c494f94b5bc8',
- '524d/.bupm',
- '524d/524dfc5d-dd60-4f08-a8e0-7b63dd13110c',
- '524d/5ae4e67c-b751-4126-80f4-f551c7f8ea9b',
- '524d/5ae9aea6-859e-44fe-9d6c-39b1f8c21296',
- '524d/5aec70e7-fa39-4ebd-8285-e79565db5454',
- '5af/.bupm',
- '5af/5af4985e-b509-41ff-b1fb-90eeab8599e2',
- '5af/61d07f00-69ab-41da-b2c1-ad35ff21b6c2',
- '5af/61d6a7a1-bb38-4c1b-a1d4-c80ca8320170',
- '5af/61d9986c-be88-4616-bb2a-e805b9d2e614',
- '61d9a/.bupm',
- '61d9a/61d9a4e9-88f8-4f06-90f6-a0e2a8d00fff',
- '61d9a/66541d38-ccb2-4474-b30c-1a020b3418d1',
- '61d9a/666290b0-faa6-42c7-8bea-5be5e38f9e7d',
- '61d9a/6662e657-1a38-483a-9947-188478718454',
- '6667/.bupm',
- '6667/666712d7-0cb7-4b39-bc86-bb1f792cb75c',
- '6667/66cf73bd-7627-46d3-900b-b11ca122ac9e',
- '6667/66d0d5de-e1f7-4225-9c58-324ab1f0e46a',
- '6667/66d48a1b-0bdc-47b9-b471-55aaeb5d6062']
+split_1 = ['.bupm',
+ '00..1.bupd/.bupm',
+ '00..1.bupd/00055f95-8cf7-4a01-8819-f6423c731b1a',
+ '00..1.bupd/01bf344a-deaf-4ffb-8cc6-ad86b03c63e2',
+ '00..1.bupd/01cc9d94-7006-461e-aace-c5919e1ceb9a',
+ '00..1.bupd/01cf2c47-43ff-4427-865e-01788a3bb910',
+ '01e..1.bupd/.bupm',
+ '01e..1.bupd/01e1e4bd-6950-4694-a259-f7d66600e776',
+ '01e..1.bupd/0c2e3b73-2a44-487c-9aaa-f7428dc3d015',
+ '01e..1.bupd/0c3028cc-90f2-4f46-ad3b-94a21498e2ce',
+ '01e..1.bupd/0c34aa4a-a479-437a-82b7-e19208a46be8',
+ '0c3a..1.bupd/.bupm',
+ '0c3a..1.bupd/0c3a4773-6c7f-4efd-9edc-5628698b65bc',
+ '0c3a..1.bupd/191b396b-fec7-47f6-a5ba-089ce9ca2956',
+ '0c3a..1.bupd/192697c4-b855-4c93-9f3d-2e66a4879c6b',
+ '0c3a..1.bupd/192ba072-38f1-4aaa-8515-a334febaeb34',
+ '192c..1.bupd/.bupm',
+ '192c..1.bupd/192cf67d-d6ee-4e76-8f7c-da7546812bb1',
+ '192c..1.bupd/1f1d9ecc-5ad0-4c70-8b7c-9b4eab55d271',
+ '192c..1.bupd/1f1d9f04-9d74-4cee-bb4d-b7eae08d7f50',
+ '192c..1.bupd/1f21d232-42f0-49d2-bbfd-3561bdb2cd78',
+ '1f29..1.bupd/.bupm',
+ '1f29..1.bupd/1f2936a1-0ece-4335-90e4-cc1883b9dd93',
+ '1f29..1.bupd/1f98f6db-dc1e-4ea7-99e4-7f385c9aa363',
+ '1f29..1.bupd/1f9ce8b4-b0b2-463a-872d-bdc902427b26',
+ '1f29..1.bupd/1f9d4cf1-57a9-4897-9572-5044e2bda6a8',
+ '1fa..1.bupd/.bupm',
+ '1fa..1.bupd/1fa53c9a-d8f0-4b93-9a96-90ad6cbf0295',
+ '1fa..1.bupd/24ad9373-5d28-49c5-9649-5bbd29e52c7b',
+ '1fa..1.bupd/24af5545-e51c-4b26-b72a-ea758bdec9ae',
+ '1fa..1.bupd/24af964f-d47d-44ab-a593-9cb213c89869',
+ '24b..1.bupd/.bupm',
+ '24b..1.bupd/24bc10a8-ac57-4859-9a29-509fc0ff7dc0',
+ '24b..1.bupd/265195da-ef80-47e0-8df8-a134c57af25d',
+ '24b..1.bupd/26534c1b-a7ba-4737-9612-14b24e729006',
+ '24b..1.bupd/26572cc0-b704-4352-a5a3-d7d4e7f571a6',
+ '266..1.bupd/.bupm',
+ '266..1.bupd/26640125-4b33-4bde-ba4e-8751d15894c5',
+ '266..1.bupd/27ae178a-fba2-4b80-a37e-6d14b610ca1f',
+ '266..1.bupd/27b40c6f-c5a2-4d90-90a9-6a76612d9935',
+ '266..1.bupd/27b75972-28a3-40cf-a73b-1b7e554d2e84',
+ '27b8..1.bupd/.bupm',
+ '27b8..1.bupd/27b8eb00-0faa-4b8f-9d1a-19a83a9eec07',
+ '27b8..1.bupd/2c55f6cb-7ce4-49df-a260-ed8850b3b055',
+ '27b8..1.bupd/2c56fa06-72f4-4a94-b5f3-0e6199f88c36',
+ '27b8..1.bupd/2c58a92d-172e-45a8-837e-36dba98b95cb',
+ '2c6..1.bupd/.bupm',
+ '2c6..1.bupd/2c61e2a2-a45f-48d0-b1d7-aa31ae9344b9',
+ '2c6..1.bupd/3921bf33-75ca-4507-8fdf-a881ff106484',
+ '2c6..1.bupd/392295d4-4df0-4066-a223-a4dec17849a8',
+ '2c6..1.bupd/3922ed17-3726-4640-88c7-30fc1b181893',
+ '392f..1.bupd/.bupm',
+ '392f..1.bupd/392fce08-523d-4ae2-b860-3de201252edb',
+ '392f..1.bupd/477066dd-44cb-4a24-bcb7-414a23edaa9f',
+ '392f..1.bupd/4774c87c-9a9c-45a8-89a0-fdca65694a97',
+ '392f..1.bupd/47773c76-3611-4d32-a10a-a02a7b903e2b',
+ '477d..1.bupd/.bupm',
+ '477d..1.bupd/477d130b-d0ed-4b93-9da7-a83a1f775257',
+ '477d..1.bupd/480214dc-e63c-4089-b5b2-713419a20681',
+ '477d..1.bupd/4806c496-7650-4647-934c-2c6c22c15209',
+ '477d..1.bupd/480abcd9-330e-45b5-81ad-a4fb9accef84',
+ '480c..1.bupd/.bupm',
+ '480c..1.bupd/480ca556-ecc9-4b75-9dfb-4951af51df53',
+ '480c..1.bupd/48c45a41-11b0-4bc0-a2a4-9d4c53a9a8ba',
+ '480c..1.bupd/48c5e1d1-1f94-4ac5-b35e-d74f3e2de569',
+ '480c..1.bupd/48ca9b38-a77c-49b0-a8af-a40b876498b5',
+ '48e..1.bupd/.bupm',
+ '48e..1.bupd/48ef2c71-dfac-4160-a152-8f46bb78cc24',
+ '48e..1.bupd/50935f33-3c48-4547-99b1-41843bd8fcbb',
+ '48e..1.bupd/50960add-9801-43df-9b77-1108046d9190',
+ '48e..1.bupd/509ad170-9107-47bb-bc02-9839290f48d9',
+ '50a..1.bupd/.bupm',
+ '50a..1.bupd/50a173fc-c6bb-4d4e-a5fd-3c26268145b8',
+ '50a..1.bupd/52399901-eb4f-433f-8942-9bb8bbc020f1',
+ '50a..1.bupd/523b3f3d-2d89-4a4c-b6a5-a9401684b1be',
+ '50a..1.bupd/5243f3b8-dba5-430a-b204-c494f94b5bc8',
+ '524d..1.bupd/.bupm',
+ '524d..1.bupd/524dfc5d-dd60-4f08-a8e0-7b63dd13110c',
+ '524d..1.bupd/5ae4e67c-b751-4126-80f4-f551c7f8ea9b',
+ '524d..1.bupd/5ae9aea6-859e-44fe-9d6c-39b1f8c21296',
+ '524d..1.bupd/5aec70e7-fa39-4ebd-8285-e79565db5454',
+ '5af..1.bupd/.bupm',
+ '5af..1.bupd/5af4985e-b509-41ff-b1fb-90eeab8599e2',
+ '5af..1.bupd/61d07f00-69ab-41da-b2c1-ad35ff21b6c2',
+ '5af..1.bupd/61d6a7a1-bb38-4c1b-a1d4-c80ca8320170',
+ '5af..1.bupd/61d9986c-be88-4616-bb2a-e805b9d2e614',
+ '61d9a..1.bupd/.bupm',
+ '61d9a..1.bupd/61d9a4e9-88f8-4f06-90f6-a0e2a8d00fff',
+ '61d9a..1.bupd/66541d38-ccb2-4474-b30c-1a020b3418d1',
+ '61d9a..1.bupd/666290b0-faa6-42c7-8bea-5be5e38f9e7d',
+ '61d9a..1.bupd/6662e657-1a38-483a-9947-188478718454',
+ '6667..1.bupd/.bupm',
+ '6667..1.bupd/666712d7-0cb7-4b39-bc86-bb1f792cb75c',
+ '6667..1.bupd/66cf73bd-7627-46d3-900b-b11ca122ac9e',
+ '6667..1.bupd/66d0d5de-e1f7-4225-9c58-324ab1f0e46a',
+ '6667..1.bupd/66d48a1b-0bdc-47b9-b471-55aaeb5d6062']

-split_2 = ['.bupd.2.bupd',
- '.bupm',
- '0/00/.bupm',
- '0/00/00055f95-8cf7-4a01-8819-f6423c731b1a',
- '0/00/01bf344a-deaf-4ffb-8cc6-ad86b03c63e2',
- '0/00/01cc9d94-7006-461e-aace-c5919e1ceb9a',
- '0/00/01cf2c47-43ff-4427-865e-01788a3bb910',
- '0/01e/.bupm',
- '0/01e/01e1e4bd-6950-4694-a259-f7d66600e776',
- '0/01e/0c2e3b73-2a44-487c-9aaa-f7428dc3d015',
- '0/01e/0c3028cc-90f2-4f46-ad3b-94a21498e2ce',
- '0/01e/0c34aa4a-a479-437a-82b7-e19208a46be8',
- '0/0c3a/.bupm',
- '0/0c3a/0c3a4773-6c7f-4efd-9edc-5628698b65bc',
- '0/0c3a/191b396b-fec7-47f6-a5ba-089ce9ca2956',
- '0/0c3a/192697c4-b855-4c93-9f3d-2e66a4879c6b',
- '0/0c3a/192ba072-38f1-4aaa-8515-a334febaeb34',
- '0/192c/.bupm',
- '0/192c/192cf67d-d6ee-4e76-8f7c-da7546812bb1',
- '0/192c/1f1d9ecc-5ad0-4c70-8b7c-9b4eab55d271',
- '0/192c/1f1d9f04-9d74-4cee-bb4d-b7eae08d7f50',
- '0/192c/1f21d232-42f0-49d2-bbfd-3561bdb2cd78',
- '0/1f29/.bupm',
- '0/1f29/1f2936a1-0ece-4335-90e4-cc1883b9dd93',
- '0/1f29/1f98f6db-dc1e-4ea7-99e4-7f385c9aa363',
- '0/1f29/1f9ce8b4-b0b2-463a-872d-bdc902427b26',
- '0/1f29/1f9d4cf1-57a9-4897-9572-5044e2bda6a8',
- '0/1fa/.bupm',
- '0/1fa/1fa53c9a-d8f0-4b93-9a96-90ad6cbf0295',
- '0/1fa/24ad9373-5d28-49c5-9649-5bbd29e52c7b',
- '0/1fa/24af5545-e51c-4b26-b72a-ea758bdec9ae',
- '0/1fa/24af964f-d47d-44ab-a593-9cb213c89869',
- '0/24b/.bupm',
- '0/24b/24bc10a8-ac57-4859-9a29-509fc0ff7dc0',
- '0/24b/265195da-ef80-47e0-8df8-a134c57af25d',
- '0/24b/26534c1b-a7ba-4737-9612-14b24e729006',
- '0/24b/26572cc0-b704-4352-a5a3-d7d4e7f571a6',
- '0/266/.bupm',
- '0/266/26640125-4b33-4bde-ba4e-8751d15894c5',
- '0/266/27ae178a-fba2-4b80-a37e-6d14b610ca1f',
- '0/266/27b40c6f-c5a2-4d90-90a9-6a76612d9935',
- '0/266/27b75972-28a3-40cf-a73b-1b7e554d2e84',
- '0/27b8/.bupm',
- '0/27b8/27b8eb00-0faa-4b8f-9d1a-19a83a9eec07',
- '0/27b8/2c55f6cb-7ce4-49df-a260-ed8850b3b055',
- '0/27b8/2c56fa06-72f4-4a94-b5f3-0e6199f88c36',
- '0/27b8/2c58a92d-172e-45a8-837e-36dba98b95cb',
- '0/2c6/.bupm',
- '0/2c6/2c61e2a2-a45f-48d0-b1d7-aa31ae9344b9',
- '0/2c6/3921bf33-75ca-4507-8fdf-a881ff106484',
- '0/2c6/392295d4-4df0-4066-a223-a4dec17849a8',
- '0/2c6/3922ed17-3726-4640-88c7-30fc1b181893',
- '0/392f/.bupm',
- '0/392f/392fce08-523d-4ae2-b860-3de201252edb',
- '0/392f/477066dd-44cb-4a24-bcb7-414a23edaa9f',
- '0/392f/4774c87c-9a9c-45a8-89a0-fdca65694a97',
- '0/392f/47773c76-3611-4d32-a10a-a02a7b903e2b',
- '0/477d/.bupm',
- '0/477d/477d130b-d0ed-4b93-9da7-a83a1f775257',
- '0/477d/480214dc-e63c-4089-b5b2-713419a20681',
- '0/477d/4806c496-7650-4647-934c-2c6c22c15209',
- '0/477d/480abcd9-330e-45b5-81ad-a4fb9accef84',
- '0/480c/.bupm',
- '0/480c/480ca556-ecc9-4b75-9dfb-4951af51df53',
- '0/480c/48c45a41-11b0-4bc0-a2a4-9d4c53a9a8ba',
- '0/480c/48c5e1d1-1f94-4ac5-b35e-d74f3e2de569',
- '0/480c/48ca9b38-a77c-49b0-a8af-a40b876498b5',
- '0/48e/.bupm',
- '0/48e/48ef2c71-dfac-4160-a152-8f46bb78cc24',
- '0/48e/50935f33-3c48-4547-99b1-41843bd8fcbb',
- '0/48e/50960add-9801-43df-9b77-1108046d9190',
- '0/48e/509ad170-9107-47bb-bc02-9839290f48d9',
- '0/50a/.bupm',
- '0/50a/50a173fc-c6bb-4d4e-a5fd-3c26268145b8',
- '0/50a/52399901-eb4f-433f-8942-9bb8bbc020f1',
- '0/50a/523b3f3d-2d89-4a4c-b6a5-a9401684b1be',
- '0/50a/5243f3b8-dba5-430a-b204-c494f94b5bc8',
- '0/524d/.bupm',
- '0/524d/524dfc5d-dd60-4f08-a8e0-7b63dd13110c',
- '0/524d/5ae4e67c-b751-4126-80f4-f551c7f8ea9b',
- '0/524d/5ae9aea6-859e-44fe-9d6c-39b1f8c21296',
- '0/524d/5aec70e7-fa39-4ebd-8285-e79565db5454',
- '0/5af/.bupm',
- '0/5af/5af4985e-b509-41ff-b1fb-90eeab8599e2',
- '0/5af/61d07f00-69ab-41da-b2c1-ad35ff21b6c2',
- '0/5af/61d6a7a1-bb38-4c1b-a1d4-c80ca8320170',
- '0/5af/61d9986c-be88-4616-bb2a-e805b9d2e614',
- '0/61d9a/.bupm',
- '0/61d9a/61d9a4e9-88f8-4f06-90f6-a0e2a8d00fff',
- '0/61d9a/66541d38-ccb2-4474-b30c-1a020b3418d1',
- '0/61d9a/666290b0-faa6-42c7-8bea-5be5e38f9e7d',
- '0/61d9a/6662e657-1a38-483a-9947-188478718454',
- '0/6667/.bupm',
- '0/6667/666712d7-0cb7-4b39-bc86-bb1f792cb75c',
- '0/6667/66cf73bd-7627-46d3-900b-b11ca122ac9e',
- '0/6667/66d0d5de-e1f7-4225-9c58-324ab1f0e46a',
- '0/6667/66d48a1b-0bdc-47b9-b471-55aaeb5d6062',
- '66d6/6/.bupm',
- '66d6/6/66d6571f-a65c-483a-9c44-905baac6ca1c']
+split_2 = ['.bupm',
+ '0..2.bupd/00..1.bupd/.bupm',
+ '0..2.bupd/00..1.bupd/00055f95-8cf7-4a01-8819-f6423c731b1a',
+ '0..2.bupd/00..1.bupd/01bf344a-deaf-4ffb-8cc6-ad86b03c63e2',
+ '0..2.bupd/00..1.bupd/01cc9d94-7006-461e-aace-c5919e1ceb9a',
+ '0..2.bupd/00..1.bupd/01cf2c47-43ff-4427-865e-01788a3bb910',
+ '0..2.bupd/01e..1.bupd/.bupm',
+ '0..2.bupd/01e..1.bupd/01e1e4bd-6950-4694-a259-f7d66600e776',
+ '0..2.bupd/01e..1.bupd/0c2e3b73-2a44-487c-9aaa-f7428dc3d015',
+ '0..2.bupd/01e..1.bupd/0c3028cc-90f2-4f46-ad3b-94a21498e2ce',
+ '0..2.bupd/01e..1.bupd/0c34aa4a-a479-437a-82b7-e19208a46be8',
+ '0..2.bupd/0c3a..1.bupd/.bupm',
+ '0..2.bupd/0c3a..1.bupd/0c3a4773-6c7f-4efd-9edc-5628698b65bc',
+ '0..2.bupd/0c3a..1.bupd/191b396b-fec7-47f6-a5ba-089ce9ca2956',
+ '0..2.bupd/0c3a..1.bupd/192697c4-b855-4c93-9f3d-2e66a4879c6b',
+ '0..2.bupd/0c3a..1.bupd/192ba072-38f1-4aaa-8515-a334febaeb34',
+ '0..2.bupd/192c..1.bupd/.bupm',
+ '0..2.bupd/192c..1.bupd/192cf67d-d6ee-4e76-8f7c-da7546812bb1',
+ '0..2.bupd/192c..1.bupd/1f1d9ecc-5ad0-4c70-8b7c-9b4eab55d271',
+ '0..2.bupd/192c..1.bupd/1f1d9f04-9d74-4cee-bb4d-b7eae08d7f50',
+ '0..2.bupd/192c..1.bupd/1f21d232-42f0-49d2-bbfd-3561bdb2cd78',
+ '0..2.bupd/1f29..1.bupd/.bupm',
+ '0..2.bupd/1f29..1.bupd/1f2936a1-0ece-4335-90e4-cc1883b9dd93',
+ '0..2.bupd/1f29..1.bupd/1f98f6db-dc1e-4ea7-99e4-7f385c9aa363',
+ '0..2.bupd/1f29..1.bupd/1f9ce8b4-b0b2-463a-872d-bdc902427b26',
+ '0..2.bupd/1f29..1.bupd/1f9d4cf1-57a9-4897-9572-5044e2bda6a8',
+ '0..2.bupd/1fa..1.bupd/.bupm',
+ '0..2.bupd/1fa..1.bupd/1fa53c9a-d8f0-4b93-9a96-90ad6cbf0295',
+ '0..2.bupd/1fa..1.bupd/24ad9373-5d28-49c5-9649-5bbd29e52c7b',
+ '0..2.bupd/1fa..1.bupd/24af5545-e51c-4b26-b72a-ea758bdec9ae',
+ '0..2.bupd/1fa..1.bupd/24af964f-d47d-44ab-a593-9cb213c89869',
+ '0..2.bupd/24b..1.bupd/.bupm',
+ '0..2.bupd/24b..1.bupd/24bc10a8-ac57-4859-9a29-509fc0ff7dc0',
+ '0..2.bupd/24b..1.bupd/265195da-ef80-47e0-8df8-a134c57af25d',
+ '0..2.bupd/24b..1.bupd/26534c1b-a7ba-4737-9612-14b24e729006',
+ '0..2.bupd/24b..1.bupd/26572cc0-b704-4352-a5a3-d7d4e7f571a6',
+ '0..2.bupd/266..1.bupd/.bupm',
+ '0..2.bupd/266..1.bupd/26640125-4b33-4bde-ba4e-8751d15894c5',
+ '0..2.bupd/266..1.bupd/27ae178a-fba2-4b80-a37e-6d14b610ca1f',
+ '0..2.bupd/266..1.bupd/27b40c6f-c5a2-4d90-90a9-6a76612d9935',
+ '0..2.bupd/266..1.bupd/27b75972-28a3-40cf-a73b-1b7e554d2e84',
+ '0..2.bupd/27b8..1.bupd/.bupm',
+ '0..2.bupd/27b8..1.bupd/27b8eb00-0faa-4b8f-9d1a-19a83a9eec07',
+ '0..2.bupd/27b8..1.bupd/2c55f6cb-7ce4-49df-a260-ed8850b3b055',
+ '0..2.bupd/27b8..1.bupd/2c56fa06-72f4-4a94-b5f3-0e6199f88c36',
+ '0..2.bupd/27b8..1.bupd/2c58a92d-172e-45a8-837e-36dba98b95cb',
+ '0..2.bupd/2c6..1.bupd/.bupm',
+ '0..2.bupd/2c6..1.bupd/2c61e2a2-a45f-48d0-b1d7-aa31ae9344b9',
+ '0..2.bupd/2c6..1.bupd/3921bf33-75ca-4507-8fdf-a881ff106484',
+ '0..2.bupd/2c6..1.bupd/392295d4-4df0-4066-a223-a4dec17849a8',
+ '0..2.bupd/2c6..1.bupd/3922ed17-3726-4640-88c7-30fc1b181893',
+ '0..2.bupd/392f..1.bupd/.bupm',
+ '0..2.bupd/392f..1.bupd/392fce08-523d-4ae2-b860-3de201252edb',
+ '0..2.bupd/392f..1.bupd/477066dd-44cb-4a24-bcb7-414a23edaa9f',
+ '0..2.bupd/392f..1.bupd/4774c87c-9a9c-45a8-89a0-fdca65694a97',
+ '0..2.bupd/392f..1.bupd/47773c76-3611-4d32-a10a-a02a7b903e2b',
+ '0..2.bupd/477d..1.bupd/.bupm',
+ '0..2.bupd/477d..1.bupd/477d130b-d0ed-4b93-9da7-a83a1f775257',
+ '0..2.bupd/477d..1.bupd/480214dc-e63c-4089-b5b2-713419a20681',
+ '0..2.bupd/477d..1.bupd/4806c496-7650-4647-934c-2c6c22c15209',
+ '0..2.bupd/477d..1.bupd/480abcd9-330e-45b5-81ad-a4fb9accef84',
+ '0..2.bupd/480c..1.bupd/.bupm',
+ '0..2.bupd/480c..1.bupd/480ca556-ecc9-4b75-9dfb-4951af51df53',
+ '0..2.bupd/480c..1.bupd/48c45a41-11b0-4bc0-a2a4-9d4c53a9a8ba',
+ '0..2.bupd/480c..1.bupd/48c5e1d1-1f94-4ac5-b35e-d74f3e2de569',
+ '0..2.bupd/480c..1.bupd/48ca9b38-a77c-49b0-a8af-a40b876498b5',
+ '0..2.bupd/48e..1.bupd/.bupm',
+ '0..2.bupd/48e..1.bupd/48ef2c71-dfac-4160-a152-8f46bb78cc24',
+ '0..2.bupd/48e..1.bupd/50935f33-3c48-4547-99b1-41843bd8fcbb',
+ '0..2.bupd/48e..1.bupd/50960add-9801-43df-9b77-1108046d9190',
+ '0..2.bupd/48e..1.bupd/509ad170-9107-47bb-bc02-9839290f48d9',
+ '0..2.bupd/50a..1.bupd/.bupm',
+ '0..2.bupd/50a..1.bupd/50a173fc-c6bb-4d4e-a5fd-3c26268145b8',
+ '0..2.bupd/50a..1.bupd/52399901-eb4f-433f-8942-9bb8bbc020f1',
+ '0..2.bupd/50a..1.bupd/523b3f3d-2d89-4a4c-b6a5-a9401684b1be',
+ '0..2.bupd/50a..1.bupd/5243f3b8-dba5-430a-b204-c494f94b5bc8',
+ '0..2.bupd/524d..1.bupd/.bupm',
+ '0..2.bupd/524d..1.bupd/524dfc5d-dd60-4f08-a8e0-7b63dd13110c',
+ '0..2.bupd/524d..1.bupd/5ae4e67c-b751-4126-80f4-f551c7f8ea9b',
+ '0..2.bupd/524d..1.bupd/5ae9aea6-859e-44fe-9d6c-39b1f8c21296',
+ '0..2.bupd/524d..1.bupd/5aec70e7-fa39-4ebd-8285-e79565db5454',
+ '0..2.bupd/5af..1.bupd/.bupm',
+ '0..2.bupd/5af..1.bupd/5af4985e-b509-41ff-b1fb-90eeab8599e2',
+ '0..2.bupd/5af..1.bupd/61d07f00-69ab-41da-b2c1-ad35ff21b6c2',
+ '0..2.bupd/5af..1.bupd/61d6a7a1-bb38-4c1b-a1d4-c80ca8320170',
+ '0..2.bupd/5af..1.bupd/61d9986c-be88-4616-bb2a-e805b9d2e614',
+ '0..2.bupd/61d9a..1.bupd/.bupm',
+ '0..2.bupd/61d9a..1.bupd/61d9a4e9-88f8-4f06-90f6-a0e2a8d00fff',
+ '0..2.bupd/61d9a..1.bupd/66541d38-ccb2-4474-b30c-1a020b3418d1',
+ '0..2.bupd/61d9a..1.bupd/666290b0-faa6-42c7-8bea-5be5e38f9e7d',
+ '0..2.bupd/61d9a..1.bupd/6662e657-1a38-483a-9947-188478718454',
+ '0..2.bupd/6667..1.bupd/.bupm',
+ '0..2.bupd/6667..1.bupd/666712d7-0cb7-4b39-bc86-bb1f792cb75c',
+ '0..2.bupd/6667..1.bupd/66cf73bd-7627-46d3-900b-b11ca122ac9e',
+ '0..2.bupd/6667..1.bupd/66d0d5de-e1f7-4225-9c58-324ab1f0e46a',
+ '0..2.bupd/6667..1.bupd/66d48a1b-0bdc-47b9-b471-55aaeb5d6062',
+ '66d6..2.bupd/6..1.bupd/.bupm',
+ '66d6..2.bupd/6..1.bupd/66d6571f-a65c-483a-9c44-905baac6ca1c']

def pruned_ls_files(parent, output):
files = []
diff --git a/test/int/test_vfs.py b/test/int/test_vfs.py
index b1f4eb91..68bcdb8f 100644
--- a/test/int/test_vfs.py
+++ b/test/int/test_vfs.py
@@ -374,14 +374,16 @@ def test_duplicate_save_dates(tmpdir):
tuple(sorted(x[0] for x in vfs.contents(repo, revlist))))

def test_tree_depth_parsing():
- assert vfs._parse_tree_depth(b'.bupd.1.bupd') == 1
- assert vfs._parse_tree_depth(b'.bupd.42.bupd') == 42
- assert vfs._parse_tree_depth(b'.bupd.42.something-else.bupd') == 42
- for x in (b'.bupd..bupd',
- b'.bupd.-1.bupd',
- b'.bupd.?.bupd',
- b'.bupd.???.bupd',
- b'.bupd.???.bupx',
+ assert vfs._parse_tree_depth(b'foo..1.bupd') == 1
+ assert vfs._parse_tree_depth(b'foo..42.bupd') == 42
+ assert vfs._parse_tree_depth(b'foo..extension.42.bupd') == 42
+ for x in (b'foo.bupd',
+ b'foo..bupd',
+ b'foo...bupd',
+ b'foo..-1.bupd',
+ b'foo..?.bupd',
+ b'foo..???.bupd',
+ b'foo..???.bupx',
b'.bupm'):
with pytest.raises(Exception) as exinfo:
vfs._parse_tree_depth(x)
--
2.47.3

Reply all
Reply to author
Forward
0 new messages