Instead of yielding a differing number of items, where the first is
summary information (oidx, type, size), followed by "all the data" if
and only if include_data is true, just return (oidx, type, size,
data_iterator) where the data_iterator is None when include_data is
false. The caller is still required to consume the iterator, before
issuing any other requests so that the remote call can be finished
properly.
For CatPipe and a local repo the "remote" call is to the git cat-file
subprocess, and for a RemoteRepo, it's to the bup server.
This simplifies the calls a bit, as can be seen in the changes.
Thanks to Johannes Berg for help figuring out what we wanted.
lib/bup/cmd/get.py | 13 ++++----
lib/bup/cmd/split.py | 3 +-
lib/bup/cmd/validate_object_links.py | 3 +-
lib/bup/gc.py | 6 ++--
lib/bup/git.py | 47 +++++++++++++---------------
lib/bup/protocol.py | 7 ++---
lib/bup/repo/base.py | 8 +++--
lib/bup/repo/local.py | 7 +----
lib/bup/repo/remote.py | 41 ++++++++++++------------
lib/bup/rewrite.py | 4 +--
lib/bup/rm.py | 2 +-
lib/bup/vfs.py | 6 ++--
test/int/test_commit.py | 4 +--
test/int/test_git.py | 16 +++++-----
14 files changed, 76 insertions(+), 91 deletions(-)
diff --git a/lib/bup/cmd/get.py b/lib/bup/cmd/get.py
index cf648121..63ea45f1 100644
--- a/lib/bup/cmd/get.py
+++ b/lib/bup/cmd/get.py
@@ -12,7 +12,7 @@ import os, re, sys, textwrap, time
from bup import client, compat, git, hashsplit, vfs
from bup.commit import commit_message
from bup.compat import dataclass, dataclass_frozen_for_testing, get_argvb
-from bup.git import MissingObject, get_cat_data, parse_commit, walk_object
+from bup.git import MissingObject, get_commit_items, walk_object
from bup.helpers import \
(EXIT_FAILURE,
EXIT_RECOVERED,
@@ -315,7 +315,7 @@ def get_random_item(hash, src_repo, dest_repo, ignore_missing):
return dest_repo.exists(unhexlify(oid))
def get_ref(oidx, include_data=False):
assert include_data
- yield from
src_repo.cat(oidx)
+ return
src_repo.cat(oidx)
for item in walk_object(get_ref, hash, stop_at=already_seen,
include_data=True, result='item'):
assert isinstance(item, git.WalkItem)
@@ -357,7 +357,7 @@ class GetResult:
def transfer_commit(hash, parent, src_repo, dest_repo, ignore_missing):
now = time.time()
- items = parse_commit(get_cat_data(
src_repo.cat(hash), b'commit'))
+ items = get_commit_items(hash,
src_repo.cat)
tree = unhexlify(items.tree)
author = b'%s <%s>' % (items.author_name, items.author_mail)
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
@@ -437,8 +437,7 @@ def append_commits(src_loc, dest_hash, src_repo, dest_repo, rewriter, excludes,
GitLoc = namedtuple('GitLoc', ('ref', 'hash', 'type'))
def find_git_item(ref, repo):
- it =
repo.cat(ref)
- oidx, typ, _ = next(it)
+ oidx, typ, _, it =
repo.cat(ref)
# FIXME: don't include_data once repo supports it
for _ in it: pass
if not oidx:
@@ -609,7 +608,7 @@ def handle_ff(item, src_repo, dest_repo):
if not dest_oidx or dest_oidx in src_repo.rev_list(src_oidx):
# Can fast forward.
get_random_item(src_oidx, src_repo, dest_repo, item.spec.ignore_missing)
- commit_items = parse_commit(get_cat_data(
src_repo.cat(src_oidx), b'commit'))
+ commit_items = get_commit_items(src_oidx,
src_repo.cat)
return GetResult(item.src.hash, unhexlify(commit_items.tree))
misuse('destination is not an ancestor of source for %s'
% spec_msg(item.spec))
@@ -772,7 +771,7 @@ def handle_replace(item, src_repo, dest_repo):
assert(item.dest.type == 'branch' or not item.dest.type)
src_oidx = hexlify(item.src.hash)
get_random_item(src_oidx, src_repo, dest_repo, item.spec.ignore_missing)
- commit_items = parse_commit(get_cat_data(
src_repo.cat(src_oidx), b'commit'))
+ commit_items = get_commit_items(src_oidx,
src_repo.cat)
return GetResult(item.src.hash, unhexlify(commit_items.tree))
diff --git a/lib/bup/cmd/split.py b/lib/bup/cmd/split.py
index 55fc6619..fd0028ae 100644
--- a/lib/bup/cmd/split.py
+++ b/lib/bup/cmd/split.py
@@ -213,8 +213,7 @@ def main(argv):
if line:
line = line.strip()
try:
- it = cp.get(line.strip())
- next(it, None) # skip the file info
+ it = cp.get(line.strip())[3] # skip the file info
except KeyError as e:
add_error('error: %s' % e)
continue
diff --git a/lib/bup/cmd/validate_object_links.py b/lib/bup/cmd/validate_object_links.py
index 75ad5213..5855dbc5 100644
--- a/lib/bup/cmd/validate_object_links.py
+++ b/lib/bup/cmd/validate_object_links.py
@@ -63,8 +63,7 @@ class Pack:
data = zlib.decompress(data)
yield oid, git._typermap[kind], data
elif kind in (5, 6, 7): # reserved obj_ofs_delta obj_ref_delta
- it = self._cp.get(hexlify(oid))
- _, tp, _ = next(it, None) # cannot return None
+ _, tp, _, it = self._cp.get(hexlify(oid))
data = b''.join(it)
if tp == b'blob':
continue
diff --git a/lib/bup/gc.py b/lib/bup/gc.py
index 8ffe73b3..531e3f0d 100644
--- a/lib/bup/gc.py
+++ b/lib/bup/gc.py
@@ -182,8 +182,7 @@ def sweep(live_objects, live_trees, existing_count, cat_pipe, threshold,
must_rewrite = False
live_in_this_pack = set()
for sha in idx:
- tmp_it = cat_pipe.get(hexlify(sha), include_data=False)
- _, typ, _ = next(tmp_it)
+ typ = cat_pipe.get(hexlify(sha), include_data=False)[1]
if typ != b'blob':
is_live = sha in live_trees
if not is_live:
@@ -218,8 +217,7 @@ def sweep(live_objects, live_trees, existing_count, cat_pipe, threshold,
reprogress()
for sha in idx:
if sha in live_in_this_pack:
- item_it = cat_pipe.get(hexlify(sha))
- _, typ, _ = next(item_it)
+ _, typ, _, item_it = cat_pipe.get(hexlify(sha))
repo.just_write(sha, typ, b''.join(item_it))
assert idx_name.endswith(b'.idx')
stale_packs.append(idx_name[:-4])
diff --git a/lib/bup/git.py b/lib/bup/git.py
index 10ba638e..d6dd8df1 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -95,15 +95,11 @@ def git_config_get(path, option, *, opttype=None):
raise GitError('%r returned %d' % (cmd, rc))
-def get_cat_data(cat_iterator, expected_type):
- _, kind, _ = next(cat_iterator)
- if kind != expected_type:
- raise Exception('expected %r, saw %r' % (expected_type, kind))
- return b''.join(cat_iterator)
-
-
-def get_commit_items(id, cp):
- return parse_commit(get_cat_data(cp.get(id), b'commit'))
+def get_commit_items(ref, get_ref):
+ _, kind, _, it = get_ref(ref)
+ if kind != b'commit':
+ raise Exception(f'{path_msg(ref)} is {kind}, not commit')
+ return parse_commit(b''.join(it))
def repo(sub = b'', repo_dir=None):
@@ -1393,8 +1389,9 @@ class CatPipe:
env=_gitenv(self.repo_dir))
def get(self, ref, include_data=True):
- """Yield (oidx, type, size), followed by the data referred to by ref.
- If ref does not exist, only yield (None, None, None).
+ """Return (oidx, type, size, data_iterator). When
+ include_data is true, the data_iterator will be None. When
+ the ref is missing, all elements will be None.
"""
if not self.p or self.p.poll() is not None:
@@ -1425,8 +1422,7 @@ class CatPipe:
% (ref, p.poll() or 'none'))
if hdr.endswith(b' missing\n'):
self.inprogress = None
- yield None, None, None
- return
+ return None, None, None, None
info = hdr.split(b' ')
if len(info) != 3 or len(info[0]) != 40:
raise GitError('expected object (id, type, size), got %r' % info)
@@ -1435,18 +1431,18 @@ class CatPipe:
if not include_data:
self.inprogress = None
- yield oidx, typ, size
- return
+ return oidx, typ, size, None
- try:
- yield oidx, typ, size
- yield from chunkyreader(p.stdout, size)
- readline_result = p.stdout.readline()
- assert readline_result == b'\n'
- self.inprogress = None
- except Exception as ex:
- self.close()
- raise ex
+ def data_iterator():
+ try:
+ yield from chunkyreader(p.stdout, size)
+ readline_result = p.stdout.readline()
+ assert readline_result == b'\n'
+ self.inprogress = None
+ except Exception as ex:
+ self.close()
+ raise ex
+ return oidx, typ, size, data_iterator()
_catpipe_for = {}
@@ -1542,8 +1538,7 @@ def walk_object(get_ref, oidx, *, stop_at=None, include_data=None,
# must have data for commits, trees, or unknown
got_data = (exp_typ in (b'commit', b'tree', None)) or include_data
- item_it = get_ref(oidx, include_data=got_data)
- get_oidx, typ, _ = next(item_it, None) # cannot return None
+ get_oidx, typ, _, item_it = get_ref(oidx, include_data=got_data)
if not get_oidx:
item = WalkItem(oid=unhexlify(oidx), type=exp_typ, name=name,
mode=mode, data=False)
diff --git a/lib/bup/protocol.py b/lib/bup/protocol.py
index 855ae400..c091386f 100644
--- a/lib/bup/protocol.py
+++ b/lib/bup/protocol.py
@@ -338,12 +338,11 @@ class Server:
# For now, avoid potential deadlock by just reading them all
for ref in tuple(lines_until_sentinel(self.conn, b'\n', Exception)):
ref = ref[:-1]
- it =
self.repo.cat(ref)
- info = next(it)
- if not info[0]:
+ oidx, kind, size, it =
self.repo.cat(ref)
+ if not oidx:
self.conn.write(b'missing\n')
continue
- self.conn.write(b'%s %s %d\n' % info)
+ self.conn.write(b'%s %s %d\n' % (oidx, kind, size))
for buf in it:
self.conn.write(buf)
self.conn.ok()
diff --git a/lib/bup/repo/base.py b/lib/bup/repo/base.py
index bfe49972..f45e5e53 100644
--- a/lib/bup/repo/base.py
+++ b/lib/bup/repo/base.py
@@ -69,9 +69,11 @@ class RepoProtocol:
@notimplemented
def cat(self, ref):
- """
- If ref does not exist, yield (None, None, None). Otherwise yield
- (oidx, type, size), and then all of the data associated with ref.
+ """Return (oidx, type, size, data_iterator). When the ref is
+ missing, all elements will be None. For some repositories
+ (like RemoteRepo), the iterator must be consumed before
+ calling other repository methods.
+
"""
@notimplemented
diff --git a/lib/bup/repo/local.py b/lib/bup/repo/local.py
index 6fe2c2f5..2fffc866 100644
--- a/lib/bup/repo/local.py
+++ b/lib/bup/repo/local.py
@@ -110,12 +110,7 @@ class LocalRepo(RepoProtocol):
return git.update_ref(refname, newval, oldval, repo_dir=self.repo_dir)
def cat(self, ref):
- it = self._cp.get(ref)
- info = next(it, None) # cannot return None
- oidx = info[0]
- yield info
- if oidx: yield from it
- assert not next(it, None)
+ return self._cp.get(ref)
def join(self, ref):
return vfs.join(self, ref)
diff --git a/lib/bup/repo/remote.py b/lib/bup/repo/remote.py
index 9f6e9de7..a399a3a1 100644
--- a/lib/bup/repo/remote.py
+++ b/lib/bup/repo/remote.py
@@ -62,27 +62,28 @@ class RemoteRepo(RepoProtocol):
def is_remote(self): return True
def cat(self, ref):
- # Yield all the data here so that we don't finish the
- # cat_batch iterator (triggering its cleanup) until all of the
- # data has been read. Otherwise we'd be out of sync with the
- # server. If the ref is 40 hex digits, then assume it's an
- # oid, and verify that the data provided by the remote
+ # The data iterator must be consumed before any other client
+ # interactions. If the ref is 40 hex digits, then assume it's
+ # an oid, and verify that the data provided by the remote
# actually has that oid. If not, throw.
- items = self.client.cat_batch((ref,))
- oidx, typ, size, it = info = next(items, None) # cannot return None
- yield info[:-1]
- if oidx:
- if not _oidx_rx.fullmatch(ref):
- yield from it
- else:
- actual_oid = git.start_sha1(typ, size)
- for data in it:
- actual_oid.update(data)
- yield data
- actual_oid = actual_oid.digest()
- if hexlify(actual_oid) != ref:
- raise Exception(f'received {actual_oid.hex()}, expected oid {ref}')
- assert not next(items, None)
+ def hash_checked_data(kind, size, it, expected, batch):
+ actual_oid = git.start_sha1(kind, size)
+ for data in it:
+ actual_oid.update(data)
+ yield data
+ actual_oid = actual_oid.digest()
+ if hexlify(actual_oid) != expected:
+ raise Exception(f'received {actual_oid.hex()}, expected oid {expected}')
+ # causes client to finish the call
+ assert not next(batch, None)
+
+ batch = self.client.cat_batch((ref,))
+ oidx, typ, size, it = items = next(batch, None) # cannot return None
+ if not oidx:
+ return items
+ if not _oidx_rx.fullmatch(ref):
+ return items
+ return *items[:-1], hash_checked_data(typ, size, it, ref, batch)
def write_commit(self, tree, parent,
author, adate_sec, adate_tz,
diff --git a/lib/bup/rewrite.py b/lib/bup/rewrite.py
index eec972f1..f7d83648 100755
--- a/lib/bup/rewrite.py
+++ b/lib/bup/rewrite.py
@@ -11,7 +11,7 @@ import sqlite3, sys, time
from bup import hashsplit, metadata, vfs
from bup.commit import commit_message
from bup.compat import dataclass
-from bup.git import get_cat_data, parse_commit
+from bup.git import get_commit_items
from bup.hashsplit import \
(GIT_MODE_EXEC,
GIT_MODE_FILE,
@@ -604,7 +604,7 @@ class Rewriter:
tree = stack.pop() # and the root to get the tree
save_oidx = hexlify(save_path[2][1].coid)
- ci = parse_commit(get_cat_data(
srcrepo.cat(save_oidx), b'commit'))
+ ci = get_commit_items(save_oidx,
srcrepo.cat)
author = ci.author_name + b' <' + ci.author_mail + b'>'
committer = b'%s <%s@%s>' % (userfullname(), username(), hostname())
trailers = repairs.repair_trailers(
repairs.id)
diff --git a/lib/bup/rm.py b/lib/bup/rm.py
index 66cb371f..3d26efc8 100644
--- a/lib/bup/rm.py
+++ b/lib/bup/rm.py
@@ -8,7 +8,7 @@ from bup.helpers import add_error, die_if_errors, log, saved_errors
from
bup.io import path_msg
def append_commit(hash, parent, cp, writer):
- ci = get_commit_items(hash, cp)
+ ci = get_commit_items(hash, cp.get)
tree = unhexlify(ci.tree)
author = b'%s <%s>' % (ci.author_name, ci.author_mail)
committer = b'%s <%s>' % (ci.committer_name, ci.committer_mail)
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 799e217a..d12f1219 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -144,8 +144,7 @@ def get_ref(repo, ref):
If ref is missing, yield (None, None, None, None).
"""
- it =
repo.cat(ref)
- found_oidx, obj_t, size = next(it)
+ found_oidx, obj_t, size, it =
repo.cat(ref)
if not found_oidx:
return None, None, None, None
return found_oidx, obj_t, size, it
@@ -589,8 +588,7 @@ def root_items(repo, names=None, want_meta=True):
for ref in names:
if ref in (b'.', b'.tag'):
continue
- it =
repo.cat(b'refs/heads/' + ref)
- oidx, typ, size_ = next(it, None) # cannot return None
+ oidx, typ, size_, it =
repo.cat(b'refs/heads/' + ref)
if not oidx:
continue
assert typ == b'commit'
diff --git a/test/int/test_commit.py b/test/int/test_commit.py
index 6a4ec41f..49c6c8e7 100644
--- a/test/int/test_commit.py
+++ b/test/int/test_commit.py
@@ -58,7 +58,7 @@ def test_commit_parsing(tmpdir):
coff = (int(coffs[-4:-2]) * 60 * 60) + (int(coffs[-2:]) * 60)
if coffs[-5] == b'-'[0]:
coff = - coff
- commit_items = git.get_commit_items(commit, git.catpipe())
+ commit_items = git.get_commit_items(commit, git.catpipe().get)
assert parents == b''
WVPASSEQ(commit_items.parents, [])
WVPASSEQ(commit_items.tree, tree)
@@ -77,7 +77,7 @@ def test_commit_parsing(tmpdir):
exc(b'git', b'commit', b'-am', b'Do something else')
child = exo(b'git', b'show-ref', b'-s', b'main').strip()
parents = showval(child, b'%P')
- commit_items = git.get_commit_items(child, git.catpipe())
+ commit_items = git.get_commit_items(child, git.catpipe().get)
WVPASSEQ(commit_items.parents, [commit])
finally:
os.chdir(orig_cwd)
diff --git a/test/int/test_git.py b/test/int/test_git.py
index 453c0090..a5229596 100644
--- a/test/int/test_git.py
+++ b/test/int/test_git.py
@@ -302,7 +302,7 @@ def test_new_commit(tmpdir):
cdate_sec, cdate_tz_sec,
b'There is a small mailbox here')
- commit_items = git.get_commit_items(hexlify(commit), git.catpipe())
+ commit_items = git.get_commit_items(hexlify(commit), git.catpipe().get)
local_author_offset = localtime(adate_sec).tm_gmtoff
local_committer_offset = localtime(cdate_sec).tm_gmtoff
WVPASSEQ(tree, unhexlify(commit_items.tree))
@@ -317,7 +317,7 @@ def test_new_commit(tmpdir):
WVPASSEQ(cdate_sec, commit_items.committer_sec)
WVPASSEQ(local_committer_offset, commit_items.committer_offset)
- commit_items = git.get_commit_items(hexlify(commit_off), git.catpipe())
+ commit_items = git.get_commit_items(hexlify(commit_off), git.catpipe().get)
WVPASSEQ(tree, unhexlify(commit_items.tree))
WVPASSEQ(1, len(commit_items.parents))
WVPASSEQ(parent, unhexlify(commit_items.parents[0]))
@@ -397,16 +397,16 @@ def test_cat_pipe(tmpdir):
size = int(exo(b'git', b'--git-dir', bupdir,
b'cat-file', b'-s', b'src'))
- it = git.catpipe().get(b'src')
- assert (oidx, typ, size) == next(it)
- data = b''.join(it)
+ info = git.catpipe().get(b'src')
+ assert (oidx, typ, size) == info[:-1]
+ data = b''.join(info[3])
assert data.startswith(b'tree ')
assert b'\nauthor ' in data
assert b'\ncommitter ' in data
- it = git.catpipe().get(b'src', include_data=False)
- assert (oidx, typ, size) == next(it)
- assert b'' == b''.join(it)
+ info = git.catpipe().get(b'src', include_data=False)
+ assert (oidx, typ, size) == info[:-1]
+ assert info[3] is None
def _create_idx(d, i):
--
2.47.3