Documentation/bup-repack.md | 67 ++++++++++++++++++++++
README.md | 5 ++
cmd/repack-cmd.py | 132 ++++++++++++++++++++++++++++++++++++++++++++
lib/bup/git.py | 69 +++++++++++++++++++++++
t/test.sh | 48 +++++++++++++++-
5 files changed, 319 insertions(+), 2 deletions(-)
create mode 100644 Documentation/bup-repack.md
create mode 100755 cmd/repack-cmd.py
diff --git a/Documentation/bup-repack.md b/Documentation/bup-repack.md
new file mode 100644
index 0000000..2bc07f5
--- /dev/null
+++ b/Documentation/bup-repack.md
@@ -0,0 +1,67 @@
+% bup-repack(1) Bup %BUP_VERSION%
+% Zoran Zaric <z...@zoranzaric.de>
+% %BUP_DATE%
+
+# NAME
+
+bup-repack - repack a repository to free up space
+
+# SYNOPSIS
+
+bup repack [-n] [-q] [-#] [-f]
+
+# DESCRIPTION
+
+`bup repack` repacks all objects in a repository into new
+packfiles. It traverses the history and saves only needed
+objects.
+
+`bup repack` iterates over the existing packfiles and
+deletes each one after writing its last needed object to
+the new packfile. Because of this, repacking a repository
+may temporarily need additional disk space of up to the
+size of the biggest existing packfile.
+
+# OPTIONS
+
+-n, --dry-run
+: don't do anything, just print out what would be done.
+
+-q, --quiet
+: disable progress messages.
+
+-*#*, --compress=*#*
+: set the compression level to # (a value from 0-9, where
+ 9 is the highest and 0 is no compression). The default
+ is 1 (fast, loose compression). WARNING: Changing the
+ compression level will change all objects and will result
+ in duplicate objects if the new compression level isn't
+ set on new saves.
+
+-f, --force
+: ignore free space check.
+
+
+# EXAMPLE
+
+ $ bup index -ux /etc
+ Indexing: 1981, done.
+
+ $ bup save -r myserver: -n my-pc-backup --bwlimit=50k /etc
+ Reading index: 1981, done.
+ Saving: 100.00% (998/998k, 1981/1981 files), done.
+
+ $ bup repack
+ Traversing my-pc-backup to find needed objects...
+ Traversing objects: 54323, done.
+ Writing new packfiles...
+ Writing objects: 28115, done.
+
+
+# SEE ALSO
+
+`bup-save`(1)
+
+# BUP
+
+Part of the `bup`(1) suite.
diff --git a/README.md b/README.md
index b3a937e..4b0e516 100644
--- a/README.md
+++ b/README.md
@@ -332,6 +332,11 @@ mailing list (see below) if you'd like to help.
We'll have to do it in a totally different way. There are lots of
options. For now: make sure you've got lots of disk space :)
+ UPDATE: With `bup repack` you actually can prune away old backups. Making
+ backups unreachable from backup-sets doesn't have a UI yet though and is
+ left as an exercise for the reader. *hint* *hint* `git filter-branch` can be
+ useful for this.
+
- bup has never been tested on anything but Linux, MacOS, and Windows+Cygwin.
There's nothing that makes it *inherently* non-portable, though, so
diff --git a/cmd/repack-cmd.py b/cmd/repack-cmd.py
new file mode 100755
index 0000000..02f0bbd
--- /dev/null
+++ b/cmd/repack-cmd.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+import sys, os
+from bup import git, options
+from bup.helpers import *
+
+def run(argv):
+    """Run argv as a child process with its stdout sent to our stderr.
+
+    Waits for the child to finish and returns its exit status.
+    """
+    # At least in python 2.5, passing "stdout=2" or "stdout=sys.stderr"
+    # doesn't actually work, because subprocess closes fd #2 right before
+    # execing for some reason.  Hand the child a duplicate of the fd
+    # instead, and always release that duplicate afterwards.
+    stderr_copy = os.dup(2)
+    try:
+        return subprocess.call(argv, stdout=stderr_copy, close_fds=False)
+    finally:
+        os.close(stderr_copy)
+
+
+# Command line definition handed to bup's option parser.
+optspec = """
+bup repack
+--
+q,quiet don't show progress meter
+v,verbose increase log output (can be used more than once)
+n,dry-run don't do anything, just print what would be done
+f,force ignore the space check
+#,compress= set compression level to # (0-9, 9 is highest) [1] (See WARNING in manpage!)
+"""
+o = options.Options(optspec)
+(opt, flags, extra) = o.parse(sys.argv[1:])
+
+git.check_repo_or_die()
+
+handle_ctrl_c()
+
+# Refuse to start unless the repository's filesystem has room for at
+# least two maximum-size packs; -f/--force skips the check entirely.
+if not opt.force:
+    # os.statvfs() only works on unix
+    vfs_stats = os.statvfs(git.repo())
+    # f_bavail is counted in f_frsize units (the fundamental block size).
+    # f_bsize is only the preferred I/O size and may differ from it.
+    free_space = vfs_stats.f_frsize * vfs_stats.f_bavail
+    if free_space < git.max_pack_size * 2:
+        o.fatal("insufficient space")
+
+cp = git.CatPipe()
+
+opt.progress = (istty2 and not opt.quiet)
+refs = git.list_refs()
+refnames = [name for name, sha in refs]
+
+# total_objects is the length of the pack index list; it is used as the
+# denominator of the traversal progress output.
+pl = git.PackIdxList(git.repo('objects/pack'))
+total_objects = len(pl)
+
+# Records which objects of the existing packs are still reachable.
+needed_objects = git.NeededObjects(pl)
+
+# Find needed objects reachable from commits
+# Counts every object yielded by a traversal (branches here, tags below).
+traversed_objects_counter = 0
+
+for refname in refnames:
+ # Only branch heads are walked in this loop; other refs are skipped.
+ if not refname.startswith('refs/heads/'):
+ continue
+ # refname[11:] drops the leading 'refs/heads/' (11 characters).
+ log('Traversing %s to find needed objects...\n' % refname[11:])
+ # git.rev_list() shas are hex-encoded before being handed to
+ # git.traverse_commit(), which marks what it visits in needed_objects.
+ for date, sha in ((date, sha.encode('hex')) for date, sha in
+ git.rev_list(refname)):
+ for type, sha_ in git.traverse_commit(cp, sha, needed_objects):
+ traversed_objects_counter += 1
+ qprogress('Traversing objects (%d/%d)\r' %
+ (traversed_objects_counter, total_objects))
+
+# Find needed objects reachable from tags
+tags = git.tags()
+for key in tags:
+    # tags maps each tagged object to the list of tag names pointing at it.
+    log('Traversing tag %s to find needed objects...\n' % ", ".join(tags[key]))
+    # Start the walk at the object this tag points at.  The previous code
+    # passed whatever 'sha' was left over from the branch loop, so history
+    # reachable only through a tag was never marked as needed.
+    # NOTE(review): assumes the keys of git.tags() are binary shas, like
+    # the ones git.rev_list() yields -- confirm against git.tags().
+    for type, sha_ in git.traverse_commit(cp, key.encode('hex'),
+                                          needed_objects):
+        traversed_objects_counter += 1
+        qprogress('Traversing objects (%d/%d)\r' %
+                  (traversed_objects_counter, total_objects))
+# Final traversal report: mention the skipped count only when the number
+# of traversed objects differs from total_objects.
+skipped_objects = total_objects - traversed_objects_counter
+if skipped_objects != 0:
+    progress('Traversing objects (%d/%d), done. Skipped %d\n' %
+             (traversed_objects_counter, total_objects, skipped_objects))
+else:
+    progress('Traversing objects (%d/%d), done.\n' %
+             (traversed_objects_counter, total_objects))
+
+
+# Nothing was reached from any ref, so stop here with an error.
+if not traversed_objects_counter:
+    o.fatal('No reachable objects found.')
+
+
+# Two pack writers are used when actually repacking: blobs go to
+# blob_writer, every other object type goes to w.  Neither exists in
+# dry-run mode.
+if not opt.dry_run:
+ blob_writer = git.PackWriter(compression_level=opt.compress)
+ w = git.PackWriter(compression_level=opt.compress)
+
+log('Writing new packfiles...\n')
+# Set once an old pack had a par2 file, so fsck -g is run at the end.
+par2 = False
+written_object_counter = 0
+# Walk every old pack, copy the objects flagged in its bitarray, then
+# remove the old pack's files.
+# NOTE(review): the original indentation was lost in this copy of the
+# patch; the nesting of the statements below must be checked against the
+# real file before editing.
+for pack in needed_objects.packs:
+ ba = needed_objects.get_bitarray_for_pack(
pack.name)
+ for offset, sha in pack.hashes_sorted_by_ofs():
+ idx = pack._idx_from_hash(sha)
+ if idx in ba:
+ # Needed object: read its type and full content through the CatPipe.
+ it = iter(cp.get(sha.encode('hex')))
+ type = it.next()
+ content = "".join(it)
+ if not opt.dry_run:
+ if opt.verbose:
+ print "writing %s %s" % (sha.encode('hex'), type)
+ if type == 'blob':
+ blob_writer._write(sha, type, content)
+ else:
+ w._write(sha, type, content)
+ # Clear the mark again; NeededObjects.remove() does this for every pack.
+ needed_objects.remove(sha.encode('hex'))
+ written_object_counter += 1
+ qprogress('Writing objects: %d\r' % written_object_counter)
+ else:
+ # NOTE(review): the object is looked up here only to learn its type
+ # for the verbose message, yet the lookup happens even without -v.
+ it = iter(cp.get(sha.encode('hex')))
+ type = it.next()
+ if opt.verbose:
+ print "not writing %s %s" % (sha.encode('hex'), type)
+ if not opt.dry_run:
+ # pack.name minus its last three characters, plus "pack"/"par2", gives
+ # the sibling files -- presumably pack.name ends in "idx"; confirm.
+ os.unlink(
pack.name)
+ os.unlink(
pack.name[:-3] + "pack")
+ if os.path.exists(
pack.name[:-3] + "par2"):
+ par2 = True
+ os.unlink(
pack.name[:-3] + "par2")
+progress('Writing objects: %d, done.\n' % written_object_counter)
+
+# Close both writers, then run 'bup fsck -g' if any removed pack had a
+# par2 file.
+if not opt.dry_run:
+ blob_writer.close()
+ w.close()
+ if par2:
+ run(['bup', 'fsck', '-g'])
+
diff --git a/lib/bup/git.py b/lib/bup/git.py
index 8048524..d3cc13a 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -1060,3 +1060,72 @@ def tags():
tags[c].append(name) # more than one tag can point at 'c'
return tags
+
+def traverse_commit(cp, sha_hex, needed_objects):
+    """Yield (type, hex sha) for a commit and everything below its tree.
+
+    cp is used to read objects via cp.get(); needed_objects records what
+    has been visited.  A commit already present in needed_objects yields
+    nothing.  Asserts that sha_hex names a commit object.
+    """
+    if sha_hex in needed_objects:
+        return
+    needed_objects.add(sha_hex)
+    yield ('commit', sha_hex)
+
+    chunks = iter(cp.get(sha_hex))
+    obj_type = chunks.next()
+    assert(obj_type == 'commit')
+    # The first line of the commit is read as "tree <sha>"; slicing off
+    # the first five characters leaves the tree's hex sha.
+    first_line = "".join(chunks).split("\n")[0]
+    tree_sha = first_line[5:].rstrip(" ")
+    for obj in traverse_objects(cp, tree_sha, needed_objects):
+        yield obj
+
+
+def traverse_objects(cp, sha_hex, needed_objects):
+    """Recursively yield (type, hex sha) for objects reachable from sha_hex.
+
+    Every visited object is added to needed_objects first; an object that
+    is already in there is skipped together with everything below it.
+    Commits continue into their tree, trees into each of their entries,
+    and blobs end the recursion.
+    """
+    if sha_hex in needed_objects:
+        return
+    needed_objects.add(sha_hex)
+    chunks = iter(cp.get(sha_hex))
+    obj_type = chunks.next()
+
+    if obj_type == 'commit':
+        yield ('commit', sha_hex)
+
+        # The first line of the commit is read as "tree <sha>".
+        first_line = "".join(chunks).split("\n")[0]
+        tree_sha = first_line[5:].rstrip(" ")
+
+        for obj in traverse_objects(cp, tree_sha, needed_objects):
+            yield obj
+
+    elif obj_type == 'tree':
+        yield ('tree', sha_hex)
+
+        # tree_decode() yields binary shas; recurse with their hex form.
+        for (mode, mangled_name, sha) in tree_decode("".join(chunks)):
+            child_hex = sha.encode('hex')
+            for obj in traverse_objects(cp, child_hex, needed_objects):
+                yield obj
+
+    elif obj_type == 'blob':
+        yield ('blob', sha_hex)
+
+class NeededObjects():
+    """Per-pack record of which objects are still needed.
+
+    One BitArray is kept for each pack index, keyed by the index's name
+    and sized to the number of objects in that pack.  Objects are passed
+    in as hex shas and mapped to a position with pack._idx_from_hash().
+    NOTE(review): what _idx_from_hash() returns for a pack that does not
+    hold the object is not visible here; add() and remove() apply it to
+    every pack.
+    """
+    def __init__(self, pack_idx_list):
+        # Only PackIdx instances from the list are tracked.
+        self.packs = [candidate for candidate in pack_idx_list.packs
+                      if isinstance(candidate, PackIdx)]
+        self.pack_bitarrays = dict((pack.name, BitArray(len(pack)))
+                                   for pack in self.packs)
+
+    def __contains__(self, sha):
+        """Return True if sha (hex) is marked in any pack's bitarray."""
+        return any(pack._idx_from_hash(sha.decode('hex'))
+                   in self.pack_bitarrays[pack.name]
+                   for pack in self.packs)
+
+    def add(self, sha):
+        """Mark sha (hex) in the bitarray of every pack."""
+        for pack in self.packs:
+            position = pack._idx_from_hash(sha.decode('hex'))
+            bits = self.pack_bitarrays[pack.name]
+            bits.add(position)
+
+    def remove(self, sha):
+        """Clear the mark for sha (hex) in the bitarray of every pack."""
+        for pack in self.packs:
+            position = pack._idx_from_hash(sha.decode('hex'))
+            bits = self.pack_bitarrays[pack.name]
+            bits.remove(position)
+
+    def get_bitarray_for_pack(self, name):
+        """Return the bitarray stored under name, or None if unknown."""
+        return self.pack_bitarrays.get(name)
diff --git a/t/test.sh b/t/test.sh
index 2f1f24c..b1f1e9b 100755
--- a/t/test.sh
+++ b/t/test.sh
@@ -215,8 +215,6 @@ WVSTART "save/git-fsck"
(
set -e
cd "$BUP_DIR" || exit 1
- #git repack -Ad
- #git prune
(cd "$TOP/t/sampledata" && WVPASS bup save -vvn master /) || WVFAIL
n=$(git fsck --full --strict 2>&1 |
egrep -v 'dangling (commit|tree|blob)' |
@@ -484,3 +482,49 @@ WVPASSEQ "$(bup ls compression/latest/ | sort)" "$(ls $TOP/Documentation | sort)
COMPRESSION_9_SIZE=$(du -s $D | cut -f1)
WVPASS [ "$COMPRESSION_9_SIZE" -lt "$COMPRESSION_0_SIZE" ]
+
+# End-to-end test for 'bup repack': save the same tree several times,
+# repack in between, and check the repository is still intact.
+WVSTART 'repack'
+D=repack.tmp
+export BUP_DIR="$TOP/$D/.bup"
+rm -rf $D
+mkdir $D
+# 10 MB of random data as the payload to back up.
+dd if=/dev/urandom of=$D/repack-file bs=1M count=10
+bup init
+
+# Index and save test tree to source bupdir
+bup index -ux "$D"
+bup save -n repack "$D"
+# Tag the first save so a tag exists when repacking.
+bup tag foo repack
+
+# NOTE(review): presumably keeps the two saves from getting the same
+# timestamp -- confirm.
+sleep 3
+
+bup index -ux "$D"
+bup save -n repack "$D"
+
+# -f skips the free-space check.
+WVPASS bup repack -f
+
+bup index -ux "$D"
+bup save -n repack "$D"
+
+WVPASS bup repack -f
+WVPASS bup fsck
+
+# The saved file must still restore byte-for-byte after repacking.
+WVPASS bup restore repack/latest$TOP/$D/repack-file -C $D/out
+WVPASS diff $D/repack-file $D/out/repack-file
+
+bup index -ux "$D"
+bup save -n repack "$D"
+
+# Dry run: -n only reports what would be done.
+WVPASS bup repack -n -f
+
+bup index -ux "$D"
+bup save -n repack "$D"
+# The rest only runs when 'bup fsck --par2-ok' succeeds.
+if bup fsck --par2-ok; then
+ bup fsck -g
+
+ WVPASS bup repack -f
+
+ # NOTE(review): five saves are made above; confirm where the expected
+ # count of 7 entries comes from.
+ WVPASSEQ $(bup ls repack/ | wc -l) "7"
+ # Number of *pack files must equal the number of *par2 files whose
+ # names do not contain "vol".
+ WVPASSEQ $(ls "$BUP_DIR/objects/pack" | grep "pack$" | wc -l) $(ls "$BUP_DIR/objects/pack" | grep "par2$" | grep -v "vol" | wc -l)
+fi
+
--
1.7.12.4