[PATCH] Only export modified Git trees

59 views
Skip to first unread message

Gregory Szorc

unread,
Apr 14, 2013, 7:13:10 PM4/14/13
to hg-...@googlegroups.com
# HG changeset patch
# User Gregory Szorc <gregor...@gmail.com>
# Date 1365963101 25200
# Node ID 068504975690139254aa0cb310ccffc42ba7be5a
# Parent 5bbf8d1a304a6efe9ccc70c06effec095062ff29
Only export modified Git trees

Previously, we emitted every Git tree when updating between Mercurial
changesets. With this patch, we now only emit Git trees that changed. A
side-effect of the implementation is that we now only update in-memory
Git trees objects that changed. Before, we always touched Git trees,
invalidating them in the process and causing Dulwich to recalculate
their SHA-1. Profiling revealed this to be expensive and removing the
extra calculation shows a nice performance win.

Another optimization is to not sort the order that changed paths are
processed in. Previously, we sorted by length, longest to shortest.
Profiling revealed that the sorts took a non-trivial amount of time.
While sorted execution resulted in likely idempotent behavior, it
shouldn't be strictly required.

On the author's machine, conversion of the Mercurial repository itself
decreased from ~493s to ~333s. Even more impressive is conversion of
Firefox's main repository (which is considerably larger). Converting the
first 200 revisions of that repository decreased from ~152s to ~42s.

diff --git a/hggit/hg2git.py b/hggit/hg2git.py
--- a/hggit/hg2git.py
+++ b/hggit/hg2git.py
@@ -101,62 +101,68 @@
# *much* faster. However, it may not be accurate, especially with older
# repositories, which may not record things like deleted files
# explicitly in the manifest (which is where files() gets its data).
# The only reliable way to get the full set of changes is by looking at
# the full manifest. And, the easy way to compare two manifests is
# localrepo.status().
modified, added, removed = self._hg.status(self._rev, ctx.rev())[0:3]

+ # We track which directories/trees have modified in this update and we
+ # only export those.
+ dirty_trees = set()
+
# We first process file removals so we can prune dead trees.
- for path in sorted(removed, key=len, reverse=True):
+ for path in removed:
d = os.path.dirname(path)
tree = self._dirs.get(d, dulobjs.Tree())

del tree[os.path.basename(path)]
+ dirty_trees.add(d)

# If removing this file made the tree empty, we should delete this
# tree. This could result in parent trees losing their only child
# and so on.
if not len(tree):
self._remove_tree(d)
continue

self._dirs[d] = tree

# For every file that changed or was added, we need to calculate the
# corresponding Git blob and its tree entry. We emit the blob
# immediately and update trees to be aware of its presence.
- for path in sorted(set(modified) | set(added), key=len, reverse=True):
+ for path in set(modified) | set(added):
# Handle special Mercurial paths.
if path == '.hgsubstate':
- self._handle_subrepos(ctx)
+ self._handle_subrepos(ctx, dirty_trees)
continue

if path == '.hgsub':
continue

d = os.path.dirname(path)
tree = self._dirs.setdefault(d, dulobjs.Tree())
+ dirty_trees.add(d)

fctx = ctx[path]

entry, blob = IncrementalChangesetExporter.tree_entry(fctx,
self._blob_cache)
if blob is not None:
yield (blob, fctx.filenode())

tree.add(*entry)

# Now that all the trees represent the current changeset, recalculate
# the tree IDs and emit them. Note that we wait until now to calculate
# tree SHA-1s. This is an important difference between us and
# dulwich.index.commit_tree(), which builds new Tree instances for each
# series of blobs.
- for obj in self._populate_tree_entries():
+ for obj in self._populate_tree_entries(dirty_trees):
yield (obj, None)

self._rev = ctx.rev()

def _remove_tree(self, path):
"""Remove a (presumably empty) tree from the current changeset.

A now-empty tree may be the only child of its parent. So, we traverse
@@ -192,60 +198,83 @@
del self._dirs[parent]

if parent == '':
return

basename = os.path.basename(parent)
parent = os.path.dirname(parent)

- def _populate_tree_entries(self):
+ def _populate_tree_entries(self, dirty_trees):
self._dirs.setdefault('', dulobjs.Tree())

# Fill in missing directories.
for path in self._dirs.keys():
parent = os.path.dirname(path)

while parent != '':
parent_tree = self._dirs.get(parent, None)

if parent_tree is not None:
break

self._dirs[parent] = dulobjs.Tree()
parent = os.path.dirname(parent)

- # TODO only emit trees that have been modified.
- for d in sorted(self._dirs.keys(), key=len, reverse=True):
- tree = self._dirs[d]
+ for dirty in list(dirty_trees):
+ parent = os.path.dirname(dirty)
+
+ while parent != '':
+ if parent in dirty_trees:
+ break
+
+ dirty_trees.add(parent)
+ parent = os.path.dirname(parent)
+
+ # The root tree is always dirty but doesn't always get updated.
+ dirty_trees.add('')
+
+ # We only need to recalculate and export dirty trees.
+ for d in sorted(dirty_trees, key=len, reverse=True):
+ # Only happens for deleted directories.
+ try:
+ tree = self._dirs[d]
+ except KeyError:
+ continue
+
yield tree

if d == '':
continue

parent_tree = self._dirs[os.path.dirname(d)]

# Accessing the tree's ID is what triggers SHA-1 calculation and is
# the expensive part (at least if the tree has been modified since
- # the last time we retrieved its ID).
+ # the last time we retrieved its ID). Also, assigning an entry to a
+ # tree (even if it already exists) invalidates the existing tree
+ # and incurs SHA-1 recalculation. So, it's in our interest to avoid
+ # invalidating trees. Since we only update the entries of dirty
+ # trees, this should hold true.
parent_tree[os.path.basename(d)] = (stat.S_IFDIR, tree.id)

- def _handle_subrepos(self, ctx):
+ def _handle_subrepos(self, ctx, dirty_trees):
substate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines())
sub = util.OrderedDict()

if '.hgsub' in ctx:
sub = util.parse_hgsub(ctx['.hgsub'].data().splitlines())

for path, sha in substate.iteritems():
# Ignore non-Git repositories keeping state in .hgsubstate.
if path in sub and not sub[path].startswith('[git]'):
continue

d = os.path.dirname(path)
+ dirty_trees.add(d)
tree = self._dirs.setdefault(d, dulobjs.Tree())
tree.add(os.path.basename(path), dulobjs.S_IFGITLINK, sha)

@staticmethod
def tree_entry(fctx, blob_cache):
"""Compute a dulwich TreeEntry from a filectx.

A side effect is the TreeEntry is stored in the passed cache.
Reply all
Reply to author
Forward
0 new messages