# Date 1348422117 25200
# Node ID 85c4b8e2e129975f400c9810eb9bf6ce6fea4c8b
# Parent ef583ac939de39b80aaff2d1d3d9f47bf1a1c9f3
Implement TreeTracker for incremental tree calculation
This class makes exporting Mercurial changesets to Git much faster.
diff --git a/hggit/git_handler.py b/hggit/git_handler.py
--- a/hggit/git_handler.py
+++ b/hggit/git_handler.py
@@ -1,13 +1,12 @@
import os, math, urllib, re
import stat, posixpath, StringIO
from dulwich.errors import HangupException, GitProtocolError, UpdateRefsError
-from dulwich.index import commit_tree
from dulwich.objects import Blob, Commit, Tag, Tree, parse_timezone, S_IFGITLINK
from dulwich.pack import create_delta, apply_delta
from dulwich.repo import Repo
from dulwich import client
from dulwich import config as dul_config
try:
from mercurial import bookmarks
@@ -24,16 +23,18 @@
from mercurial.node import hex, bin, nullid
from mercurial import context, util as hgutil
from mercurial import error
import _ssh
import util
from overlay import overlayrepo
+from .hg2git import TreeTracker
+
RE_GIT_AUTHOR = re.compile('^(.*?) ?\<(.*?)(?:\>(.*))?$')
RE_GIT_SANITIZE_AUTHOR = re.compile('[<>\n]')
RE_GIT_AUTHOR_EXTRA = re.compile('^(.*?)\ ext:\((.*)\) <(.*)\>$')
# Test for git:// and git+ssh:// URI.
# Support several URL forms, including separating the
@@ -323,32 +324,35 @@
def export_git_objects(self):
self.init_if_missing()
nodes = [self.repo.lookup(n) for n in self.repo]
export = [node for node in nodes if not hex(node) in self._map_hg]
total = len(export)
if total:
self.ui.status(_("exporting hg objects to git\n"))
+
+ tracker = TreeTracker(self.repo)
+
for i, rev in enumerate(export):
util.progress(self.ui, 'exporting', i, total=total)
ctx = self.repo.changectx(rev)
state = ctx.extra().get('hg-git', None)
if state == 'octopus':
self.ui.debug("revision %d is a part "
"of octopus explosion\n" % ctx.rev())
continue
- self.export_hg_commit(rev)
+ self.export_hg_commit(rev, tracker)
util.progress(self.ui, 'importing', None, total=total)
# convert this commit into git objects
# go through the manifest, convert all blobs/trees we don't have
# write the commit object (with metadata info)
- def export_hg_commit(self, rev):
+ def export_hg_commit(self, rev, tracker):
self.ui.note(_("converting revision %s\n") % hex(rev))
oldenc = self.swap_out_encoding()
ctx = self.repo.changectx(rev)
extra = ctx.extra()
commit = Commit()
@@ -390,17 +394,21 @@
commit.parents.append(git_sha)
commit.message = self.get_git_message(ctx)
if 'encoding' in extra:
commit.encoding = extra['encoding']
- tree_sha = commit_tree(self.git.object_store, self.iterblobs(ctx))
+ for obj in tracker.update_changeset(ctx):
+ self.git.object_store.add_object(obj)
+
+ tree_sha = tracker.root_tree_sha
+
if tree_sha not in self.git.object_store:
raise hgutil.Abort(_('Tree SHA-1 not present in Git repo: %s' %
tree_sha))
commit.tree = tree_sha
self.git.object_store.add_object(commit)
self.map_set(commit.id, ctx.hex())
@@ -536,53 +544,16 @@
add_extras = True
extra_message += "extra : " + key + " : " + urllib.quote(value) + "\n"
if add_extras:
message += "\n--HG--\n" + extra_message
return message
- def iterblobs(self, ctx):
- if '.hgsubstate' in ctx:
- hgsub = util.OrderedDict()
- if '.hgsub' in ctx:
- hgsub = util.parse_hgsub(ctx['.hgsub'].data().splitlines())
- hgsubstate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines())
- for path, sha in hgsubstate.iteritems():
- try:
- if path in hgsub and not hgsub[path].startswith('[git]'):
- # some other kind of a repository (e.g. [hg])
- # that keeps its state in .hgsubstate, shall ignore
- continue
- yield path, sha, S_IFGITLINK
- except ValueError:
- pass
-
- for f in ctx:
- if f == '.hgsubstate' or f == '.hgsub':
- continue
- fctx = ctx[f]
- blobid = self.map_git_get(hex(fctx.filenode()))
-
- if not blobid:
- blob = Blob.from_string(fctx.data())
- self.git.object_store.add_object(blob)
- self.map_set(blob.id, hex(fctx.filenode()))
- blobid = blob.id
-
- if 'l' in ctx.flags(f):
- mode = 0120000
- elif 'x' in ctx.flags(f):
- mode = 0100755
- else:
- mode = 0100644
-
- yield f, blobid, mode
-
def getnewgitcommits(self, refs=None):
self.init_if_missing()
# import heads and fetched tags as remote references
todo = []
done = set()
convert_list = {}
diff --git a/hggit/hg2git.py b/hggit/hg2git.py
new file mode 100644
--- /dev/null
+++ b/hggit/hg2git.py
@@ -0,0 +1,205 @@
+# This file contains code dealing specifically with converting Mercurial
+# repositories to Git repositories. Code in this file is meant to be a generic
+# library and should be usable outside the context of hg-git or an hg command.
+
+import os
+import stat
+
+from dulwich.objects import Blob
+from dulwich.objects import S_IFGITLINK
+from dulwich.objects import TreeEntry
+from dulwich.objects import Tree
+
+from mercurial import error as hgerror
+from mercurial.node import nullrev
+
+from . import util
+
+class TreeTracker(object):
+ """Tracks Git tree objects across Mercurial revisions.
+
+ The purpose of this class is to facilitate Git tree export that is more
+ efficient than brute force. The tree calculation part of this class is
+ essentially a reimplementation of dulwich.index.commit_tree. However, since
+ our implementation reuses Tree instances and only recalculates SHA-1 when
+ things change, we are much more efficient.
+
+ Callers instantiate this class against a mercurial.localrepo instance. They
+ then associate the tracker with a specific changeset by calling
+ update_changeset(). That function emits Git objects that need to be
+ exported to a Git repository. Callers then typically obtain the
+ root_tree_sha and use that as part of assembling a Git commit.
+ """
+
+ def __init__(self, hg_repo):
+ self._hg = hg_repo
+ self._rev = nullrev
+ self._dirs = {}
+ self._blob_cache = {}
+
+ @property
+ def root_tree_sha(self):
+ return self._dirs[''].id
+
+ def update_changeset(self, ctx):
+ """Set the tree to track a new Mercurial changeset.
+
+ This is a generator of dulwich Git objects. Each returned object can be
+ added to a Git store via add_object(). Some objects may already exist
+ in the Git repository. Emitted objects are either Blob or Tree
+ instances.
+
+ Emitted objects are those that have changed since the last call to
+ update_changeset.
+ """
+ # In theory we should be able to look at changectx.files(). This is
+ # *much* faster. However, it may not be accurate, especially with older
+ # repositories, which may not record things like deleted files
+ # explicitly in the manifest (which is where files() gets its data).
+ # The only reliable way to get the full set of changes is by looking at
+ # the full manifest. And, the easy way to compare two manifests is
+ # localrepo.status().
+
+ # The other members of status are only relevant when looking at the
+ # working directory.
+ modified, added, removed = self._hg.status(self._rev, ctx.rev())[0:3]
+
+ for path in sorted(removed, key=len, reverse=True):
+ d = os.path.dirname(path)
+ tree = self._dirs.get(d, Tree())
+
+ del tree[os.path.basename(path)]
+
+ if not len(tree):
+ self._remove_tree(d)
+ continue
+
+ self._dirs[d] = tree
+
+ for path in sorted(set(modified) | set(added), key=len, reverse=True):
+ if path == '.hgsubstate':
+ self._handle_subrepos(ctx)
+ continue
+
+ if path == '.hgsub':
+ continue
+
+ d = os.path.dirname(path)
+ tree = self._dirs.get(d, Tree())
+
+ fctx = ctx[path]
+
+ entry, blob = TreeTracker.tree_entry(fctx, self._blob_cache)
+ if blob is not None:
+ yield blob
+
+ tree.add(*entry)
+ self._dirs[d] = tree
+
+ for obj in self._populate_tree_entries():
+ yield obj
+
+ self._rev = ctx.rev()
+
+ def _remove_tree(self, path):
+ try:
+ del self._dirs[path]
+ except KeyError:
+ return
+
+ # Now we traverse up to the parent and delete any references.
+ if path == '':
+ return
+
+ basename = os.path.basename(path)
+ parent = os.path.dirname(path)
+ while True:
+ tree = self._dirs.get(parent, None)
+
+ # No parent entry. Nothing to remove or update.
+ if tree is None:
+ return
+
+ try:
+ del tree[basename]
+ except KeyError:
+ return
+
+ if len(tree):
+ return
+
+ # The parent tree is empty. So, we can delete it.
+ del self._dirs[parent]
+
+ if parent == '':
+ return
+
+ basename = os.path.basename(parent)
+ parent = os.path.dirname(parent)
+
+ def _populate_tree_entries(self):
+ if '' not in self._dirs:
+ self._dirs[''] = Tree()
+
+ # Fill in missing directories.
+ for path in self._dirs.keys():
+ parent = os.path.dirname(path)
+
+ while parent != '':
+ parent_tree = self._dirs.get(parent, None)
+
+ if parent_tree is not None:
+ break
+
+ self._dirs[parent] = Tree()
+ parent = os.path.dirname(parent)
+
+ # TODO only emit trees that have been modified.
+ for d in sorted(self._dirs.keys(), key=len, reverse=True):
+ tree = self._dirs[d]
+ yield tree
+
+ if d == '':
+ continue
+
+ parent_tree = self._dirs[os.path.dirname(d)]
+ parent_tree[os.path.basename(d)] = (stat.S_IFDIR, tree.id)
+
+ def _handle_subrepos(self, ctx):
+ substate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines())
+ sub = util.OrderedDict()
+
+ if '.hgsub' in ctx:
+ sub = util.parse_hgsub(ctx['.hgsub'].data().splitlines())
+
+ for path, sha in substate.iteritems():
+ # Ignore non-Git repositories keeping state in .hgsubstate.
+ if path in sub and not sub[path].startswith('[git]'):
+ continue
+
+ d = os.path.dirname(path)
+ tree = self._dirs.get(d, Tree())
+ tree.add(os.path.basename(path), S_IFGITLINK, sha)
+ self._dirs[d] = tree
+
+ @staticmethod
+ def tree_entry(fctx, blob_cache):
+ blob_id = blob_cache.get(fctx.filenode(), None)
+ blob = None
+
+ if blob_id is None:
+ blob = Blob.from_string(fctx.data())
+ blob_id = blob.id
+ blob_cache[fctx.filenode()] = blob_id
+
+ flags = fctx.flags()
+
+ if 'l' in flags:
+ mode = 0120000
+ elif 'x' in flags:
+ mode = 0100755
+ else:
+ mode = 0100644
+
+ return (TreeEntry(os.path.basename(fctx.path()), mode, blob_id), blob)
+