# Date 1348422117 25200
# Node ID 85c4b8e2e129975f400c9810eb9bf6ce6fea4c8b
# Parent  ef583ac939de39b80aaff2d1d3d9f47bf1a1c9f3
Implement TreeTracker for incremental tree calculation
This class makes exporting Mercurial changesets to Git much faster.
diff --git a/hggit/git_handler.py b/hggit/git_handler.py
--- a/hggit/git_handler.py
+++ b/hggit/git_handler.py
@@ -1,13 +1,12 @@
 import os, math, urllib, re
 import stat, posixpath, StringIO
 
 from dulwich.errors import HangupException, GitProtocolError, UpdateRefsError
-from dulwich.index import commit_tree
 from dulwich.objects import Blob, Commit, Tag, Tree, parse_timezone, S_IFGITLINK
 from dulwich.pack import create_delta, apply_delta
 from dulwich.repo import Repo
 from dulwich import client
 from dulwich import config as dul_config
 
 try:
     from mercurial import bookmarks
@@ -24,16 +23,18 @@
 from mercurial.node import hex, bin, nullid
 from mercurial import context, util as hgutil
 from mercurial import error
 
 import _ssh
 import util
 from overlay import overlayrepo
 
+from .hg2git import TreeTracker
+
 RE_GIT_AUTHOR = re.compile('^(.*?) ?\<(.*?)(?:\>(.*))?$')
 
 RE_GIT_SANITIZE_AUTHOR = re.compile('[<>\n]')
 
 RE_GIT_AUTHOR_EXTRA = re.compile('^(.*?)\ ext:\((.*)\) <(.*)\>$')
 
 # Test for git:// and git+ssh:// URI.
 # Support several URL forms, including separating the
@@ -323,32 +324,35 @@
     def export_git_objects(self):
         self.init_if_missing()
 
         nodes = [self.repo.lookup(n) for n in self.repo]
         export = [node for node in nodes if not hex(node) in self._map_hg]
         total = len(export)
         if total:
             self.ui.status(_("exporting hg objects to git\n"))
+
+        tracker = TreeTracker(self.repo)
+
         for i, rev in enumerate(export):
             util.progress(self.ui, 'exporting', i, total=total)
             ctx = self.repo.changectx(rev)
             state = ctx.extra().get('hg-git', None)
             if state == 'octopus':
                 self.ui.debug("revision %d is a part "
                               "of octopus explosion\n" % ctx.rev())
                 continue
-            self.export_hg_commit(rev)
+            self.export_hg_commit(rev, tracker)
         util.progress(self.ui, 'importing', None, total=total)
 
 
     # convert this commit into git objects
     # go through the manifest, convert all blobs/trees we don't have
     # write the commit object (with metadata info)
-    def export_hg_commit(self, rev):
+    def export_hg_commit(self, rev, tracker):
         self.ui.note(_("converting revision %s\n") % hex(rev))
 
         oldenc = self.swap_out_encoding()
 
         ctx = self.repo.changectx(rev)
         extra = ctx.extra()
 
         commit = Commit()
@@ -390,17 +394,21 @@
 
                 commit.parents.append(git_sha)
 
         commit.message = self.get_git_message(ctx)
 
         if 'encoding' in extra:
             commit.encoding = extra['encoding']
 
-        tree_sha = commit_tree(self.git.object_store, self.iterblobs(ctx))
+        for obj in tracker.update_changeset(ctx):
+            self.git.object_store.add_object(obj)
+
+        tree_sha = tracker.root_tree_sha
+
         if tree_sha not in self.git.object_store:
             raise hgutil.Abort(_('Tree SHA-1 not present in Git repo: %s' %
                 tree_sha))
 
         commit.tree = tree_sha
 
         self.git.object_store.add_object(commit)
         self.map_set(commit.id, ctx.hex())
@@ -536,53 +544,16 @@
                 add_extras = True
                 extra_message += "extra : " + key + " : " +  urllib.quote(value) + "\n"
 
         if add_extras:
             message += "\n--HG--\n" + extra_message
 
         return message
 
-    def iterblobs(self, ctx):
-        if '.hgsubstate' in ctx:
-            hgsub = util.OrderedDict()
-            if '.hgsub' in ctx:
-                hgsub = util.parse_hgsub(ctx['.hgsub'].data().splitlines())
-            hgsubstate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines())
-            for path, sha in hgsubstate.iteritems():
-                try:
-                    if path in hgsub and not hgsub[path].startswith('[git]'):
-                        # some other kind of a repository (e.g. [hg])
-                        # that keeps its state in .hgsubstate, shall ignore
-                        continue
-                    yield path, sha, S_IFGITLINK
-                except ValueError:
-                    pass
-
-        for f in ctx:
-            if f == '.hgsubstate' or f == '.hgsub':
-                continue
-            fctx = ctx[f]
-            blobid = self.map_git_get(hex(fctx.filenode()))
-
-            if not blobid:
-                blob = Blob.from_string(fctx.data())
-                self.git.object_store.add_object(blob)
-                self.map_set(blob.id, hex(fctx.filenode()))
-                blobid = blob.id
-
-            if 'l' in ctx.flags(f):
-                mode = 0120000
-            elif 'x' in ctx.flags(f):
-                mode = 0100755
-            else:
-                mode = 0100644
-
-            yield f, blobid, mode
-
     def getnewgitcommits(self, refs=None):
         self.init_if_missing()
 
         # import heads and fetched tags as remote references
         todo = []
         done = set()
         convert_list = {}
 
diff --git a/hggit/hg2git.py b/hggit/hg2git.py
new file mode 100644
--- /dev/null
+++ b/hggit/hg2git.py
@@ -0,0 +1,205 @@
+# This file contains code dealing specifically with converting Mercurial
+# repositories to Git repositories. Code in this file is meant to be a generic
+# library and should be usable outside the context of hg-git or an hg command.
+
+import os
+import stat
+
+from dulwich.objects import Blob
+from dulwich.objects import S_IFGITLINK
+from dulwich.objects import TreeEntry
+from dulwich.objects import Tree
+
+from mercurial import error as hgerror
+from mercurial.node import nullrev
+
+from . import util
+
+class TreeTracker(object):
+    """Tracks Git tree objects across Mercurial revisions.
+
+    The purpose of this class is to facilitate Git tree export that is more
+    optimal than brute force. The tree calculation part of this class is
+    essentially a reimplementation of dulwich.index.commit_tree. However, since
+    our implementation reuses Tree instances and only recalculates SHA-1 when
+    things change, we are much more efficient.
+
+    Callers instantiate this class against a mercurial.localrepo instance. They
+    then associate the tracker with a specific changeset by calling
+    update_changeset(). That function emits Git objects that need to be
+    exported to a Git repository. Callers then typically obtain the
+    root_tree_sha and use that as part of assembling a Git commit.
+    """
+
+    def __init__(self, hg_repo):
+        self._hg = hg_repo
+        self._rev = nullrev
+        self._dirs = {}
+        self._blob_cache = {}
+
+    @property
+    def root_tree_sha(self):
+        return self._dirs[''].id
+
+    def update_changeset(self, ctx):
+        """Set the tree to track a new Mercurial changeset.
+
+        This is a generator of dulwich Git objects. Each returned object can be
+        added to a Git store via add_object(). Some objects may already exist
+        in the Git repository. Emitted objects are either Blob or Tree
+        instances.
+
+        Blobs are emitted only for added or modified files not yet in the blob
+        cache; every tracked tree is currently re-emitted on each call.
+        """
+        # In theory we should be able to look at changectx.files(). This is
+        # *much* faster. However, it may not be accurate, especially with older
+        # repositories, which may not record things like deleted files
+        # explicitly in the manifest (which is where files() gets its data).
+        # The only reliable way to get the full set of changes is by looking at
+        # the full manifest. And, the easy way to compare two manifests is
+        # localrepo.status().
+
+        # The other members of status are only relevant when looking at the
+        # working directory.
+        modified, added, removed = self._hg.status(self._rev, ctx.rev())[0:3]
+
+        for path in sorted(removed, key=len, reverse=True):
+            d = os.path.dirname(path)
+            tree = self._dirs.get(d, Tree())
+
+            del tree[os.path.basename(path)]
+
+            if not len(tree):
+                self._remove_tree(d)
+                continue
+
+            self._dirs[d] = tree
+
+        for path in sorted(set(modified) | set(added), key=len, reverse=True):
+            if path == '.hgsubstate':
+                self._handle_subrepos(ctx)
+                continue
+
+            if path == '.hgsub':
+                continue
+
+            d = os.path.dirname(path)
+            tree = self._dirs.get(d, Tree())
+
+            fctx = ctx[path]
+
+            entry, blob = TreeTracker.tree_entry(fctx, self._blob_cache)
+            if blob is not None:
+                yield blob
+
+            tree.add(*entry)
+            self._dirs[d] = tree
+
+        for obj in self._populate_tree_entries():
+            yield obj
+
+        self._rev = ctx.rev()
+
+    def _remove_tree(self, path):
+        try:
+            del self._dirs[path]
+        except KeyError:
+            return
+
+        # Now we traverse up to the parent and delete any references.
+        if path == '':
+            return
+
+        basename = os.path.basename(path)
+        parent = os.path.dirname(path)
+        while True:
+            tree = self._dirs.get(parent, None)
+
+            # No parent entry. Nothing to remove or update.
+            if tree is None:
+                return
+
+            try:
+                del tree[basename]
+            except KeyError:
+                return
+
+            if len(tree):
+                return
+
+            # The parent tree is empty. So, we can delete it.
+            del self._dirs[parent]
+
+            if parent == '':
+                return
+
+            basename = os.path.basename(parent)
+            parent = os.path.dirname(parent)
+
+    def _populate_tree_entries(self):
+        if '' not in self._dirs:
+            self._dirs[''] = Tree()
+
+        # Fill in missing directories.
+        for path in self._dirs.keys():
+            parent = os.path.dirname(path)
+
+            while parent != '':
+                parent_tree = self._dirs.get(parent, None)
+
+                if parent_tree is not None:
+                    break
+
+                self._dirs[parent] = Tree()
+                parent = os.path.dirname(parent)
+
+        # TODO only emit trees that have been modified.
+        for d in sorted(self._dirs.keys(), key=len, reverse=True):
+            tree = self._dirs[d]
+            yield tree
+
+            if d == '':
+                continue
+
+            parent_tree = self._dirs[os.path.dirname(d)]
+            parent_tree[os.path.basename(d)] = (stat.S_IFDIR, tree.id)
+
+    def _handle_subrepos(self, ctx):
+        substate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines())
+        sub = util.OrderedDict()
+
+        if '.hgsub' in ctx:
+            sub = util.parse_hgsub(ctx['.hgsub'].data().splitlines())
+
+        for path, sha in substate.iteritems():
+            # Ignore non-Git repositories keeping state in .hgsubstate.
+            if path in sub and not sub[path].startswith('[git]'):
+                continue
+
+            d = os.path.dirname(path)
+            tree = self._dirs.get(d, Tree())
+            tree.add(os.path.basename(path), S_IFGITLINK, sha)
+            self._dirs[d] = tree
+
+    @staticmethod
+    def tree_entry(fctx, blob_cache):
+        blob_id = blob_cache.get(fctx.filenode(), None)
+        blob = None
+
+        if blob_id is None:
+            blob = Blob.from_string(fctx.data())
+            blob_id = blob.id
+            blob_cache[fctx.filenode()] = blob_id
+
+        flags = fctx.flags()
+
+        if 'l' in flags:
+            mode = 0120000
+        elif 'x' in flags:
+            mode = 0100755
+        else:
+            mode = 0100644
+
+        return (TreeEntry(os.path.basename(fctx.path()), mode, blob_id), blob)
+