[PATCH 1 of 2] parsers: incrementally parse the revlog index in C

7 views
Skip to first unread message

Bryan O'Sullivan

unread,
Mar 22, 2012, 2:44:12 PM3/22/12
to m...@selenic.com, mercuri...@selenic.com
We only parse entries in a revlog index file when they are actually needed.

This makes a huge difference to performance on large revlogs when
performing numeric lookups, if only a few entries are used (a common
case). For instance, "hg tip" on a tree with 300,000 revs takes
0.3s before, 0.02 after.

For revlog-intensive operations (e.g. running "hg log" to completion),
the lazy approach is about 1% slower than the eager parse_index2.

diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -241,8 +241,39 @@
return ret;
}

-const char nullid[20];
-const int nullrev = -1;
+/*
+ * A list-like object that decodes the contents of a RevlogNG index
+ * file on demand. It has limited support for insert and delete at the
+ * last element before the end. The last entry is always a sentinel
+ * nullid.
+ */
+typedef struct {
+ PyObject_HEAD
+ /* Type-specific fields go here. */
+ PyObject *data;
+ const char **offsets; /* populated on demand */
+ Py_ssize_t raw_length; /* original number of elements */
+ Py_ssize_t length; /* current number of elements */
+ PyObject *added; /* populated on demand */
+ int inlined;
+} indexObject;
+
+static Py_ssize_t index_length(indexObject *self)
+{
+ if (self->added == NULL)
+ return self->length;
+ return self->length + PyList_GET_SIZE(self->added);
+}
+
+static PyObject *nullentry;
+
+static long inline_scan(indexObject *self, const char **offsets);
+
+#if LONG_MAX == 0x7fffffffL
+static const char *tuple_format = "Kiiiiiis#";
+#else
+static const char *tuple_format = "kiiiiiis#";
+#endif

/* RevlogNG format (all in big endian, data may be inlined):
* 6 bytes: offset
@@ -255,138 +286,374 @@
* 4 bytes: parent 2 revision
* 32 bytes: nodeid (only 20 bytes used)
*/
-static int _parse_index_ng(const char *data, int size, int inlined,
- PyObject *index)
+static PyObject *index_get(indexObject *self, Py_ssize_t pos)
{
- PyObject *entry;
- int n = 0, err;
+ uint32_t decode[8]; /* to enforce alignment with inline data */
uint64_t offset_flags;
int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
const char *c_node_id;
- const char *end = data + size;
- uint32_t decode[8]; /* to enforce alignment with inline data */
+ const char *data;
+ Py_ssize_t length = index_length(self);
+ PyObject *entry;

- while (data < end) {
- unsigned int step;
+ if (pos >= length) {
+ PyErr_SetString(PyExc_IndexError, "revlog index out of range");
+ return NULL;
+ }

- memcpy(decode, data, 32);
- offset_flags = ntohl(decode[1]);
- if (n == 0) /* mask out version number for the first entry */
- offset_flags &= 0xFFFF;
- else {
- uint32_t offset_high = ntohl(decode[0]);
- offset_flags |= ((uint64_t)offset_high) << 32;
+ if (pos == length - 1) {
+ Py_INCREF(nullentry);
+ return nullentry;
+ }
+
+ if (pos >= self->length - 1) {
+ PyObject *obj;
+ obj = PyList_GET_ITEM(self->added, pos - self->length + 1);
+ Py_INCREF(obj);
+ return obj;
+ }
+
+ if (self->inlined && pos > 0) {
+ if (self->offsets == NULL) {
+ self->offsets = malloc(self->raw_length *
+ sizeof(*self->offsets));
+ if (self->offsets == NULL)
+ return PyErr_NoMemory();
+ inline_scan(self, self->offsets);
}
+ data = self->offsets[pos];
+ } else
+ data = PyString_AS_STRING(self->data) + pos * 64;

- comp_len = ntohl(decode[2]);
- uncomp_len = ntohl(decode[3]);
- base_rev = ntohl(decode[4]);
- link_rev = ntohl(decode[5]);
- parent_1 = ntohl(decode[6]);
- parent_2 = ntohl(decode[7]);
- c_node_id = data + 32;
+ memcpy(decode, data, 8 * sizeof(uint32_t));

- entry = Py_BuildValue("Liiiiiis#", offset_flags, comp_len,
+ offset_flags = ntohl(decode[1]);
+ if (pos == 0) /* mask out version number for the first entry */
+ offset_flags &= 0xFFFF;
+ else {
+ uint32_t offset_high = ntohl(decode[0]);
+ offset_flags |= ((uint64_t)offset_high) << 32;
+ }
+
+ comp_len = ntohl(decode[2]);
+ uncomp_len = ntohl(decode[3]);
+ base_rev = ntohl(decode[4]);
+ link_rev = ntohl(decode[5]);
+ parent_1 = ntohl(decode[6]);
+ parent_2 = ntohl(decode[7]);
+ c_node_id = data + 32;
+
+ entry = Py_BuildValue(tuple_format, offset_flags, comp_len,
uncomp_len, base_rev, link_rev,
parent_1, parent_2, c_node_id, 20);

- if (!entry)
- return 0;
+ if (entry)
+ PyObject_GC_UnTrack(entry);

- PyObject_GC_UnTrack(entry); /* don't waste time with this */
+ return entry;
+}

- if (inlined) {
- err = PyList_Append(index, entry);
- Py_DECREF(entry);
- if (err)
- return 0;
- } else
- PyList_SET_ITEM(index, n, entry); /* steals reference */
+static PyObject *index_insert(indexObject *self, PyObject *args)
+{
+ PyObject *obj, *node;
+ long offset;
+ Py_ssize_t len;

- n++;
- step = 64 + (inlined ? comp_len : 0);
- if (data + step > end || data + step < data)
- break;
- data += step;
+ if (!PyArg_ParseTuple(args, "lO", &offset, &obj))
+ return NULL;
+
+ if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != 8) {
+ PyErr_SetString(PyExc_ValueError, "8-tuple required");
+ return NULL;
}
- if (data != end) {
- if (!PyErr_Occurred())
- PyErr_SetString(PyExc_ValueError, "corrupt index file");
+
+ node = PyTuple_GET_ITEM(obj, 7);
+ if (!PyString_Check(node) || PyString_GET_SIZE(node) != 20) {
+ PyErr_SetString(PyExc_ValueError,
+ "20-byte hash required as last element");
+ return NULL;
+ }
+
+ len = index_length(self);
+
+ if (offset < 0)
+ offset += len;
+
+ if (offset != len - 1) {
+ PyErr_SetString(PyExc_IndexError,
+ "insert only supported at index -1");
+ return NULL;
+ }
+
+ if (self->added == NULL) {
+ self->added = PyList_New(0);
+ if (self->added == NULL)
+ return NULL;
+ }
+
+ if (PyList_Append(self->added, obj) == -1)
+ return NULL;
+
+ Py_RETURN_NONE;
+}
+
+static int
+index_ass_subscript(indexObject* self, PyObject* item, PyObject* value)
+{
+ Py_ssize_t start, stop, step, slicelength;
+ Py_ssize_t length = index_length(self);
+
+ if (!PySlice_Check(item) || value != NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "revlog index only supports slice deletion");
+ return -1;
+ }
+
+ if (PySlice_GetIndicesEx((PySliceObject*)item, length,
+ &start, &stop, &step, &slicelength) < 0)
+ return -1;
+
+ if (slicelength <= 0)
+ return 0;
+
+ if ((step < 0 && start < stop) || (step > 0 && start > stop))
+ stop = start;
+
+ if (step < 0) {
+ stop = start + 1;
+ start = stop + step*(slicelength - 1) - 1;
+ step = -step;
+ }
+
+ if (step != 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "revlog index delete requires step size of 1");
+ return -1;
+ }
+
+ if (stop != length - 1) {
+ PyErr_SetString(PyExc_IndexError,
+ "revlog index deletion indices are invalid");
+ return -1;
+ }
+
+ if (start < self->length) {
+ self->length = start + 1;
+ if (self->added) {
+ Py_DECREF(self->added);
+ self->added = NULL;
+ }
return 0;
}

- /* create the magic nullid entry in the index at [-1] */
- entry = Py_BuildValue("Liiiiiis#", (uint64_t)0, 0, 0, -1, -1, -1, -1, nullid, 20);
-
- if (!entry)
- return 0;
-
- PyObject_GC_UnTrack(entry); /* don't waste time with this */
-
- if (inlined) {
- err = PyList_Append(index, entry);
- Py_DECREF(entry);
- if (err)
- return 0;
- } else
- PyList_SET_ITEM(index, n, entry); /* steals reference */
-
- return 1;
+ return PyList_SetSlice(self->added, start - self->length + 1,
+ PyList_GET_SIZE(self->added),
+ NULL);
}

-/* This function parses a index file and returns a Python tuple of the
- * following format: (index, cache)
+static long inline_scan(indexObject *self, const char **offsets)
+{
+ const char *data = PyString_AS_STRING(self->data);
+ const char *end = data + PyString_GET_SIZE(self->data);
+ const long hdrsize = 64;
+ long incr = hdrsize;
+ Py_ssize_t len = 0;
+
+ while (data + hdrsize <= end) {
+ uint32_t comp_len;
+ const char *old_data;
+ /* 3rd element of header is length of compressed inline data */
+ memcpy(&comp_len, data + 8, sizeof(uint32_t));
+ incr = hdrsize + ntohl(comp_len);
+ if (incr < hdrsize)
+ break;
+ if (offsets)
+ offsets[len] = data;
+ len++;
+ old_data = data;
+ data += incr;
+ if (data <= old_data)
+ break;
+ }
+
+ if (data != end && data + hdrsize != end) {
+ if (!PyErr_Occurred())
+ PyErr_SetString(PyExc_ValueError, "corrupt index file");
+ return -1;
+ }
+
+ return len;
+}
+
+static int index_real_init(indexObject *self, const char *data, int size,
+ PyObject *inlined_obj, PyObject *data_obj)
+{
+ self->inlined = inlined_obj && PyObject_IsTrue(inlined_obj);
+ self->data = data_obj;
+
+ self->added = NULL;
+ self->offsets = NULL;
+ Py_INCREF(self->data);
+
+ if (self->inlined) {
+ long len = inline_scan(self, NULL);
+ if (len == -1)
+ goto bail;
+ self->raw_length = len;
+ self->length = len + 1;
+ } else {
+ if (size % 64) {
+ PyErr_SetString(PyExc_ValueError, "corrupt index file");
+ goto bail;
+ }
+ self->raw_length = size / 64;
+ self->length = self->raw_length + 1;
+ }
+
+ return 0;
+bail:
+ return -1;
+}
+
+static int
+index_init(indexObject *self, PyObject *args, PyObject *kwds)
+{
+ const char *data;
+ int size;
+ PyObject *inlined_obj;
+
+ if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj))
+ return -1;
+
+ return index_real_init(self, data, size, inlined_obj,
+ PyTuple_GET_ITEM(args, 0));
+}
+
+static void
+index_dealloc(indexObject *self)
+{
+ Py_DECREF(self->data);
+ Py_XDECREF(self->added);
+ free(self->offsets);
+ PyObject_Del(self);
+}
+
+static PySequenceMethods index_sequence_methods = {
+ (lenfunc)index_length, /* sq_length */
+ 0, /* sq_concat */
+ 0, /* sq_repeat */
+ (ssizeargfunc)index_get, /* sq_item */
+};
+
+static PyMappingMethods index_mapping_methods = {
+ (lenfunc)index_length, /* mp_length */
+ NULL, /* mp_subscript */
+ (objobjargproc)index_ass_subscript, /* mp_ass_subscript */
+};
+
+static PyMethodDef index_methods[] = {
+ {"insert", (PyCFunction)index_insert, METH_VARARGS,
+ "insert an index entry"},
+ {NULL} /* Sentinel */
+};
+
+static PyTypeObject indexType = {
+ PyObject_HEAD_INIT(NULL)
+ 0, /*ob_size*/
+ "index.index", /*tp_name*/
+ sizeof(indexObject), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ (destructor)index_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ &index_sequence_methods, /*tp_as_sequence*/
+ &index_mapping_methods, /*tp_as_mapping*/
+ 0, /*tp_hash */
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ 0, /*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ "revlog index", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ index_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)index_init, /* tp_init */
+ 0, /* tp_alloc */
+ PyType_GenericNew, /* tp_new */
+};
+
+/*
+ * returns a tuple of the form (index, None, cache) with elements as
+ * follows:
*
- * index: a list of tuples containing the RevlogNG records
- * cache: if data is inlined, a tuple (index_file_content, 0) else None
+ * index: an index object that lazily parses the RevlogNG records
+ * cache: if data is inlined, a tuple (index_file_content, 0), else None
+ *
+ * added complications are for backwards compatibility
*/
static PyObject *parse_index2(PyObject *self, PyObject *args)
{
const char *data;
- int size, inlined;
- PyObject *rval = NULL, *index = NULL, *cache = NULL;
- PyObject *data_obj = NULL, *inlined_obj;
+ int size, ret;
+ PyObject *inlined_obj, *tuple = NULL, *cache = NULL, *none = NULL;
+ indexObject *idx;

if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj))
return NULL;
- inlined = inlined_obj && PyObject_IsTrue(inlined_obj);

- /* If no data is inlined, we know the size of the index list in
- * advance: size divided by the size of one revlog record (64 bytes)
- * plus one for nullid */
- index = inlined ? PyList_New(0) : PyList_New(size / 64 + 1);
- if (!index)
- goto quit;
+ idx = PyObject_New(indexObject, &indexType);

- /* set up the cache return value */
- if (inlined) {
- /* Note that the reference to data_obj is only borrowed */
- data_obj = PyTuple_GET_ITEM(args, 0);
- cache = Py_BuildValue("iO", 0, data_obj);
- if (!cache)
- goto quit;
+ if (idx == NULL)
+ goto bail;
+
+ ret = index_real_init(idx, data, size, inlined_obj,
+ PyTuple_GET_ITEM(args, 0));
+ if (ret)
+ goto bail;
+
+ if (idx->inlined) {
+ Py_INCREF(idx->data);
+ cache = Py_BuildValue("iO", 0, idx->data);
+ if (cache == NULL)
+ goto bail;
} else {
cache = Py_None;
- Py_INCREF(Py_None);
+ Py_INCREF(cache);
}

- /* actually populate the index with data */
- if (!_parse_index_ng(data, size, inlined, index))
- goto quit;
+ none = Py_None;
+ Py_INCREF(none);

- rval = Py_BuildValue("NN", index, cache);
- if (!rval)
- goto quit;
- return rval;
+ tuple = Py_BuildValue("NNN", idx, none, cache);
+ if (!tuple)
+ goto bail;
+ return tuple;

-quit:
- Py_XDECREF(index);
+bail:
+ Py_XDECREF(idx);
Py_XDECREF(cache);
- Py_XDECREF(rval);
+ Py_XDECREF(none);
+ Py_XDECREF(tuple);
return NULL;
}

-
static char parsers_doc[] = "Efficient content parsing.";

static PyMethodDef methods[] = {
@@ -396,6 +663,22 @@
{NULL, NULL}
};

+static void module_init(PyObject *mod)
+{
+ static const char nullid[20];
+
+ if (PyType_Ready(&indexType) < 0)
+ return;
+ Py_INCREF(&indexType);
+
+ PyModule_AddObject(mod, "index", (PyObject *)&indexType);
+
+ nullentry = Py_BuildValue("iiiiiiis#", 0, 0, 0,
+ -1, -1, -1, -1, nullid, 20);
+ if (nullentry)
+ PyObject_GC_UnTrack(nullentry);
+}
+
#ifdef IS_PY3K
static struct PyModuleDef parsers_module = {
PyModuleDef_HEAD_INIT,
@@ -407,12 +690,15 @@

PyMODINIT_FUNC PyInit_parsers(void)
{
- return PyModule_Create(&parsers_module);
+ PyObject *mod = PyModule_Create(&parsers_module);
+ module_init(mod);
+ return mod;
}
#else
PyMODINIT_FUNC initparsers(void)
{
- Py_InitModule3("parsers", methods, parsers_doc);
+ PyObject *mod = Py_InitModule3("parsers", methods, parsers_doc);
+ module_init(mod);
}
#endif

diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -171,10 +171,7 @@
def __init__(self):
self.size = struct.calcsize(indexformatng)

- def parseindex(self, data, inline):
- # call the C implementation to parse the index data
- index, cache = parsers.parse_index2(data, inline)
- return index, None, cache
+ parseindex = staticmethod(parsers.parse_index2)

def packentry(self, entry, node, version, rev):
p = _pack(indexformatng, *entry)
diff --git a/tests/test-parseindex2.py b/tests/test-parseindex2.py
--- a/tests/test-parseindex2.py
+++ b/tests/test-parseindex2.py
@@ -52,7 +52,6 @@

return index, cache

-
data_inlined = '\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x01\x8c' \
'\x00\x00\x04\x07\x00\x00\x00\x00\x00\x00\x15\x15\xff\xff\xff' \
'\xff\xff\xff\xff\xff\xebG\x97\xb7\x1fB\x04\xcf\x13V\x81\tw\x1b' \
@@ -94,13 +93,16 @@
'\xb6\r\x98B\xcb\x07\xbd`\x8f\x92\xd9\xc4\x84\xbdK\x00\x00\x00' \
'\x00\x00\x00\x00\x00\x00\x00\x00\x00'

+def parse_index2(data, inline):
+ index, alwaysnone, chunkcache = parsers.parse_index2(data, inline)
+ return list(index), chunkcache
+
def runtest() :
-
py_res_1 = py_parseindex(data_inlined, True)
- c_res_1 = parsers.parse_index2(data_inlined, True)
+ c_res_1 = parse_index2(data_inlined, True)

py_res_2 = py_parseindex(data_non_inlined, False)
- c_res_2 = parsers.parse_index2(data_non_inlined, False)
+ c_res_2 = parse_index2(data_non_inlined, False)

if py_res_1 != c_res_1:
print "Parse index result (with inlined data) differs!"
_______________________________________________
Mercurial-devel mailing list
Mercuri...@selenic.com
http://selenic.com/mailman/listinfo/mercurial-devel

Bryan O'Sullivan

unread,
Mar 22, 2012, 2:44:13 PM3/22/12
to m...@selenic.com, mercuri...@selenic.com
This vastly speeds up node->rev lookups: "hg --time log" of rev
1000 on a linux-2.6 repo improves from 0.27 seconds to 0.08.

The only part that's not yet done is having parsers.index_getitem
throw a LookupError instead of a KeyError on lookup failure.

(Speeding up prefix-based node->rev lookup will come in a patch
that I haven't yet written, so don't expect that to be fast yet.)

diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c

@@ -242,10 +242,27 @@
}

/*
- * A list-like object that decodes the contents of a RevlogNG index
- * file on demand. It has limited support for insert and delete at the
- * last element before the end. The last entry is always a sentinel
- * nullid.
+ * A base-16 trie for fast node->rev mapping.
+ *
+ * Positive value is index of the next node in the trie
+ * Negative value is a leaf: -(rev + 1)
+ * Zero is empty
+ */
+typedef struct {
+ int children[16];
+} nodetree;
+
+/*
+ * This class has two behaviours.
+ *
+ * When used in a list-like way (with integer keys), we decode an
+ * entry in a RevlogNG index file on demand. Our last entry is a
+ * sentinel, always a nullid. We have limited support for
+ * integer-keyed insert and delete, only at elements right before the
+ * sentinel.
+ *
+ * With string keys, we lazily perform a reverse mapping from node to
+ * rev, using a base-16 trie.
*/
typedef struct {
PyObject_HEAD
@@ -255,10 +272,16 @@


Py_ssize_t raw_length; /* original number of elements */

Py_ssize_t length; /* current number of elements */

PyObject *added; /* populated on demand */

- int inlined;
+ nodetree *nt;
+ int ntlength; /* # nodes in use */
+ int ntcapacity; /* # nodes allocated */
+ int ntdepth; /* maximum depth of tree */
+ int ntsplits; /* # splits performed */
+ int ntrev; /* last rev scanned */
+ int inlined; /* bool: index contains inline data */
} indexObject;

-static Py_ssize_t index_length(indexObject *self)
+static Py_ssize_t index_length(const indexObject *self)
{


if (self->added == NULL)

return self->length;
@@ -266,6 +289,7 @@
}

static PyObject *nullentry;
+static const char nullid[20];



static long inline_scan(indexObject *self, const char **offsets);

@@ -275,7 +299,27 @@


static const char *tuple_format = "kiiiiiis#";

#endif

-/* RevlogNG format (all in big endian, data may be inlined):
+/*
+ * Return a pointer to the beginning of a RevlogNG record.
+ */
+static const char *index_deref(indexObject *self, Py_ssize_t pos)


+{
+ if (self->inlined && pos > 0) {
+ if (self->offsets == NULL) {
+ self->offsets = malloc(self->raw_length *
+ sizeof(*self->offsets));
+ if (self->offsets == NULL)

+ return (const char *) PyErr_NoMemory();


+ inline_scan(self, self->offsets);
+ }

+ return self->offsets[pos];
+ }
+
+ return PyString_AS_STRING(self->data) + pos * 64;
+}
+
+/*
+ * RevlogNG format (all in big endian, data may be inlined):
* 6 bytes: offset
* 2 bytes: flags
* 4 bytes: compressed length
@@ -296,7 +340,10 @@
Py_ssize_t length = index_length(self);
PyObject *entry;

- if (pos >= length) {
+ if (pos < 0)
+ pos += length;
+
+ if (pos < 0 || pos >= length) {


PyErr_SetString(PyExc_IndexError, "revlog index out of range");

return NULL;
}
@@ -313,17 +360,9 @@
return obj;
}

- if (self->inlined && pos > 0) {
- if (self->offsets == NULL) {
- self->offsets = malloc(self->raw_length *
- sizeof(*self->offsets));
- if (self->offsets == NULL)
- return PyErr_NoMemory();
- inline_scan(self, self->offsets);
- }
- data = self->offsets[pos];
- } else
- data = PyString_AS_STRING(self->data) + pos * 64;
+ data = index_deref(self, pos);
+ if (data == NULL)
+ return NULL;


memcpy(decode, data, 8 * sizeof(uint32_t));

@@ -353,26 +392,60 @@
return entry;
}

+/*
+ * Return the 20-byte SHA of the node corresponding to the given rev.
+ */
+static const char *index_node(indexObject *self, Py_ssize_t pos)
+{


+ Py_ssize_t length = index_length(self);

+ const char *data;
+


+ if (pos == length - 1)

+ return nullid;
+
+ if (pos >= length)
+ return NULL;
+


+ if (pos >= self->length - 1) {

+ PyObject *tuple, *str;
+ tuple = PyList_GET_ITEM(self->added, pos - self->length + 1);
+ str = PyTuple_GetItem(tuple, 7);
+ return str ? PyString_AS_STRING(str) : NULL;
+ }
+
+ data = index_deref(self, pos);
+ return data ? data + 32 : NULL;
+}
+
+static int nt_insert(indexObject *self, const char *node, int rev);
+
+static int node_check(PyObject *obj, char **node, Py_ssize_t *nodelen)
+{
+ if (PyString_AsStringAndSize(obj, node, nodelen) == -1)
+ return -1;
+ if (*nodelen == 20)
+ return 0;
+ PyErr_SetString(PyExc_ValueError, "20-byte hash required");


+ return -1;
+}
+

static PyObject *index_insert(indexObject *self, PyObject *args)

{
- PyObject *obj, *node;
+ PyObject *obj;
+ char *node;
long offset;
- Py_ssize_t len;
+ Py_ssize_t len, nodelen;



if (!PyArg_ParseTuple(args, "lO", &offset, &obj))

return NULL;



if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != 8) {

- PyErr_SetString(PyExc_ValueError, "8-tuple required");
+ PyErr_SetString(PyExc_TypeError, "8-tuple required");
return NULL;
}

- node = PyTuple_GET_ITEM(obj, 7);
- if (!PyString_Check(node) || PyString_GET_SIZE(node) != 20) {
- PyErr_SetString(PyExc_ValueError,
- "20-byte hash required as last element");
+ if (node_check(PyTuple_GET_ITEM(obj, 7), &node, &nodelen) == -1)
return NULL;
- }

len = index_length(self);

@@ -385,6 +458,12 @@
return NULL;
}

+ if (offset > INT_MAX) {
+ PyErr_SetString(PyExc_ValueError,
+ "currently only 2**31 revs supported");


+ return NULL;
+ }
+

if (self->added == NULL) {
self->added = PyList_New(0);


if (self->added == NULL)

@@ -394,21 +473,287 @@


if (PyList_Append(self->added, obj) == -1)

return NULL;

+ if (self->nt)
+ nt_insert(self, node, (int) offset);
+
Py_RETURN_NONE;
}

+static PyObject *index_stats(indexObject *self)
+{
+ PyObject *obj = PyDict_New();
+
+ if (obj == NULL)
+ return NULL;
+
+#define istat(__n, __d) \
+ if (PyDict_SetItemString(obj, __d, PyInt_FromLong(self->__n)) == -1) goto bail;
+
+ if (self->added) {
+ Py_ssize_t len = PyList_GET_SIZE(self->added);
+ if (PyDict_SetItemString(obj, "index entries added",
+ PyInt_FromLong(len)) == -1)
+ goto bail;
+ }
+
+ if (self->raw_length != self->length - 1)
+ istat(raw_length, "revs on disk");
+ istat(length, "revs in memory");
+ istat(ntlength, "node trie count");
+ istat(ntcapacity, "node trie capacity");
+ istat(ntdepth, "node trie depth");
+ istat(ntsplits, "node trie splits");
+ istat(ntrev, "node trie last rev scanned");
+
+#undef istat
+
+ return obj;
+
+bail:
+ Py_XDECREF(obj);


+ return NULL;
+}
+

+static inline int nt_level(const char *node, int level)
+{
+ int v = node[level>>1];
+ if (!(level & 1))
+ v >>= 4;
+ return v & 0xf;
+}
+
+static int nt_find(indexObject *self, const char *node, Py_ssize_t nodelen)
+{
+ int level, off;
+
+ if (nodelen == 20 && node[0] == '\0' && memcmp(node, nullid, 20) == 0)
+ return -1;
+
+ if (self->nt == NULL)
+ return -2;
+
+ for (level = off = 0; level < nodelen; level++) {
+ int k = nt_level(node, level);
+ nodetree *n = &self->nt[off];
+ int v = n->children[k];
+
+ if (v < 0) {
+ const char *n;
+ v = -v - 1;
+ n = index_node(self, v);
+ if (n == NULL)
+ return -2;
+ return memcmp(node, n, nodelen > 20 ? 20 : nodelen)
+ ? -2 : v;
+ }
+ if (v == 0)
+ return -2;
+ off = v;
+ }
+ return -2;
+}
+
+static int nt_new(indexObject *self)
+{
+ if (self->ntlength == self->ntcapacity) {
+ self->ntcapacity *= 2;
+ self->nt = realloc(self->nt,
+ self->ntcapacity * sizeof(nodetree));
+ if (self->nt == NULL) {
+ PyErr_SetString(PyExc_MemoryError, "out of memory");
+ return -1;
+ }
+ memset(&self->nt[self->ntlength], 0,
+ sizeof(nodetree) * (self->ntcapacity - self->ntlength));
+ }
+ return self->ntlength++;
+}
+
+static int nt_insert(indexObject *self, const char *node, int rev)
+{
+ int level = 0;
+ int off = 0;
+
+ while (level < 20) {
+ int k = nt_level(node, level);
+ nodetree *n;
+ int v;
+
+ n = &self->nt[off];
+ v = n->children[k];
+
+ if (v == 0) {
+ n->children[k] = -rev - 1;
+ return 0;
+ }
+ if (v < 0) {
+ const char *oldnode = index_node(self, -v - 1);
+ int noff;
+
+ if (oldnode == NULL) {
+ /* node has been deleted from array */
+ n->children[k] = -rev - 1;
+ return 0;
+ }
+ if (memcmp(oldnode, node, 20) == 0)
+ return 0;
+ noff = nt_new(self);
+ if (noff == -1)
+ return -1;
+ /* self->nt may have been changed by realloc */
+ self->nt[off].children[k] = noff;
+ off = noff;
+ n = &self->nt[off];
+ n->children[nt_level(oldnode, ++level)] = v;
+ if (level > self->ntdepth)
+ self->ntdepth = level;
+ self->ntsplits += 1;
+ } else {
+ level += 1;
+ off = v;
+ }
+ }
+


+ return -1;
+}
+

+/*
+ * Return values:
+ *
+ * -3: error (exception set)
+ * -2: not found (no exception set)
+ * rest: valid rev
+ */
+static int index_find_node(indexObject *self,
+ const char *node, Py_ssize_t nodelen)
+{
+ int rev;
+
+ rev = nt_find(self, node, nodelen);
+ if (rev >= -1)
+ return rev;
+
+ if (self->nt == NULL) {
+ self->ntcapacity = 4096;
+ self->nt = calloc(self->ntcapacity, sizeof(nodetree));
+ if (self->nt == NULL) {
+ PyErr_SetString(PyExc_MemoryError, "out of memory");
+ return -3;
+ }
+ self->ntrev = (int) index_length(self) - 1;
+ self->ntlength = 1;
+ }
+
+ for (rev = self->ntrev - 1; rev >= 0; rev--) {
+ const char *n = index_node(self, rev);
+ if (n == NULL)
+ return -2;
+ if (nt_insert(self, n, rev) == -1)
+ return -3;
+ if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0)
+ break;
+ }
+
+ self->ntrev = rev;
+
+ if (rev >= 0)
+ return rev;
+ return -2;
+}
+
+static PyObject *
+index_getitem(indexObject *self, PyObject *value)
+{
+ char *node;
+ Py_ssize_t nodelen;
+ int rev;
+
+ if (PyInt_Check(value))
+ return index_get(self, PyInt_AS_LONG(value));
+
+ if (PyString_AsStringAndSize(value, &node, &nodelen) == -1)
+ return NULL;
+ rev = index_find_node(self, node, nodelen);
+ if (rev >= -1)
+ return PyInt_FromLong(rev);
+ if (rev == -2)
+ /* XXX should be a LookupError? */
+ PyErr_SetString(PyExc_KeyError, "node not found");


+ return NULL;
+}
+

+static PyObject *index_m_get(indexObject *self, PyObject *args)
+{
+ char *node;
+ int nodelen, rev;
+
+ if (!PyArg_ParseTuple(args, "s#", &node, &nodelen))
+ return NULL;
+
+ rev = index_find_node(self, node, nodelen);
+ if (rev == -3)
+ return NULL;
+ if (rev == -2)
+ Py_RETURN_NONE;
+ return PyInt_FromLong(rev);
+}
+
+static int index_contains(indexObject *self, PyObject *value)
+{
+ char *node;
+ Py_ssize_t nodelen;
+
+ if (PyInt_Check(value)) {
+ long rev = PyInt_AS_LONG(value);
+ return rev >= -1 && rev < index_length(self);
+ }
+
+ if (!PyString_Check(value))
+ return 0;
+
+ node = PyString_AS_STRING(value);
+ nodelen = PyString_GET_SIZE(value);
+
+ switch (index_find_node(self, node, nodelen)) {
+ case -3:
+ return -1;
+ case -2:
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+/*
+ * Invalidate any trie entries introduced by added revs.
+ */
+static void nt_invalidate_added(indexObject *self, Py_ssize_t start)
+{
+ Py_ssize_t i, len = PyList_GET_SIZE(self->added);
+
+ for (i = start; i < len; i++) {
+ PyObject *tuple = PyList_GET_ITEM(self->added, i);
+ PyObject *node = PyTuple_GET_ITEM(tuple, 7);
+
+ nt_insert(self, PyString_AS_STRING(node), -1);
+ }
+
+ if (start == 0) {


+ Py_DECREF(self->added);
+ self->added = NULL;
+ }

+}
+
+/*
+ * Delete a numeric range of revs, which must be at the end of the
+ * range, but exclude the sentinel nullid entry.
+ */
static int
-index_ass_subscript(indexObject* self, PyObject* item, PyObject* value)
+index_slice_del(indexObject* self, PyObject* item)
{


Py_ssize_t start, stop, step, slicelength;

Py_ssize_t length = index_length(self);

- if (!PySlice_Check(item) || value != NULL) {
- PyErr_SetString(PyExc_TypeError,
- "revlog index only supports slice deletion");
- return -1;
- }
-
if (PySlice_GetIndicesEx((PySliceObject*)item, length,


&start, &stop, &step, &slicelength) < 0)

return -1;
@@ -437,20 +782,67 @@
return -1;
}

- if (start < self->length) {
+ if (start < self->length - 1) {


self->length = start + 1;

- if (self->added) {
- Py_DECREF(self->added);
- self->added = NULL;
+
+ if (self->nt) {
+ Py_ssize_t i;
+
+ for (i = start + 1; i < self->length - 1; i++) {
+ const char *node = index_node(self, i);
+
+ if (node)
+ nt_insert(self, node, -1);
+ }
+ if (self->added)
+ nt_invalidate_added(self, 0);
}
return 0;
}

- return PyList_SetSlice(self->added, start - self->length + 1,
- PyList_GET_SIZE(self->added),
- NULL);
+ if (self->nt)
+ nt_invalidate_added(self, start - self->length + 1);
+ return self->added
+ ? PyList_SetSlice(self->added, start - self->length + 1,
+ PyList_GET_SIZE(self->added), NULL)
+ : 0;
}

+/*
+ * Supported ops:
+ *
+ * slice deletion
+ * string assignment (extend node->rev mapping)
+ * string deletion (shrink node->rev mapping)
+ */


+static int
+index_ass_subscript(indexObject* self, PyObject* item, PyObject* value)
+{

+ char *node;
+ Py_ssize_t nodelen;
+ long rev;
+
+ if (PySlice_Check(item) && value == NULL)
+ return index_slice_del(self, item);
+
+ if (node_check(item, &node, &nodelen) == -1)
+ return -1;
+
+ if (value == NULL)
+ return self->nt ? nt_insert(self, node, -1) : 0;
+ rev = PyInt_AsLong(value);
+ if (rev > INT_MAX || rev < 0) {
+ if (!PyErr_Occurred())
+ PyErr_SetString(PyExc_ValueError, "rev out of range");
+ return -1;
+ }
+ return nt_insert(self, node, (int) rev);
+}
+
+/*
+ * Find all RevlogNG entries in an index that has inline data. Update
+ * the optional "offsets" table with those entries.
+ */


static long inline_scan(indexObject *self, const char **offsets)

{


const char *data = PyString_AS_STRING(self->data);

@@ -493,6 +885,10 @@

self->added = NULL;
self->offsets = NULL;
+ self->nt = NULL;
+ self->ntlength = self->ntcapacity = 0;
+ self->ntdepth = self->ntsplits = 0;
+ self->ntrev = -1;
Py_INCREF(self->data);

if (self->inlined) {
@@ -535,6 +931,7 @@
Py_DECREF(self->data);
Py_XDECREF(self->added);
free(self->offsets);
+ free(self->nt);
PyObject_Del(self);
}

@@ -543,24 +940,32 @@
0, /* sq_concat */
0, /* sq_repeat */
(ssizeargfunc)index_get, /* sq_item */
+ 0, /* sq_slice */
+ 0, /* sq_ass_item */
+ 0, /* sq_ass_slice */
+ (objobjproc)index_contains, /* sq_contains */
};

static PyMappingMethods index_mapping_methods = {
(lenfunc)index_length, /* mp_length */
- NULL, /* mp_subscript */
+ (binaryfunc)index_getitem, /* mp_subscript */
(objobjargproc)index_ass_subscript, /* mp_ass_subscript */
};

static PyMethodDef index_methods[] = {
+ {"get", (PyCFunction)index_m_get, METH_VARARGS,
+ "get an index entry"},
{"insert", (PyCFunction)index_insert, METH_VARARGS,


"insert an index entry"},

+ {"stats", (PyCFunction)index_stats, METH_NOARGS,
+ "stats for the index"},
{NULL} /* Sentinel */
};

static PyTypeObject indexType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
- "index.index", /*tp_name*/
+ "parsers.index", /*tp_name*/
sizeof(indexObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)index_dealloc, /*tp_dealloc*/
@@ -600,10 +1005,10 @@
};

/*
- * returns a tuple of the form (index, None, cache) with elements as
+ * returns a tuple of the form (index, index, cache) with elements as
* follows:
*
- * index: an index object that lazily parses the RevlogNG records
+ * index: an index object that lazily parses RevlogNG records


* cache: if data is inlined, a tuple (index_file_content, 0), else None

*


* added complications are for backwards compatibility

@@ -638,10 +1043,9 @@
Py_INCREF(cache);
}

- none = Py_None;
- Py_INCREF(none);
+ Py_INCREF(idx);

- tuple = Py_BuildValue("NNN", idx, none, cache);
+ tuple = Py_BuildValue("NNN", idx, idx, cache);
if (!tuple)
goto bail;
return tuple;
@@ -665,8 +1069,6 @@

static void module_init(PyObject *mod)
{
- static const char nullid[20];
-
if (PyType_Ready(&indexType) < 0)
return;
Py_INCREF(&indexType);
@@ -701,4 +1103,3 @@
module_init(mod);
}
#endif
-


diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py

@@ -219,7 +219,7 @@
self._chunkcache = (0, '')
self.index = []
self._pcache = {}
- self._nodecache = {nullid: nullrev}
+ #self.__nodecache = {nullid: nullrev}
self._nodepos = None

v = REVLOG_DEFAULT_VERSION
@@ -287,9 +287,20 @@

def rev(self, node):
try:
- return self._nodecache[node]
+ r = self._nodecache[node]
except KeyError:
- n = self._nodecache
+ raise LookupError(node, self.indexfile, _('no node?'))
+ if False:
+ r1 = self.rev_(node)
+ if r != r1:
+ raise Exception('node %s: %s != %s' % (node.encode('hex'), r, r1))
+ return r
+
+ def rev_(self, node):
+ try:
+ return self.__nodecache[node]
+ except KeyError:
+ n = self.__nodecache
i = self.index
p = self._nodepos
if p is None:
@@ -1057,6 +1068,7 @@
base, link, p1r, p2r, node)
self.index.insert(-1, e)
self.nodemap[node] = curr
+ #self.__nodecache[node] = curr

entry = self._io.packentry(e, self.node, self.version, curr)
if not self._inline:
@@ -1230,6 +1242,7 @@
self._chunkclear()
for x in xrange(rev, len(self)):
del self.nodemap[self.node(x)]
+ #del self.__nodecache[self.node(x)]

del self.index[rev:-1]



diff --git a/tests/test-parseindex2.py b/tests/test-parseindex2.py
--- a/tests/test-parseindex2.py
+++ b/tests/test-parseindex2.py

@@ -110,6 +110,17 @@
if py_res_2 != c_res_2:
print "Parse index result (no inlined data) differs!"

+ ix = parsers.parse_index2(data_inlined, True)[0]
+ for i,r in enumerate(ix):
+ try:
+ if r[7] == nullid:
+ i = -1
+ if ix[r[7]] != i:
+ print 'Reverse lookup inconsistent for %r' % r[7].encode('hex')
+ except:
+ print 'Reverse lookup failed for %r' % r[7].encode('hex')
+ raise
+
print "done"

runtest()

Reply all
Reply to author
Forward
0 new messages