Tests on freebsd with zfs failed with OverflowErrors because zfs was
creating a device (dev_t) with the high bit set (which it is known
to), and it turns out that python hasn't handled that consistently.
Recent python versions handle it as unsigned, ignoring the platform's
definition in nearly all cases (e.g. on macos it's signed). Though it
looks like os.makedev() is still broken.
Python improved the "high bit set" dev_t handling in this
commit (switching, as mentioned, to try to treat them as unsigned):
7111d9605f9db7aa0b095bb8ece7ccc0b8115c3f
gh-89928: Fix integer conversion of device numbers (GH-31794)
https://github.com/python/cpython/pull/31794
And those changes have been backported to an increasing number of
python minor releases. (Whether or not python attempts to treat stat
fields other than dev_t as unsigned consistently has not been
investigated.)
I believe we were exposed to these errors when I removed our custom
stat function in main:
ae032d639582053a789230e9cc91c5e40dff9cf7
Drop our custom stat functions in favor of python's
To date, bup has also required dev_t (and some other stat fields) to
be unsigned and no more than eight bytes, via use of Q in the
INDEX_SIG pack format for index entries.
To address the dev_t issue, and to make sure we handle all the other
stat fields consistently over time and across python versions, bring
back and enhance our custom stat functions, and wrap python's
os.mknod() to make sure it only sees the unsigned device values it
expects (for now we don't add our own mknod because it's a bit more
complex, and python's doesn't appear to have changed behavior over the
periods we investigated).
Have all of our stat operations work with the actual platform
ranges (i.e. preserve unsigned/signed values), and make sure that the
index doesn't lose the stat field sign information by providing the
platform size and sign via _helpers.c_type_signed_size.
Finally, so that we don't mishandle the index (even across hosts, say
via moving a bup repo from one machine to another), add the entry
signature to the index's header and require reindexing if the current
platform's signature doesn't match the one in the index. Continue to
support the previous index format (trivially) so that if the entry
signature hasn't actually changed, you don't have to reindex just for
the new format.
Rename INDEX_SIG to ENTRY_SIG since that's what it is, and rename
INDEX_HDR to INDEX_SIG since that's what *it* is.
Add bup_int_from_py to pyutil and move the (u)int functions above
bup_ulong_from_py.
Thanks to Johannes Berg for a good bit of help figuring all this out.
Signed-off-by: Rob Browning <
r...@defaultvalue.org>
Tested-by: Rob Browning <
r...@defaultvalue.org>
---
configure | 24 ++++
lib/bup/_helpers.c | 339 +++++++++++++++++++++++++++++++++++++++++---
lib/bup/cmd/midx.py | 2 +-
lib/bup/git.py | 2 +-
lib/bup/index.py | 126 +++++++++++-----
lib/bup/xstat.py | 76 ++++++----
src/bup/pyutil.c | 43 ++++--
src/bup/pyutil.h | 1 +
8 files changed, 523 insertions(+), 90 deletions(-)
diff --git a/configure b/configure
index f202e7a4..85b9349a 100755
--- a/configure
+++ b/configure
@@ -360,6 +360,30 @@ if test "${c_define[BUP_HAVE_READLINE]:-}"; then
fi
fi
+info -n 'checking for ns resolution stat times'
+stat_code='
+#include <sys/stat.h>
+
+int main(int argc, char **argv)
+{
+ struct stat st;
+ stat(argv[0], &st);
+ return (int) st.BUP_TIME_FIELD;
+}
+'
+if try-c-code -DBUP_TIME_FIELD=st_atim.tv_nsec -- "$stat_code"; then
+ info ' (found, tim)'
+ c_define[BUP_STAT_NS_FLAVOR_TIM]=1
+elif try-c-code -DBUP_TIME_FIELD=st_atimensec.tv_nsec -- "$stat_code"; then
+ info ' (found, timensec)'
+ c_define[BUP_STAT_NS_FLAVOR_TIMENSEC]=1
+elif try-c-code -DBUP_TIME_FIELD=st_atimespec.tv_nsec -- "$stat_code"; then
+ info ' (found, timespec)'
+ c_define[BUP_STAT_NS_FLAVOR_TIMESPEC]=1
+else
+ info ' (not found)'
+ c_define[BUP_STAT_NS_FLAVOR_NONE]=1
+fi
info -n 'checking for libacl'
if pkg-config libacl; then
diff --git a/lib/bup/_helpers.c b/lib/bup/_helpers.c
index c6a99785..969ddadc 100644
--- a/lib/bup/_helpers.c
+++ b/lib/bup/_helpers.c
@@ -42,10 +42,6 @@
#include <sys/ioctl.h>
#endif
-#ifdef HAVE_TM_TM_GMTOFF
-#include <time.h>
-#endif
-
#if defined(BUP_RL_EXPECTED_XOPEN_SOURCE) \
&& (!defined(_XOPEN_SOURCE) || _XOPEN_SOURCE < BUP_RL_EXPECTED_XOPEN_SOURCE)
# warning "_XOPEN_SOURCE version is incorrect for readline"
@@ -113,6 +109,10 @@ static uint64_t htonll(uint64_t value)
#define INTEGRAL_ASSIGNMENT_FITS(dest, src) INT_ADD_OK(src, 0, dest)
+static PyObject *bup_py_zero;
+static PyObject *bup_py_billion;
+
+
static PyObject *bup_bytescmp(PyObject *self, PyObject *args)
{
PyObject *py_s1, *py_s2; // This is really a PyBytes/PyString
@@ -1108,6 +1108,217 @@ static PyObject *bup_set_linux_file_attr(PyObject *self, PyObject *args)
}
#endif /* def BUP_HAVE_FILE_ATTRS */
+
+#ifdef BUP_STAT_NS_FLAVOR_TIM
+# define BUP_STAT_ATIME_NS(st) (st)->st_atim.tv_nsec
+# define BUP_STAT_MTIME_NS(st) (st)->st_mtim.tv_nsec
+# define BUP_STAT_CTIME_NS(st) (st)->st_ctim.tv_nsec
+#elif defined BUP_STAT_NS_FLAVOR_TIMENSEC
+# define BUP_STAT_ATIME_NS(st) (st)->st_atimensec.tv_nsec
+# define BUP_STAT_MTIME_NS(st) (st)->st_mtimensec.tv_nsec
+# define BUP_STAT_CTIME_NS(st) (st)->st_ctimensec.tv_nsec
+#elif defined BUP_STAT_NS_FLAVOR_TIMESPEC
+# define BUP_STAT_ATIME_NS(st) (st)->st_atimespec.tv_nsec
+# define BUP_STAT_MTIME_NS(st) (st)->st_mtimespec.tv_nsec
+# define BUP_STAT_CTIME_NS(st) (st)->st_ctimespec.tv_nsec
+#elif defined BUP_STAT_NS_FLAVOR_NONE
+# define BUP_STAT_ATIME_NS(st) 0
+# define BUP_STAT_MTIME_NS(st) 0
+# define BUP_STAT_CTIME_NS(st) 0
+#else
+# error "./configure did not define a BUP_STAT_NS_FLAVOR"
+#endif
+
+static PyStructSequence_Field xstat_result_fields[] = {
+ {"st_mode", "access modes"},
+ {"st_ino", "inode"},
+ {"st_dev", "containing device ID"},
+ {"st_nlink", "link count"},
+ {"st_uid", "owner's user id"},
+ {"st_gid", "owner's group id"},
+ {"st_size", "length in bytes"},
+ // Below here we no longer match the os.stat_result field order.
+ // But we shouldn't rely on index references anyway.
+ {"st_atime_ns", "access time {ns)"},
+ {"st_mtime_ns", "modification time (ns)"},
+ {"st_ctime_ns", "change time (ns)"},
+ {"st_rdev", "character or block special device ID"},
+ {0, 0}
+};
+
+static PyStructSequence_Desc xstat_result_desc = {
+ "bup.xstat.xstat_result",
+ "bup fstat, stat, lstat data",
+ xstat_result_fields,
+ 11 // how many fields are visible in python (all)
+};
+
+static PyTypeObject *xstat_result_type;
+
+static PyObject *make_timestamp(PyObject *sec, PyObject *ns)
+{
+ // Ensures sec and ns end up unrefed
+ PyObject *result = NULL;
+ if (sec == NULL || ns == NULL)
+ goto finish;
+ PyObject *sec_as_ns = PyNumber_Multiply(sec, bup_py_billion);
+ if (sec_as_ns == NULL)
+ goto finish;
+ result = PyNumber_Add(sec_as_ns, ns);
+ Py_DECREF (sec_as_ns);
+ finish:
+ Py_XDECREF(sec);
+ Py_XDECREF(ns);
+ return result;
+}
+
+static PyObject *stat_struct_to_py(const struct stat *st)
+{
+ // The (unsigned) long long conversions (which includes
+ // BUP_LONGISH_TO_PY) are OK because setup_module has already
+ // ensured that the values will fit.
+
+#ifdef __CYGWIN__
+ // These are potentially redundant until/unless we remove the
+ // metadata _add_common guards (which we could, given that posix
+ // allows negative values). See also:
+ // dbb239b26a6e373c448a27e435afe48d76e99086
+ // Drop workaround for negative cygwin uid/gid
+ if (st->st_uid < 0)
+ return PyErr_Format(PyExc_ValueError,
+ "negative Cygwin st_uid (please report)");
+ if (st->st_gid < 0)
+ return PyErr_Format(PyExc_ValueError,
+ "negative Cygwin st_gid (please report)");
+#endif
+
+ PyObject *sr = PyStructSequence_New(xstat_result_type);
+
+ PyObject *v = BUP_LONGISH_TO_PY(st->st_mode);
+ if (v == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 0, v);
+
+ v = BUP_LONGISH_TO_PY(st->st_ino);
+ if (v == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 1, v);
+
+ // Depending on the version, Python treats some values like dev_t
+ // as unsigned, no matter what the platform defines (e.g. it's
+ // signed on macos), and we want the real value.
+ v = BUP_LONGISH_TO_PY(st->st_dev);
+ if (v == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 2, v);
+
+ v = BUP_LONGISH_TO_PY(st->st_nlink);
+ if (v == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 3, v);
+
+ v = BUP_LONGISH_TO_PY(st->st_uid);
+ if (v == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 4, v);
+
+ v = BUP_LONGISH_TO_PY(st->st_gid);
+ if (v == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 5, v);
+
+ v = BUP_LONGISH_TO_PY(st->st_size);
+ if (v == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 6, v);
+
+ {
+ PyObject *stamp;
+ stamp = make_timestamp (BUP_LONGISH_TO_PY(st->st_atime),
+ BUP_LONGISH_TO_PY(BUP_STAT_ATIME_NS(st)));
+ if (stamp == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 7, stamp);
+
+ stamp = make_timestamp (BUP_LONGISH_TO_PY(st->st_mtime),
+ BUP_LONGISH_TO_PY(BUP_STAT_MTIME_NS(st)));
+ if (stamp == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 8, stamp);
+
+ stamp = make_timestamp (BUP_LONGISH_TO_PY(st->st_ctime),
+ BUP_LONGISH_TO_PY(BUP_STAT_CTIME_NS(st)));
+ if (stamp == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 9, stamp);
+ }
+
+ v = BUP_LONGISH_TO_PY(st->st_rdev);
+ if (v == NULL) { Py_DECREF(sr); return NULL; }
+ PyStructSequence_SetItem(sr, 10, v);
+
+ return sr;
+}
+
+// Handle stat(path_or_fd, *, dir_fd=None) or equivalent lstat.
+static PyObject
+*bup_xstat(PyObject *py_path, PyObject *dir_fd, int follow_symlinks)
+{
+ PyObject *result = NULL;
+
+ // Establish the "path" (fd, bytes, or string (as bytes))
+ int path_fd = AT_FDCWD;
+ PyBytesObject *path_bytes = NULL; // if set, must eventually be unrefed
+ const char *path = NULL;
+ if (PyLong_Check(py_path)) {
+ if (dir_fd != Py_None)
+ return PyErr_Format(PyExc_ValueError, "fd disallows dir_fd");
+ if (!bup_int_from_py (&path_fd, py_path, "path"))
+ return NULL;
+ } else {
+ if (!PyUnicode_FSConverter(py_path, &path_bytes))
+ return NULL;
+ // Must goto failed after this point
+ if((path = PyBytes_AS_STRING (path_bytes)) == NULL)
+ goto failed;
+ }
+ // Now we have either path_fd or path_bytes
+
+ int rc;
+ struct stat st;
+ int dfd = AT_FDCWD;
+ if (dir_fd && dir_fd != Py_None &&
+ !bup_int_from_py (&dfd, dir_fd, "dir_fd"))
+ goto failed;
+ rc = fstatat(dfd, path, &st, follow_symlinks ? 0 : AT_SYMLINK_NOFOLLOW);
+ if (rc != 0) {
+ PyErr_SetFromErrnoWithFilename(PyExc_OSError, path);
+ goto failed;
+ }
+ result = stat_struct_to_py(&st);
+
+ failed:
+ Py_XDECREF(path_bytes);
+ return result;
+}
+
+// Handle stat(path_or_fd, *, dir_fd=None) or equivalent lstat.
+static PyObject *bup_stat(PyObject *self, PyObject *args, PyObject *kwds)
+{
+ int follow_symlinks;
+ PyObject *py_path, *dir_fd = NULL;
+ static char *keys[] = {"path", "dir_fd", "follow_symlinks", NULL };
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|$Op", keys,
+ &py_path, &dir_fd, &follow_symlinks))
+ return NULL;
+ return bup_xstat(py_path, dir_fd, follow_symlinks);
+}
+
+// We don't just have an xstat.lstat follow_symlinks=False stat
+// wrapper because this is easy, lstat is our most common stat-related
+// call, and specializing made drecurse about a 10k paths/s faster in
+// trials.
+static PyObject *bup_lstat(PyObject *self, PyObject *args, PyObject *kwds)
+{
+ PyObject *py_path, *dir_fd = NULL;
+ static char *keys[] = {"path", "dir_fd", NULL };
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|$O", keys,
+ &py_path, &dir_fd))
+ return NULL;
+ return bup_xstat(py_path, dir_fd, 0);
+}
+
+
static unsigned int vuint_encode(long long val, char *buf)
{
unsigned int len = 0;
@@ -1834,6 +2045,12 @@ static PyMethodDef helper_methods[] = {
{ "set_linux_file_attr", bup_set_linux_file_attr, METH_VARARGS,
"Set the Linux attributes for the given file." },
#endif
+ { "stat", (PyCFunction)(void(*)(void)) bup_stat,
+ METH_VARARGS | METH_KEYWORDS,
+ "Extended version of stat." },
+ { "lstat", (PyCFunction)(void(*)(void)) bup_lstat,
+ METH_VARARGS | METH_KEYWORDS,
+ "Extended version of lstat." },
{ "bytescmp", bup_bytescmp, METH_VARARGS,
"Return a negative value if x < y, zero if equal, positive otherwise."},
{ "getpwuid", bup_getpwuid, METH_VARARGS,
@@ -1918,7 +2135,8 @@ static void test_integral_assignment_fits(void)
}
}
-static void setattr_or_die (PyObject *obj, const char *name, PyObject *val,
+static PyObject *
+setattr_or_die (PyObject *obj, const char *name, PyObject *val,
const char *msg)
{
// Ensures reference to val is dropped
@@ -1927,28 +2145,91 @@ static void setattr_or_die (PyObject *obj, const char *name, PyObject *val,
exit(BUP_EXIT_FAILURE);
}
Py_DECREF(val);
+ return val;
+}
+
+static int
+set_dict_item (PyObject *dict, const char *name, PyObject *val)
+{
+ // Ensures reference to val is dropped
+ if (val == NULL || PyDict_SetItemString(dict, name, val))
+ return 0;
+ Py_DECREF(val);
+ return 1;
}
+static void
+set_ssize_or_die(PyObject *dict, const char *name, int is_signed, size_t size)
+{
+ // Set dict[name] to -size if signed, otherwise size.
+ if (!is_signed) {
+ if (!set_dict_item(dict, name, PyLong_FromSize_t(size)))
+ goto fail;
+ return;
+ }
+
+ ssize_t ssize;
+ if (!INT_SUBTRACT_OK(0, size, &ssize))
+ goto fail;
+ if (!set_dict_item(dict, name, PyLong_FromSsize_t(ssize)))
+ goto fail;
+ return;
+
+ fail:
+ fprintf(stderr, "error: unable to set c_type_signed_size['%s']\n", name);
+ exit(BUP_EXIT_FAILURE);
+}
+
+
static int setup_module(PyObject *m)
{
// FIXME: migrate these tests to configure, or at least don't
- // possibly crash the whole application. Check against the type
- // we're going to use when passing to python. Other stat types
- // are tested at runtime.
- assert(sizeof(ino_t) <= sizeof(unsigned PY_LONG_LONG));
- assert(sizeof(off_t) <= sizeof(PY_LONG_LONG));
- assert(sizeof(blksize_t) <= sizeof(PY_LONG_LONG));
- assert(sizeof(blkcnt_t) <= sizeof(PY_LONG_LONG));
- // Just be sure (relevant when passing timestamps back to Python above).
- assert(sizeof(PY_LONG_LONG) <= sizeof(long long));
- assert(sizeof(unsigned PY_LONG_LONG) <= sizeof(unsigned long long));
- // At least for BUP_LONGISH_TO_PY
+ // possibly crash the whole application.
+
+ // Establish that all the "long long" integer types are the same
+ // size, and that the intmax types are no larger. This is almost
+ // certainly true, and for now, simplifies
+ // conversions. i.e. BUP_LONGISH_TO_PY is always sufficient when
+ // the type fits in a "long long" flavor. Note that intmax_t and
+ // uintmax_t are not guaranteed to be the largest integer types
+ // (e.g. gcc __int128). Some of these checks are likely
+ // redundant, standards-wise but they're roughly free.
+ assert(sizeof(long long) == sizeof(unsigned long long));
assert(sizeof(intmax_t) <= sizeof(long long));
- assert(sizeof(uintmax_t) <= sizeof(unsigned long long));
+ assert(sizeof(uintmax_t) <= sizeof(long long));
+ assert(sizeof(PY_LONG_LONG) == sizeof(long long));
+ assert(sizeof(unsigned PY_LONG_LONG) == sizeof(long long));
+
// This should be guaranteed by the C standard, but it's cheap to
// double-check, and we depend on it.
assert(sizeof(unsigned long) >= sizeof(uint32_t));
+ // Simplify the stat conversions by ensuring that all the types
+ // fit in a long long or unsigned long long.
+ assert(sizeof(ino_t) <= sizeof(long long));
+ assert(sizeof(dev_t) <= sizeof(long long));
+ assert(sizeof(nlink_t) <= sizeof(long long));
+ assert(sizeof(uid_t) <= sizeof(long long));
+ assert(sizeof(gid_t) <= sizeof(long long));
+ assert(sizeof(size_t) <= sizeof(long long));
+ assert(sizeof(((struct stat *) 0)->st_atime) <= sizeof(long long));
+ assert(sizeof(((struct stat *) 0)->st_mtime) <= sizeof(long long));
+ assert(sizeof(((struct stat *) 0)->st_ctime) <= sizeof(long long));
+ assert(sizeof(BUP_STAT_ATIME_NS((struct stat *) 0)) <= sizeof(long long));
+ assert(sizeof(BUP_STAT_MTIME_NS((struct stat *) 0)) <= sizeof(long long));
+ assert(sizeof(BUP_STAT_CTIME_NS((struct stat *) 0)) <= sizeof(long long));
+ assert(sizeof(long long) <= 8); // For ENTRY_SIG, at least
+
+ // Ensure the platform respects posix for the relevant stat_t
+ // types (supports at least index.ENTRY_SIG).
+ assert(!TYPE_SIGNED(ino_t));
+ assert(!TYPE_SIGNED(size_t));
+
+ // ENTRY_SIG has always used int; let's continue until some new
+ // platform requires us to force everyone to reindex
+ // (cf. make_index_entry_packfmt).
+ assert(sizeof(mode_t) <= sizeof(int));
+
test_integral_assignment_fits();
// Originally required by append_sparse_region()
@@ -1961,6 +2242,28 @@ static int setup_module(PyObject *m)
}
}
+ {
+ assert(CHAR_BIT == 8);
+ PyObject *d =
+ setattr_or_die (m, "c_type_signed_size", PyDict_New(),
+ "error: unable to define c_type_signed_size\n");
+ set_ssize_or_die(d, "dev_t", TYPE_SIGNED(dev_t), sizeof(dev_t));
+ set_ssize_or_die(d, "mode_t", TYPE_SIGNED(mode_t), sizeof(mode_t));
+ set_ssize_or_die(d, "nlink_t", TYPE_SIGNED(nlink_t), sizeof(nlink_t));
+ }
+
+ bup_py_zero = PyLong_FromUnsignedLong(0);
+ if (bup_py_zero == NULL) {
+ fprintf(stderr, "error: unable to define a zero\n");
+ exit(BUP_EXIT_FAILURE);
+ }
+
+ bup_py_billion = PyLong_FromUnsignedLongLong(1000000000);
+ if (bup_py_billion == NULL) {
+ fprintf(stderr, "error: unable to define a billion\n");
+ exit(BUP_EXIT_FAILURE);
+ }
+
{
PyObject *math = PyImport_ImportModule("math");
if (!math) {
@@ -1979,6 +2282,8 @@ static int setup_module(PyObject *m)
}
}
+ xstat_result_type = PyStructSequence_NewType(&xstat_result_desc);
+
setattr_or_die (m, "INT_MAX", BUP_LONGISH_TO_PY(INT_MAX),
"error: unable to define INT_MAX\n");
setattr_or_die (m, "UINT_MAX", BUP_LONGISH_TO_PY(UINT_MAX),
diff --git a/lib/bup/cmd/midx.py b/lib/bup/cmd/midx.py
index f483eae4..479d0c2b 100644
--- a/lib/bup/cmd/midx.py
+++ b/lib/bup/cmd/midx.py
@@ -212,7 +212,7 @@ def do_midx_dir(path, outfilename, prout, auto=False, force=False,
# sort the biggest+newest midxes first, so that we can eliminate
# smaller (or older) redundant ones that come later in the list
- midxs.sort(key=lambda ix: (-sizes[ix], -xstat.stat(ix).st_mtime))
+ midxs.sort(key=lambda ix: (-sizes[ix], -xstat.stat(ix).st_mtime_ns))
for mname in midxs:
any = 0
diff --git a/lib/bup/git.py b/lib/bup/git.py
index 1200d166..aa5bcf18 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -702,7 +702,7 @@ class PackIdxList:
% (path_msg(n), path_msg(mxf)))
unlink(full)
midxl.sort(key=lambda ix:
- (-len(ix), -xstat.stat(
ix.name).st_mtime))
+ (-len(ix), -xstat.stat(
ix.name).st_mtime_ns))
for ix in midxl:
any_needed = False
for sub in ix.idxnames:
diff --git a/lib/bup/index.py b/lib/bup/index.py
index 3618862d..67446178 100644
--- a/lib/bup/index.py
+++ b/lib/bup/index.py
@@ -1,11 +1,12 @@
from contextlib import ExitStack
-import os, stat, struct
+import os, stat, struct, sys
from bup import metadata, xstat
-from bup._helpers import UINT_MAX, bytescmp
+from bup._helpers import UINT_MAX, bytescmp, c_type_signed_size
from bup.helpers import \
- (add_error,
+ (EXIT_FAILURE,
+ add_error,
atomically_replaced_file,
fsync,
log,
@@ -16,38 +17,70 @@ from bup.helpers import \
qprogress,
resolve_parent,
slashappend)
+from
bup.io import path_msg
from bup.metadata import empty_metadata
EMPTY_SHA = b'\0' * 20
FAKE_SHA = b'\x01' * 20
-INDEX_HDR = b'BUPI\0\0\0\7'
+INDEX_SIG_V7 = b'BUPI\0\0\0\x07'
+INDEX_SIG = b'BUPI\0\0\0\x08'
+assert len(INDEX_SIG_V7) == len (INDEX_SIG)
# Time values are handled as integer nanoseconds since the epoch in
# memory, but are written as xstat/metadata timespecs. This behavior
# matches the existing metadata/xstat/.bupm code.
-
+#
# Record times (mtime, ctime, atime) as xstat/metadata timespecs, and
# store all of the times in the index so they won't interfere with the
# forthcoming metadata cache.
-INDEX_SIG = ('!'
- 'Q' # dev
- 'Q' # ino
- 'Q' # nlink
- 'qQ' # ctime_s, ctime_ns
- 'qQ' # mtime_s, mtime_ns
- 'qQ' # atime_s, atime_ns
- 'Q' # size
- 'I' # mode
- 'I' # gitmode
- '20s' # sha
- 'H' # flags
- 'Q' # children_ofs
- 'I' # children_n
- 'Q') # meta_ofs
-
-ENTLEN = struct.calcsize(INDEX_SIG)
+#
+# POSIX specifies that all of the non-timestamp types here except
+# dev_t, mode_t, and nlink_t must be unsigned, and _helpers
+# setup_module() guarantees they all fit in eight bytes. For now, we
+# leave mode an int (i/I), as it has long been, and setup_module()
+# ensures that's OK too. Note that the timestamp pairs are *ours*
+# (via nsecs_to_timespec), not the POSIX specified time_t and long.
+
+# ENTRY_SIG_V7 was the long-standing format before we started matching
+# the platform's sign for the three relevant types, and including the
+# format in the index (in v8).
+#
+ENTRY_SIG_V7 = ('!'
+ 'Q' # dev
+ 'Q' # ino
+ 'Q' # nlink
+ 'qQ' # ctime_s, ctime_ns
+ 'qQ' # mtime_s, mtime_ns
+ 'qQ' # atime_s, atime_ns
+ 'Q' # size
+ 'I' # mode
+ 'I' # gitmode
+ '20s' # sha
+ 'H' # flags
+ 'Q' # children_ofs
+ 'I' # children_n
+ 'Q') # meta_ofs
+
+ENTRY_SIG = ''.join(('!',
+ 'q' if c_type_signed_size['dev_t'] < 0 else 'Q',
+ 'Q', # ino
+ 'q' if c_type_signed_size['nlink_t'] < 0 else 'Q',
+ 'qQ' # ctime_s, ctime_ns
+ 'qQ' # mtime_s, mtime_ns
+ 'qQ' # atime_s, atime_ns
+ 'Q', # size
+ 'i' if c_type_signed_size['mode_t'] < 0 else 'I', # mode
+ 'i' if c_type_signed_size['mode_t'] < 0 else 'I', # gitmode
+ '20s' # sha
+ 'H' # flags
+ 'Q' # children_ofs
+ 'I' # children_n
+ 'Q')) # meta_ofs
+
+ENTLEN = struct.calcsize(ENTRY_SIG)
+
FOOTER_SIG = '!Q'
FOOTLEN = struct.calcsize(FOOTER_SIG)
@@ -55,10 +88,21 @@ IX_EXISTS = 0x8000 # file exists on filesystem
IX_HASHVALID = 0x4000 # the stored sha1 matches the filesystem
IX_SHAMISSING = 0x2000 # the stored sha1 object doesn't seem to exist
+
class Error(Exception):
pass
+def header_len(sig):
+ if sig == INDEX_SIG:
+ # The byte before the ENTRY_SIG contains its length
+ return len(INDEX_SIG) + 1 + len(ENTRY_SIG)
+ if sig == INDEX_SIG_V7:
+ return len(INDEX_SIG_V7)
+ log(f'Unsupported index version {sig}, please --clear and reindex\n')
+ sys.exit(EXIT_FAILURE)
+
+
class MetaStoreReader:
def __init__(self, filename):
self._file = None
@@ -205,7 +249,7 @@ class Entry:
ctime = xstat.nsecs_to_timespec(self.ctime)
mtime = xstat.nsecs_to_timespec(self.mtime)
atime = xstat.nsecs_to_timespec(self.atime)
- return struct.pack(INDEX_SIG,
+ return struct.pack(ENTRY_SIG,
self.dev, self.ino, self.nlink,
ctime[0], ctime[1],
mtime[0], mtime[1],
@@ -366,7 +410,7 @@ class ExistingEntry(Entry):
self.ctime, ctime_ns, self.mtime, mtime_ns, self.atime, atime_ns,
self.size, self.mode, self.gitmode, self.sha,
self.flags, self.children_ofs, self.children_n, self.meta_ofs
- ) = struct.unpack(INDEX_SIG, m[ofs : ofs + ENTLEN])
+ ) = struct.unpack(ENTRY_SIG, m[ofs : ofs + ENTLEN])
self.atime = xstat.timespec_to_nsecs((self.atime, atime_ns))
self.mtime = xstat.timespec_to_nsecs((self.mtime, mtime_ns))
self.ctime = xstat.timespec_to_nsecs((self.ctime, ctime_ns))
@@ -430,17 +474,32 @@ class Reader:
self.m = b''
self.writable = False
self.count = 0
+ self._entries_start = header_len(INDEX_SIG) # for new indexes
with ExitStack() as ctx:
try:
# pylint: disable-next=consider-using-with
f = ctx.enter_context(open(filename, 'rb+'))
except FileNotFoundError:
return
- b = f.read(len(INDEX_HDR))
- if b != INDEX_HDR:
- log('warning: %s: header: expected %r, got %r\n'
- % (filename, INDEX_HDR, b))
- return
+
+ header = f.read(len(INDEX_SIG))
+ if header == INDEX_SIG_V7 and ENTRY_SIG == ENTRY_SIG_V7:
+ pass
+ elif header == INDEX_SIG:
+ entry_len = ord(f.read(1))
+ entry_sig = f.read(entry_len).decode('ascii')
+ if entry_sig != ENTRY_SIG:
+ log(f'error: incompatible index {path_msg(filename)},'
+ f' please --clear and reindex\n')
+ log(f'(entry signature is {entry_sig} not {ENTRY_SIG})\n')
+ sys.exit(EXIT_FAILURE)
+ else:
+ log(f'error: incompatible index {path_msg(filename)},'
+ f' please --clear and reindex\n')
+ log(f'(index signature is {header} not {INDEX_SIG})\n')
+ sys.exit(EXIT_FAILURE)
+
+ self._entries_start = header_len(header)
st = os.fstat(f.fileno())
if st.st_size:
m = mmap_readwrite(f)
@@ -460,7 +519,7 @@ class Reader:
return int(self.count)
def forward_iter(self):
- ofs = len(INDEX_HDR)
+ ofs = self._entries_start
while ofs+ENTLEN <= len(self.m)-FOOTLEN:
eon = self.m.find(b'\0', ofs)
assert(eon >= 0)
@@ -471,7 +530,7 @@ class Reader:
ofs = eon + 1 + ENTLEN
def iter(self, name=None, wantrecurse=None):
- if len(self.m) > len(INDEX_HDR)+ENTLEN:
+ if len(self.m) > self._entries_start + ENTLEN:
dname = name
if dname and not dname.endswith(b'/'):
dname += b'/'
@@ -554,7 +613,10 @@ class Writer:
buffering=65536)
self.f = self.cleanup.enter_context(self.pending_index)
self.cleanup.enter_context(self.f)
- self.f.write(INDEX_HDR)
+ self.f.write(INDEX_SIG)
+ sig = ENTRY_SIG.encode('ascii')
+ self.f.write(b'%c' % len(sig))
+ self.f.write(sig)
self.cleanup = self.cleanup.pop_all()
def __enter__(self): return self
diff --git a/lib/bup/xstat.py b/lib/bup/xstat.py
index 84062552..e87feec1 100644
--- a/lib/bup/xstat.py
+++ b/lib/bup/xstat.py
@@ -1,9 +1,12 @@
"""Enhanced stat operations for bup."""
from time import strftime
-import os, sys, time
+import os, time
import stat as pystat
+from bup import _helpers
+from bup._helpers import c_type_signed_size
+
def timespec_to_nsecs(ts):
ts_s, ts_ns = ts
@@ -49,29 +52,54 @@ def lutime(path, times):
os.utime(path, ns=times, follow_symlinks=False)
-if not sys.platform.startswith('cygwin'):
- stat = os.stat
- fstat = os.fstat
- lstat = os.lstat
-else:
- # These are potentially redundant until/unless we remove the
- # metadata _add_common guards (which we could, given that posix
- # allows negative values).
- def stat(path):
- st = os.stat(path)
- assert st.st_uid >= 0, st
- assert st.st_gid >= 0, st
- return st
- def fstat(path):
- st = os.fstat(path)
- assert st.st_uid >= 0, st
- assert st.st_gid >= 0, st
- return st
- def lstat(path):
- st = os.lstat(path)
- assert st.st_uid >= 0, st
- assert st.st_gid >= 0, st
- return st
+# We provide our own stat wrappers, and they produce an xstat_result,
+# not an os.stat_result. Our result provides a subset of the
+# stat_result fields, and those fields respect the platform's actual
+# types, in particular with respect to sign. For example, dev_t is
+# unsigned on Linux, and FreeBSD, but signed on macOS.
+#
+# Originally, Python treated dev_t as signed when populating
+# os.stat_result regardless of the platform's actual definition (POSIX
+# does not require it to be signed or unsigned), but Python eventually
+# switched to treat dev_t as unsigned[1], no matter what the platform
+# specified in <sys/types.h>, with the one exception that if the
+# platform defines NODEV, then that can be returned, and is typically
+# -1. The switch to unsigned dev_t is/was also incomplete, for
+# example still rejecting "high bit set" makedev values.
+#
+# Having our own stat functions ensures the vaules returned are
+# consistent across platforms and across Python versions. We also
+# don't need or provide all of the stat fields right now. For
+# example, xstat_result only provides nanosecond timestamps via
+# st_[amc]time_ns, not st_[amc]time.
+#
+# Note, though, that because we produce the platform's actual values
+# for dev_t, etc., we have to be careful if we pass those values to
+# Python functions, which is why we wrap mknod() below.
+#
+# [1] This is where Python switched to unsigned dev_t:
+#
+# 7111d9605f9db7aa0b095bb8ece7ccc0b8115c3f
+# gh-89928: Fix integer conversion of device numbers (GH-31794)
+#
https://github.com/python/cpython/pull/31794
+#
+# and it has been backported to various minor releases over time.
+
+stat = _helpers.stat
+lstat = _helpers.lstat
+fstat = _helpers.stat
+
+# Assuming two's complement, offset to convert negative values to
+# their corresponding unsigned equivalents (as if coerced in C).
+_dev_t_shift = 1 << abs(c_type_signed_size['dev_t']) * 8
+_nodev = getattr(os, 'NODEV', 0)
+
+def mknod(path, mode=0o600, device=0, *, dir_fd=None):
+ # If needed, adapt our native dev_t values to the unsigned values
+ # os.mknod expects.
+ if device < 0 and device != _nodev:
+ device += _dev_t_shift
+ return os.mknod(path, mode, device, dir_fd=dir_fd)
def mode_str(mode):
diff --git a/src/bup/pyutil.c b/src/bup/pyutil.c
index 5c480a4d..e01b9268 100644
--- a/src/bup/pyutil.c
+++ b/src/bup/pyutil.c
@@ -36,24 +36,17 @@ void *checked_malloc(size_t n, size_t size)
return result;
}
-int bup_ulong_from_py(unsigned long *x, PyObject *py, const char *name)
+int bup_int_from_py(int *x, PyObject *py, const char *name)
{
- if (!PyLong_Check(py))
- {
- PyErr_Format(PyExc_TypeError, "%s expected integer, not %R", name, py);
- return 0;
- }
-
- const unsigned long tmp = PyLong_AsUnsignedLong(py);
- if (PyErr_Occurred())
- {
- if (PyErr_ExceptionMatches(PyExc_OverflowError))
- PyErr_Format(PyExc_OverflowError, "%s overflows unsigned long: %R",
- name, py);
+ const long i = PyLong_AsLong(py);
+ if (i == -1 && PyErr_Occurred()) {
+ PyErr_Format(PyExc_OverflowError, "%s overflows int: %R", name, py);
return 0;
}
- *x = tmp;
- return 1;
+ if (INT_ADD_OK(i, 0, x))
+ return 1;
+ PyErr_Format(PyExc_OverflowError, "%s overflows int: %R", name, py);
+ return 0;
}
int bup_uint_from_py(unsigned int *x, PyObject *py, const char *name)
@@ -72,6 +65,26 @@ int bup_uint_from_py(unsigned int *x, PyObject *py, const char *name)
return 1;
}
+int bup_ulong_from_py(unsigned long *x, PyObject *py, const char *name)
+{
+ if (!PyLong_Check(py))
+ {
+ PyErr_Format(PyExc_TypeError, "%s expected integer, not %R", name, py);
+ return 0;
+ }
+
+ const unsigned long tmp = PyLong_AsUnsignedLong(py);
+ if (PyErr_Occurred())
+ {
+ if (PyErr_ExceptionMatches(PyExc_OverflowError))
+ PyErr_Format(PyExc_OverflowError, "%s overflows unsigned long: %R",
+ name, py);
+ return 0;
+ }
+ *x = tmp;
+ return 1;
+}
+
int bup_ullong_from_py(unsigned PY_LONG_LONG *x, PyObject *py, const char *name)
{
if (!PyLong_Check(py))
diff --git a/src/bup/pyutil.h b/src/bup/pyutil.h
index 93059e98..1a0ca74d 100644
--- a/src/bup/pyutil.h
+++ b/src/bup/pyutil.h
@@ -10,6 +10,7 @@
void *checked_calloc(size_t n, size_t size);
void *checked_malloc(size_t n, size_t size);
+int bup_int_from_py(int *x, PyObject *py, const char *name);
int bup_uint_from_py(unsigned int *x, PyObject *py, const char *name);
int bup_ulong_from_py(unsigned long *x, PyObject *py, const char *name);
int bup_ullong_from_py(unsigned long long *x, PyObject *py, const char *name);
--
2.47.3