Add enc_dsqs and enc_shs, which are very similar to enc_dsq and
enc_sh, except that they work on strings instead of bytes and pass
through any non-ASCII characters (ord() > 127). They also \xNN-escape
any lone surrogates encountered (presumably produced by
errors=surrogateescape), where NN is the hexadecimal value of the
original byte.
Change path_msg to return the enc_shs() result for the fsdecoded
(surrogate-escaped) value. This is intended to provide unsurprising,
copy-pasteable values on standard error much of the time.
There's some deliberate code duplication, on the presumption that
these functions may be called often enough to justify it.
cf. https://peps.python.org/pep-0383/
Signed-off-by: Rob Browning <r...@defaultvalue.org>
Tested-by: Rob Browning <r...@defaultvalue.org>
---
Now that we have the POSIX quoting code (and POSIX quoting is
sufficiently expressive via the newer $'...' option), the key change
here is to POSIX quote (when needed) most paths that we print to
stderr (log) instead of rendering them via Python's
errors=backslashreplace.
Proposed for main.
lib/bup/io.py | 100 ++++++++++++++++++++++++++++++++++++++++----
note/main.md | 4 ++
test/int/test_io.py | 92 +++++++++++++++++++++++++++++-----------
3 files changed, 164 insertions(+), 32 deletions(-)
diff --git a/lib/bup/io.py b/lib/bup/io.py
index 3c20ea87..b93de5de 100644
--- a/lib/bup/io.py
+++ b/lib/bup/io.py
@@ -1,5 +1,6 @@
from errno import EAGAIN
+from os import fsdecode
import mmap as py_mmap
import os, select, sys, time
@@ -101,9 +102,23 @@ def _make_enc_sh_map():
_enc_sh_map = _make_enc_sh_map()
+def _make_enc_shs_map():
+ m = [None] * 128
+ for i in range(128):
+ enc = _enc_sh_map[i]
+ if enc:
+ m[i] = enc.decode('ascii')
+ return m
+
+_enc_shs_map = _make_enc_shs_map()
+
+
def enc_dsq(val):
- """Encode val in POSIX $'...' (dollar-single-quote) format."""
- # https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html#tag_19_02_04
+ """Encode val (bytes) in POSIX $'...' (dollar-single-quote)
+ format.
+ https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html#tag_19_02_04
+
+ """
result = [b"$'"]
part_start = 0
i = 0
@@ -127,10 +142,47 @@ def enc_dsq(val):
result.append(b"'")
return b''.join(result)
+def enc_dsqs(val):
+ """Encode string in POSIX $'...' (dollar-single-quote) format with
+ any surrogates (from surrogate escape) \\xNN encoded as the
+ original bytes. Pass through any characters whose ord() is >= 128.
+ https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html#tag_19_02_04
+ https://peps.python.org/pep-0383/
+
+ """
+ result = ["$'"]
+ part_start = 0
+ i = 0
+
+ def finish_part():
+ nonlocal result, i, part_start
+ if i != part_start:
+ result.append(val[part_start:i])
+ part_start = i = i + 1
+
+ encoding = _enc_shs_map
+ while i < len(val):
+ b = ord(val[i])
+ if b < 128:
+ enc = encoding[b]
+ elif (b >= 0xdc80 and b <= 0xdcff): # surrogate escape
+ enc = r'\x%02x' % (128 + (b - 0xdc80))
+ else:
+ enc = None
+ if enc:
+ finish_part()
+ result.append(enc)
+ else:
+ i += 1
+ finish_part()
+ result.append("'")
+ return ''.join(result)
+
+
def enc_sh(val):
- """Minimally POSIX quote val as a single line. Use no quotes if
- possible, single quotes if val doesn't contain single quotes or
- newline, otherwise dollar-single-quote.
+ """Minimally POSIX quote val (bytes) as a single line. Use no
+ quotes if possible, single quotes if val doesn't contain single
+ quotes or newline, otherwise dollar-single-quote.
https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html#tag_19_02
For now, like git with core.quotePath set to false, this
@@ -146,7 +198,7 @@ def enc_sh(val):
need_sq = False
need_dsq = False
for c in val: # 32 is space
- if c < 32 or c == b"'"[0]:
+ if c < 32 or c >= 127 or c == b"'"[0]:
need_dsq = True
break
# This set is everything from POSIX except ' and \n (handled above).
@@ -158,10 +210,42 @@ def enc_sh(val):
return b"'%s'" % val
return val
+def enc_shs(val):
+ """Minimally POSIX quote val (string) as a single line. Use no
+ quotes if possible, single quotes if val doesn't contain single
+ quotes or newline, otherwise dollar-single-quote.
+ https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html#tag_19_02
+ https://peps.python.org/pep-0383/
+
+ \\xNN encode any surrogates (from surrogate escape) as the
+ original bytes. Pass through any characters whose ord() is >= 128.
+
+ """
+ #pylint: disable=consider-using-in
+ assert isinstance(val, str), val
+ if val == '':
+ return "''"
+ need_sq = False
+ need_dsq = False
+ for ch in val:
+ c = ord(ch)
+ if c < 32 or c == b"'"[0] or c == 127 \
+ or (c >= 0xdc80 and c <= 0xdcff): # lone surrogate (PEP-0383)
+ need_dsq = True
+ break
+ # This set is everything from POSIX except ' and \n (handled above).
+ if ch in '|&;<>()$`\\" \t*?[]^!#~=%{,}':
+ need_sq = True
+ if need_dsq:
+ return enc_dsqs(val)
+ if need_sq:
+ return f"'{val}'"
+ return val
+
+
def path_msg(x):
"""Return a string representation of a path."""
- # FIXME: configurability (might git-config quotePath be involved?)
- return x.decode(errors='backslashreplace')
+ return enc_shs(fsdecode(x))
assert not hasattr(py_mmap.mmap, '__del__')
diff --git a/note/main.md b/note/main.md
index 7f85cdf0..bd597f6c 100644
--- a/note/main.md
+++ b/note/main.md
@@ -144,6 +144,10 @@ General
single quote or newline, and falls back to `$'...'` quoting
otherwise.
+* Many of the paths printed in error and status messages are now POSIX
+ quoted rather than rendered via Python's `backslashreplace`, but note
+ that the format is not settled, i.e. it may continue to change.
+
Bugs
----
diff --git a/test/int/test_io.py b/test/int/test_io.py
index a6e30ca5..d683aa0c 100644
--- a/test/int/test_io.py
+++ b/test/int/test_io.py
@@ -1,38 +1,37 @@
from wvpytest import *
-from bup.io import enc_dsq, enc_sh
+from bup.io import enc_dsq, enc_dsqs, enc_sh, enc_shs
-def test_enc_dsq():
- def enc_byte(b):
- bb = bytes([b])
- sym = {b'\a': br'\a',
- b'\b': br'\b',
- b'\t': br'\t',
- b'\n': br'\n',
- b'\v': br'\v',
- b'\f': br'\f',
- b'\r': br'\r',
- b'\x1b': br'\e'}
- sub = sym.get(bb)
- if sub:
- return sub
- if bb == b"'":
- return br"\'"
- if bb == b'\\':
- return br'\\'
- if b >= 127 or b < 7 or (b > 13 and b < 27) or (b > 27 and b < 32):
- return br'\x%02x' % b
- return bb
+def _dsq_enc_byte(b):
+ bb = bytes([b])
+ sym = {b'\a': br'\a',
+ b'\b': br'\b',
+ b'\t': br'\t',
+ b'\n': br'\n',
+ b'\v': br'\v',
+ b'\f': br'\f',
+ b'\r': br'\r',
+ b'\x1b': br'\e'}
+ sub = sym.get(bb)
+ if sub:
+ return sub
+ if bb == b"'":
+ return br"\'"
+ if bb == b'\\':
+ return br'\\'
+ if b >= 127 or b < 7 or (b > 13 and b < 27) or (b > 27 and b < 32):
+ return br'\x%02x' % b
+ return bb
+def test_enc_dsq():
def enc(bv):
result = [b"$'"]
for b in bv:
- result.append(enc_byte(b))
+ result.append(_dsq_enc_byte(b))
result.append(b"'")
return b''.join(result)
-
for i in range(1, 256):
bi = bytes([i])
wvpasseq(enc(bi), enc_dsq(bi))
@@ -47,13 +46,58 @@ def test_enc_dsq():
assert br"$'\n'" == enc_dsq(b'\n')
assert br"$'\x03'" == enc_dsq(b'\x03')
+def test_enc_dsqs():
+ def enc(s):
+ result = ["$'"]
+ for c in s:
+ result.append(_dsq_enc_byte(ord(c)).decode('ascii'))
+ result.append("'")
+ return ''.join(result)
+ for i in range(1, 128):
+ c = chr(i)
+ wvpasseq(enc(c), enc_dsqs(c))
+ v = 'foo' + c
+ wvpasseq(enc(v), enc_dsqs(v))
+ v = c + 'foo'
+ wvpasseq(enc(v), enc_dsqs(v))
+ v = 'foo' + c + 'bar'
+ wvpasseq(enc(v), enc_dsqs(v))
+
+ assert r"$'x'" == enc_dsqs('x')
+ assert r"$'\n'" == enc_dsqs('\n')
+ assert r"$'\x03'" == enc_dsqs('\x03')
+ assert r"$'\x80'" \
+ == enc_dsqs(b'\x80'.decode('ascii', errors='surrogateescape'))
+ assert r"$'\xb5'" \
+ == enc_dsqs(b'\xb5'.decode('utf-8', errors='surrogateescape'))
+
def test_enc_sh():
assert br"''" == enc_sh(b'')
assert br"'a|b'" == enc_sh(b'a|b')
assert br"$'\n'" == enc_sh(b'\n')
assert br"$'\''" == enc_sh(b"'")
assert br"$'\x00'" == enc_sh(b'\0')
+ assert br"$'\x7f'" == enc_sh(b'\x7f')
for needs_dsq in range(32):
assert enc_dsq(b'%c' % needs_dsq) == enc_sh(needs_dsq.to_bytes(1, 'big'))
for needs_sq in br'|&;<>()$`\" *?[]^!#~=%{,}':
assert b"'%c'" % needs_sq == enc_sh(needs_sq.to_bytes(1, 'big'))
+
+def test_enc_shs():
+ assert r"''" == enc_shs('')
+ assert r"'a|b'" == enc_shs('a|b')
+ assert r"$'\n'" == enc_shs('\n')
+ assert r"$'\''" == enc_shs("'")
+ assert r"$'\x00'" == enc_shs('\0')
+ assert r"$'\x7f'" == enc_shs('\x7f')
+ for needs_dsq in range(32):
+ assert enc_dsqs(chr(needs_dsq)) == enc_shs(chr(needs_dsq))
+ for needs_sq in r'|&;<>()$`\" *?[]^!#~=%{,}':
+ assert f"'{needs_sq}'" == enc_shs(needs_sq)
+ # Characters outside ascii are passed through.
+ assert 'büp' == enc_shs('büp')
+ # Undecodable bytes are \xNN escaped
+ assert r"$'\x80'" \
+ == enc_shs(b'\x80'.decode('ascii', errors='surrogateescape'))
+ assert r"$'\xb5'" \
+ == enc_shs(b'\xb5'.decode('utf-8', errors='surrogateescape'))
--
2.47.2