For rootless nesting of the uid_ns, we need to provide a list of sub
uids/gids which can be used in the sub namespace. As each id in the
sub namespace must be mappable into the parent namespace, we need to
compute the mapping based on the layout of the parent. The following
mapping works for rootless podman with --userns=keep-id.
Note that the default range (65k) does NOT allow mapping nobody/nogroup
in the sub-namespace, as this usually is id 65k-1 and we lose at least
one id per nesting. If we get a larger range, we just map as much as
we can and by that make nobody/nogroup usable.
With these changes we also add the uidmap and acl binaries to the
container, which are needed by isar to set up the namespaces and
permissions.
Signed-off-by: Felix Moessbauer <felix.mo...@siemens.com>
---
Dockerfile | 3 +-
container-entrypoint | 44 ++++++++++++++++++++
docs/userguide/kas-container-description.inc | 5 ++-
kas-container | 43 ++++++++++++++++++-
4 files changed, 90 insertions(+), 5 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index e798472c8..b3514fad8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -113,7 +113,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=${CACHE_SHARING} \
umoci skopeo \
python3-botocore \
bubblewrap \
- debootstrap && \
+ debootstrap \
+ uidmap acl && \
rm -f /etc/apt/apt.conf.d/use-snapshot.conf /etc/apt/apt.conf.d/keep-packages.conf && \
if [ -f "/etc/apt/sources.list.d/debian.sources~" ]; then \
mv -f /etc/apt/sources.list.d/debian.sources~ /etc/apt/sources.list.d/debian.sources; \
diff --git a/container-entrypoint b/container-entrypoint
index dee7275e8..da0c36d3a 100755
--- a/container-entrypoint
+++ b/container-entrypoint
@@ -34,6 +34,47 @@ enable_qemu_binfmts()
done
}
+# For rootless nesting of the uid_ns, we need to provide a list of sub
+# uids/gids which can be used in the sub namespace. As each id in the
+# sub namespace must be mappable into the parent namespace, we need to
+# compute the mapping based on the layout of the parent: Podman maps
+# to an intermediate namespace, hence giving us a user-id map we
+# actually can use for further splitting. Docker maps directly, hence
+# we need to compute the number of sub ids we can use (which have a
+# mapping in the parent ns). The result is written as the range of the
+# builder user to /etc/subuid and /etc/subgid. The map row with the highest
+# first column is picked via a numeric sort, independent of the locale.
+# Note that the default range (65k) does NOT allow mapping nobody/nogroup
+# in the sub-namespace, as this usually is id 65k-1 and we lose at least
+# one id per nesting. If we get a larger range, we just map as much as
+# we can and by that make nobody/nogroup usable. See man user_namespaces.
+setup_userns_mappings()
+{
+ UID_ROW=$(LC_ALL=C sort -rn /proc/self/uid_map | head -1)
+ UID_OUTER=$(printf '%s' "$UID_ROW" | awk '{print $1}')
+ UID_INNER=$(printf '%s' "$UID_ROW" | awk '{print $2}')
+ UID_COUNT=$(printf '%s' "$UID_ROW" | awk '{print $3}')
+ GID_ROW=$(LC_ALL=C sort -rn /proc/self/gid_map | head -1)
+ GID_OUTER=$(printf '%s' "$GID_ROW" | awk '{print $1}')
+ GID_INNER=$(printf '%s' "$GID_ROW" | awk '{print $2}')
+ GID_COUNT=$(printf '%s' "$GID_ROW" | awk '{print $3}')
+
+ # docker (direct mapping): sub ids start right after the id of builder
+ if [ "$UID_OUTER" = "1" ]; then
+ UID_INNER="$(($(id -u builder) + 1))"
+ [ "$UID_INNER" -lt "$UID_COUNT" ] && \
+ UID_COUNT="$((UID_COUNT - UID_INNER))"
+ fi
+ if [ "$GID_OUTER" = "1" ]; then
+ GID_INNER="$(($(id -g builder) + 1))"
+ [ "$GID_INNER" -lt "$GID_COUNT" ] && \
+ GID_COUNT="$((GID_COUNT - GID_INNER))"
+ fi
+
+ echo "builder:${UID_INNER}:${UID_COUNT}" | sudo tee /etc/subuid > /dev/null
+ echo "builder:${GID_INNER}:${GID_COUNT}" | sudo tee /etc/subgid > /dev/null
+}
+
# kas-isar: enable_qemu_binfmts
chown_managed_dirs()
@@ -103,6 +144,9 @@ else
GOSU="gosu builder"
fi
+# after all uid / gid changes are done, setup the namespace mappings
+# kas-isar: setup_userns_mappings
+
# kas-container on rootless docker workaround
if [ -n "$USER_ID" ] && [ "$USER_ID" -ne 0 ] && \
[ "$KAS_DOCKER_ROOTLESS" = "1" ] && [ "$(stat -c %u /repo)" -eq 0 ]; then
diff --git a/docs/userguide/kas-container-description.inc b/docs/userguide/kas-container-description.inc
index 7f350138a..776251124 100644
--- a/docs/userguide/kas-container-description.inc
+++ b/docs/userguide/kas-container-description.inc
@@ -39,5 +39,6 @@ written to from the host. To completely remove all data managed by kas, use
so they can be removed from the host.
.. note::
- The ISAR build system is not compatible with rootless execution. By that,
- we fall back to the system docker or podman instance.
+ The ISAR build system is compatible with rootless execution in ``isar-rootless``
+ mode only. The ``isar`` and ``isar-privileged`` modes fall back to the system docker
+ or podman instance.
diff --git a/kas-container b/kas-container
index 9afab2557..38f4992cc 100755
--- a/kas-container
+++ b/kas-container
@@ -68,6 +68,8 @@ usage()
printf "%b" "\nOptional arguments:\n"
printf "%b" "--isar\t\t\tUse kas-isar container to build Isar image. To force\n"
printf "%b" " \t\t\tthe use of run0 over sudo, set KAS_SUDO_CMD=run0.\n"
+ printf "%b" "--isar-privileged\tRun the isar build in privileged mode\n"
+ printf "%b" "--isar-rootless\t\tRun the isar build in rootless mode\n"
printf "%b" "--with-loop-dev Pass a loop device to the " \
"container. Only required if\n"
printf "%b" "\t\t\tloop-mounting is used by recipes.\n"
@@ -168,6 +170,33 @@ enable_isar_mode()
fi
}
+enable_isar_rootless_mode()
+{
+ if [ -n "${ISAR_ROOTLESS_MODE}" ]; then
+ return
+ fi
+ ISAR_ROOTLESS_MODE=1
+ KAS_CONTAINER_IMAGE_NAME_DEFAULT="kas-isar"
+
+ # --privileged passes the ambient capabilities into the container. From a
+ # user session (podman or docker-rootless) this is fundamentally different
+ # from running the system docker with --privileged.
+ if [ "${KAS_CONTAINER_ENGINE}" = "podman" ]; then
+ KAS_RUNTIME_ARGS="${KAS_RUNTIME_ARGS} --userns=keep-id --privileged"
+ elif [ "${KAS_DOCKER_ROOTLESS}" = "1" ]; then
+ KAS_ISAR_ARGS="--privileged"
+ else
+ # NOTE(review): presumably the system docker daemon. --privileged is not
+ # needed here, but SYS_ADMIN is required to be able to unshare.
+ KAS_ISAR_ARGS=" \
+ --security-opt seccomp=unconfined \
+ --security-opt apparmor=unconfined \
+ --security-opt systempaths=unconfined \
+ --cap-add=SYS_ADMIN \
+ "
+ fi
+}
+
enable_oe_mode()
{
if [ "${KAS_CONTAINER_ENGINE}" = "podman" ]; then
@@ -362,10 +391,14 @@ esac
# parse kas-container options
while [ $# -gt 0 ]; do
case "$1" in
- --isar)
+ --isar|--isar-privileged)
enable_isar_mode
shift 1
;;
+ --isar-rootless)
+ enable_isar_rootless_mode
+ shift 1
+ ;;
--with-loop-dev)
if ! KAS_LOOP_DEV=$(/sbin/losetup -f 2>/dev/null); then
if [ "$(id -u)" -eq 0 ]; then
@@ -579,8 +612,10 @@ else
sed 's/build_system:[ ]\+//')
fi
-if [ "${BUILD_SYSTEM}" = "isar" ]; then
+if [ "${BUILD_SYSTEM}" = "isar" ] || [ "${BUILD_SYSTEM}" = "isar-privileged" ]; then
enable_isar_mode
+elif [ "${BUILD_SYSTEM}" = "isar-rootless" ]; then
+ enable_isar_rootless_mode
-elif [ -z "${ISAR_MODE}" ]; then
+elif [ -z "${ISAR_MODE}" ] && [ -z "${ISAR_ROOTLESS_MODE}" ]; then
enable_oe_mode
fi
@@ -778,5 +813,9 @@ while [ $KAS_EXTRA_BITBAKE_ARGS -gt 0 ]; do
KAS_EXTRA_BITBAKE_ARGS=$((KAS_EXTRA_BITBAKE_ARGS - 1))
done
+if [ "${ISAR_MODE}" = "1" ] && [ "${ISAR_ROOTLESS_MODE}" = "1" ]; then
+ fatal_error "only one of --isar and --isar-rootless can be selected."
+fi
+
# shellcheck disable=SC2086
trace ${KAS_CONTAINER_COMMAND} run "$@"
--
2.51.0