Kirill Kolyshkin has uploaded this change for review.
syscall: add CgroupFD support for ForkExec on Linux
TODO: describe this, clean up tests.
Updates #51246.
Change-Id: Ifcd418ebead9257177338ffbcccd0bdecb94474e
---
M src/syscall/asm_linux_386.s
M src/syscall/asm_linux_amd64.s
M src/syscall/asm_linux_arm.s
M src/syscall/asm_linux_arm64.s
M src/syscall/asm_linux_loong64.s
M src/syscall/asm_linux_mips64x.s
M src/syscall/asm_linux_mipsx.s
M src/syscall/asm_linux_ppc64x.s
M src/syscall/asm_linux_riscv64.s
M src/syscall/asm_linux_s390x.s
M src/syscall/exec_linux.go
M src/syscall/exec_linux_test.go
M src/syscall/syscall_linux_386.go
M src/syscall/syscall_linux_amd64.go
M src/syscall/syscall_linux_arm.go
M src/syscall/syscall_linux_arm64.go
M src/syscall/syscall_linux_loong64.go
M src/syscall/syscall_linux_mips64x.go
M src/syscall/syscall_linux_mipsx.go
M src/syscall/syscall_linux_ppc64x.go
M src/syscall/syscall_linux_riscv64.go
M src/syscall/syscall_linux_s390x.go
22 files changed, 340 insertions(+), 106 deletions(-)
diff --git a/src/syscall/asm_linux_386.s b/src/syscall/asm_linux_386.s
index e86a859..a8e63f7 100644
--- a/src/syscall/asm_linux_386.s
+++ b/src/syscall/asm_linux_386.s
@@ -13,24 +13,24 @@
// instead of the glibc-specific "CALL 0x10(GS)".
#define INVOKE_SYSCALL INT $0x80
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-20
MOVL trap+0(FP), AX // syscall entry
MOVL a1+4(FP), BX
- MOVL $0, CX
+ MOVL a2+8(FP), CX
MOVL $0, DX
POPL SI // preserve return address
INVOKE_SYSCALL
PUSHL SI
CMPL AX, $0xfffff001
JLS ok
- MOVL $-1, r1+8(FP)
+ MOVL $-1, r1+12(FP)
NEGL AX
- MOVL AX, err+12(FP)
+ MOVL AX, err+16(FP)
RET
ok:
- MOVL AX, r1+8(FP)
- MOVL $0, err+12(FP)
+ MOVL AX, r1+12(FP)
+ MOVL $0, err+16(FP)
RET
// func rawSyscallNoError(trap uintptr, a1, a2, a3 uintptr) (r1, r2 uintptr);
diff --git a/src/syscall/asm_linux_amd64.s b/src/syscall/asm_linux_amd64.s
index 3206a45..00d6fed 100644
--- a/src/syscall/asm_linux_amd64.s
+++ b/src/syscall/asm_linux_amd64.s
@@ -11,10 +11,10 @@
#define SYS_gettimeofday 96
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
MOVQ a1+8(FP), DI
- MOVQ $0, SI
+ MOVQ a2+16(FP), SI
MOVQ $0, DX
MOVQ $0, R10
MOVQ $0, R8
@@ -25,13 +25,13 @@
PUSHQ R12
CMPQ AX, $0xfffffffffffff001
JLS ok2
- MOVQ $-1, r1+16(FP)
+ MOVQ $-1, r1+24(FP)
NEGQ AX
- MOVQ AX, err+24(FP)
+ MOVQ AX, err+32(FP)
RET
ok2:
- MOVQ AX, r1+16(FP)
- MOVQ $0, err+24(FP)
+ MOVQ AX, r1+24(FP)
+ MOVQ $0, err+32(FP)
RET
// func rawSyscallNoError(trap, a1, a2, a3 uintptr) (r1, r2 uintptr)
diff --git a/src/syscall/asm_linux_arm.s b/src/syscall/asm_linux_arm.s
index 3252220..d399541 100644
--- a/src/syscall/asm_linux_arm.s
+++ b/src/syscall/asm_linux_arm.s
@@ -41,25 +41,25 @@
BL runtime·exitsyscall(SB)
RET
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-20
MOVW trap+0(FP), R7 // syscall entry
MOVW a1+4(FP), R0
- MOVW $0, R1
+ MOVW a2+8(FP), R1
MOVW $0, R2
SWI $0
MOVW $0xfffff001, R1
CMP R1, R0
BLS ok
MOVW $-1, R1
- MOVW R1, r1+8(FP)
+ MOVW R1, r1+12(FP)
RSB $0, R0, R0
- MOVW R0, err+12(FP)
+ MOVW R0, err+16(FP)
RET
ok:
- MOVW R0, r1+8(FP)
+ MOVW R0, r1+12(FP)
MOVW $0, R0
- MOVW R0, err+12(FP)
+ MOVW R0, err+16(FP)
RET
// func rawSyscallNoError(trap uintptr, a1, a2, a3 uintptr) (r1, r2 uintptr);
diff --git a/src/syscall/asm_linux_arm64.s b/src/syscall/asm_linux_arm64.s
index be78ac8..7fa789a 100644
--- a/src/syscall/asm_linux_arm64.s
+++ b/src/syscall/asm_linux_arm64.s
@@ -4,10 +4,10 @@
#include "textflag.h"
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-32
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-40
MOVD a1+8(FP), R0
- MOVD $0, R1
+ MOVD a2+16(FP), R1
MOVD $0, R2
MOVD $0, R3
MOVD $0, R4
@@ -17,13 +17,13 @@
CMN $4095, R0
BCC ok
MOVD $-1, R4
- MOVD R4, r1+16(FP) // r1
+ MOVD R4, r1+24(FP) // r1
NEG R0, R0
- MOVD R0, err+24(FP) // errno
+ MOVD R0, err+32(FP) // errno
RET
ok:
- MOVD R0, r1+16(FP) // r1
- MOVD ZR, err+24(FP) // errno
+ MOVD R0, r1+24(FP) // r1
+ MOVD ZR, err+32(FP) // errno
RET
// func rawSyscallNoError(trap uintptr, a1, a2, a3 uintptr) (r1, r2 uintptr);
diff --git a/src/syscall/asm_linux_loong64.s b/src/syscall/asm_linux_loong64.s
index 7dc69c6..1a7457c 100644
--- a/src/syscall/asm_linux_loong64.s
+++ b/src/syscall/asm_linux_loong64.s
@@ -8,10 +8,10 @@
// System calls for loong64, Linux
//
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-32
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-40
MOVV a1+8(FP), R4
- MOVV $0, R5
+ MOVV a2+16(FP), R5
MOVV $0, R6
MOVV $0, R7
MOVV $0, R8
@@ -21,13 +21,13 @@
MOVW $-4096, R12
BGEU R12, R4, ok
MOVV $-1, R12
- MOVV R12, r1+16(FP) // r1
+ MOVV R12, r1+24(FP) // r1
SUBVU R4, R0, R4
- MOVV R4, err+24(FP) // errno
+ MOVV R4, err+32(FP) // errno
RET
ok:
- MOVV R4, r1+16(FP) // r1
- MOVV R0, err+24(FP) // errno
+ MOVV R4, r1+24(FP) // r1
+ MOVV R0, err+32(FP) // errno
RET
TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48
diff --git a/src/syscall/asm_linux_mips64x.s b/src/syscall/asm_linux_mips64x.s
index fadf193..ceafeb6 100644
--- a/src/syscall/asm_linux_mips64x.s
+++ b/src/syscall/asm_linux_mips64x.s
@@ -10,10 +10,10 @@
// System calls for mips64, Linux
//
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
MOVV a1+8(FP), R4
- MOVV R0, R5
+ MOVV a2+16(FP), R5
MOVV R0, R6
MOVV R0, R7
MOVV R0, R8
@@ -22,12 +22,12 @@
SYSCALL
BEQ R7, ok
MOVV $-1, R1
- MOVV R1, r1+16(FP) // r1
- MOVV R2, err+24(FP) // errno
+ MOVV R1, r1+24(FP) // r1
+ MOVV R2, err+32(FP) // errno
RET
ok:
- MOVV R2, r1+16(FP) // r1
- MOVV R0, err+24(FP) // errno
+ MOVV R2, r1+24(FP) // r1
+ MOVV R0, err+32(FP) // errno
RET
TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48
diff --git a/src/syscall/asm_linux_mipsx.s b/src/syscall/asm_linux_mipsx.s
index b8cae96..3e5e8b1 100644
--- a/src/syscall/asm_linux_mipsx.s
+++ b/src/syscall/asm_linux_mipsx.s
@@ -44,21 +44,21 @@
JAL runtime·exitsyscall(SB)
RET
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-20
MOVW a1+4(FP), R4
- MOVW R0, R5
+ MOVW a2+8(FP), R5
MOVW R0, R6
MOVW trap+0(FP), R2 // syscall entry
SYSCALL
BEQ R7, ok
MOVW $-1, R1
- MOVW R1, r1+8(FP) // r1
- MOVW R2, err+12(FP) // errno
+ MOVW R1, r1+12(FP) // r1
+ MOVW R2, err+16(FP) // errno
RET
ok:
- MOVW R2, r1+8(FP) // r1
- MOVW R0, err+12(FP) // errno
+ MOVW R2, r1+12(FP) // r1
+ MOVW R0, err+16(FP) // errno
RET
TEXT ·rawSyscallNoError(SB),NOSPLIT,$20-24
diff --git a/src/syscall/asm_linux_ppc64x.s b/src/syscall/asm_linux_ppc64x.s
index 89cc1c2..b9412fe 100644
--- a/src/syscall/asm_linux_ppc64x.s
+++ b/src/syscall/asm_linux_ppc64x.s
@@ -10,10 +10,10 @@
// System calls for ppc64, Linux
//
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
MOVD a1+8(FP), R3
- MOVD R0, R4
+ MOVD a2+16(FP), R4
MOVD R0, R5
MOVD R0, R6
MOVD R0, R7
@@ -22,12 +22,12 @@
SYSCALL R9
BVC ok
MOVD $-1, R4
- MOVD R4, r1+16(FP) // r1
- MOVD R3, err+24(FP) // errno
+ MOVD R4, r1+24(FP) // r1
+ MOVD R3, err+32(FP) // errno
RET
ok:
- MOVD R3, r1+16(FP) // r1
- MOVD R0, err+24(FP) // errno
+ MOVD R3, r1+24(FP) // r1
+ MOVD R0, err+32(FP) // errno
RET
TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48
diff --git a/src/syscall/asm_linux_riscv64.s b/src/syscall/asm_linux_riscv64.s
index 0fc1f73..6fd09ec 100644
--- a/src/syscall/asm_linux_riscv64.s
+++ b/src/syscall/asm_linux_riscv64.s
@@ -8,10 +8,10 @@
// System calls for riscv64, Linux
//
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
MOV a1+8(FP), A0
- MOV ZERO, A1
+ MOV a2+16(FP), A1
MOV ZERO, A2
MOV ZERO, A3
MOV ZERO, A4
@@ -20,14 +20,14 @@
ECALL
MOV $-4096, T0
BLTU T0, A0, err
- MOV A0, r1+16(FP) // r1
- MOV ZERO, err+24(FP) // errno
+ MOV A0, r1+24(FP) // r1
+ MOV ZERO, err+32(FP) // errno
RET
err:
MOV $-1, T0
- MOV T0, r1+16(FP) // r1
+ MOV T0, r1+24(FP) // r1
SUB A0, ZERO, A0
- MOV A0, err+24(FP) // errno
+ MOV A0, err+32(FP) // errno
RET
TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48
diff --git a/src/syscall/asm_linux_s390x.s b/src/syscall/asm_linux_s390x.s
index c3631c1..db8f9a6 100644
--- a/src/syscall/asm_linux_s390x.s
+++ b/src/syscall/asm_linux_s390x.s
@@ -8,11 +8,11 @@
// System calls for s390x, Linux
//
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
MOVD $0, R2
MOVD a1+8(FP), R3
- MOVD $0, R4
+ MOVD a2+16(FP), R4
MOVD $0, R5
MOVD $0, R6
MOVD $0, R7
@@ -20,13 +20,13 @@
SYSCALL
MOVD $0xfffffffffffff001, R8
CMPUBLT R2, R8, ok2
- MOVD $-1, r1+16(FP)
+ MOVD $-1, r1+24(FP)
NEG R2, R2
- MOVD R2, err+24(FP) // errno
+ MOVD R2, err+32(FP) // errno
RET
ok2:
- MOVD R2, r1+16(FP)
- MOVD $0, err+24(FP) // errno
+ MOVD R2, r1+24(FP)
+ MOVD $0, err+32(FP) // errno
RET
// func rawSyscallNoError(trap, a1, a2, a3 uintptr) (r1, r2 uintptr)
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go
index 43903ee..7165c75 100644
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@@ -99,6 +99,8 @@
// users this should be set to false for mappings work.
GidMappingsEnableSetgroups bool
AmbientCaps []uintptr // Ambient capabilities (Linux only)
+ UseCgroupFD bool // Whether to make use of CgroupFD field.
+ CgroupFD int // File descriptor of a cgroup to put the new process into.
}
var (
@@ -176,6 +178,21 @@
// See CAP_TO_MASK in linux/capability.h:
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
+// cloneArgs holds arguments for clone3 Linux syscall.
+type cloneArgs struct {
+ flags uint64 // Flags bit mask
+ pidFD uint64 // Where to store PID file descriptor (int *)
+ childTID uint64 // Where to store child TID, in child's memory (pid_t *)
+ parentPID uint64 // Where to store child TID, in parent's memory (pid_t *)
+ exitSignal uint64 // Signal to deliver to parent on child termination
+ stack uint64 // Pointer to lowest byte of stack
+ stackSize uint64 // Size of stack
+ tls uint64 // Location of new TLS
+ setTID uint64 // Pointer to a pid_t array (since Linux 5.5)
+ setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
+ cgroup uint64 // File descriptor for target cgroup of child (since Linux 5.7)
+}
+
// forkAndExecInChild1 implements the body of forkAndExecInChild up to
// the parent's post-fork path. This is a separate function so we can
// separate the child's and parent's stack frames if we're using
@@ -256,13 +273,24 @@
// No more allocation or calls of non-assembly functions.
runtime_BeforeFork()
locked = true
+ flags := sys.Cloneflags
+ if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
+ flags |= CLONE_VFORK | CLONE_VM
+ }
switch {
- case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
- r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
+ case sys.UseCgroupFD:
+ args := cloneArgs{
+ flags: uint64(flags | CLONE_INTO_CGROUP),
+ exitSignal: uint64(SIGCHLD),
+ cgroup: uint64(sys.CgroupFD),
+ }
+ r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(&args)), unsafe.Sizeof(args))
+ case flags&CLONE_VFORK != 0:
+ r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD)|flags, 0)
case runtime.GOARCH == "s390x":
- r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
+ r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|flags, 0, 0, 0, 0)
default:
- r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
+ r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|flags, 0, 0, 0, 0, 0)
}
if err1 != 0 || r1 != 0 {
// If we're in the parent, we must return immediately
diff --git a/src/syscall/exec_linux_test.go b/src/syscall/exec_linux_test.go
index 8a9258d..bc40e4f 100644
--- a/src/syscall/exec_linux_test.go
+++ b/src/syscall/exec_linux_test.go
@@ -7,6 +7,7 @@
package syscall_test
import (
+ "bytes"
"flag"
"fmt"
"internal/testenv"
@@ -14,6 +15,7 @@
"os"
"os/exec"
"os/user"
+ "path"
"path/filepath"
"runtime"
"strconv"
@@ -116,8 +118,12 @@
}
}
-func whoamiCmd(t *testing.T, uid, gid int, setgroups bool) *exec.Cmd {
+func whoamiCmd(t *testing.T, uid, gid int, setgroups, cgroup bool) *exec.Cmd {
checkUserNS(t)
+ cgroupFD := 0
+ if cgroup {
+ cgroupFD, _ = prepareCgroupFD(t)
+ }
cmd := exec.Command("whoami")
cmd.SysProcAttr = &syscall.SysProcAttr{
Cloneflags: syscall.CLONE_NEWUSER,
@@ -128,12 +134,15 @@
{ContainerID: 0, HostID: gid, Size: 1},
},
GidMappingsEnableSetgroups: setgroups,
+
+ UseCgroupFD: cgroup,
+ CgroupFD: cgroupFD,
}
return cmd
}
-func testNEWUSERRemap(t *testing.T, uid, gid int, setgroups bool) {
- cmd := whoamiCmd(t, uid, gid, setgroups)
+func testNEWUSERRemap(t *testing.T, uid, gid int, setgroups, cgroup bool) {
+ cmd := whoamiCmd(t, uid, gid, setgroups, cgroup)
out, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("Cmd failed with err %v, output: %s", err, out)
@@ -149,28 +158,49 @@
if os.Getuid() != 0 {
t.Skip("skipping root only test")
}
- testNEWUSERRemap(t, 0, 0, false)
+ testNEWUSERRemap(t, 0, 0, false, false)
+}
+
+func TestCloneNEWUSERAndRemapRootDisableSetgroupsCgroupFD(t *testing.T) {
+ if os.Getuid() != 0 {
+ t.Skip("skipping root only test")
+ }
+ testNEWUSERRemap(t, 0, 0, false, true)
}
func TestCloneNEWUSERAndRemapRootEnableSetgroups(t *testing.T) {
if os.Getuid() != 0 {
t.Skip("skipping root only test")
}
- testNEWUSERRemap(t, 0, 0, true)
+ testNEWUSERRemap(t, 0, 0, true, false)
+}
+
+func TestCloneNEWUSERAndRemapRootEnableSetgroupsCgroupFD(t *testing.T) {
+ if os.Getuid() != 0 {
+ t.Skip("skipping root only test")
+ }
+ testNEWUSERRemap(t, 0, 0, true, true)
}
func TestCloneNEWUSERAndRemapNoRootDisableSetgroups(t *testing.T) {
if os.Getuid() == 0 {
t.Skip("skipping unprivileged user only test")
}
- testNEWUSERRemap(t, os.Getuid(), os.Getgid(), false)
+ testNEWUSERRemap(t, os.Getuid(), os.Getgid(), false, false)
+}
+
+func TestCloneNEWUSERAndRemapNoRootDisableSetgroupsCgroupFD(t *testing.T) {
+ if os.Getuid() == 0 {
+ t.Skip("skipping unprivileged user only test")
+ }
+ testNEWUSERRemap(t, os.Getuid(), os.Getgid(), false, true)
}
func TestCloneNEWUSERAndRemapNoRootSetgroupsEnableSetgroups(t *testing.T) {
if os.Getuid() == 0 {
t.Skip("skipping unprivileged user only test")
}
- cmd := whoamiCmd(t, os.Getuid(), os.Getgid(), true)
+ cmd := whoamiCmd(t, os.Getuid(), os.Getgid(), true, false)
err := cmd.Run()
if err == nil {
t.Skip("probably old kernel without security fix")
@@ -181,7 +211,15 @@
}
func TestEmptyCredGroupsDisableSetgroups(t *testing.T) {
- cmd := whoamiCmd(t, os.Getuid(), os.Getgid(), false)
+ cmd := whoamiCmd(t, os.Getuid(), os.Getgid(), false, false)
+ cmd.SysProcAttr.Credential = &syscall.Credential{}
+ if err := cmd.Run(); err != nil {
+ t.Fatal(err)
+ }
+}
+
+func TestEmptyCredGroupsDisableSetgroupsCgroupFD(t *testing.T) {
+ cmd := whoamiCmd(t, os.Getuid(), os.Getgid(), false, true)
cmd.SysProcAttr.Credential = &syscall.Credential{}
if err := cmd.Run(); err != nil {
t.Fatal(err)
@@ -429,8 +467,17 @@
}
}
-// Test for Issue 29789: unshare fails when uid/gid mapping is specified
+// Test for Issue 29789: unshare fails when uid/gid mapping is specified.
func TestUnshareUidGidMapping(t *testing.T) {
+ testUnshareUidGidMapping(t, false)
+}
+
+// Same as TestUnshareUidGidMapping, but with CgroupFD enabled.
+func TestUnshareUidGidMappingAndCgroupFD(t *testing.T) {
+ testUnshareUidGidMapping(t, true)
+}
+
+func testUnshareUidGidMapping(t *testing.T, cgroup bool) {
if os.Getuid() == 0 {
t.Skip("test exercises unprivileged user namespace, fails with privileges")
}
@@ -455,12 +502,108 @@
},
},
}
+ if cgroup {
+ fd, _ := prepareCgroupFD(t)
+ cmd.SysProcAttr.CgroupFD = fd
+ cmd.SysProcAttr.UseCgroupFD = true
+ }
+
out, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("Cmd failed with err %v, output: %s", err, out)
}
}
+func prepareCgroupFD(t *testing.T) (int, string) {
+ t.Helper()
+
+ const O_PATH = 0x200000 // Same for all architectures, but for some reason not defined in syscall for 386||amd64.
+
+ // Requires cgroup v2.
+ const prefix = "/sys/fs/cgroup"
+ selfCg, err := os.ReadFile("/proc/self/cgroup")
+ if err != nil {
+ if os.IsNotExist(err) || os.IsPermission(err) {
+ t.Skip(err)
+ }
+ t.Fatal(err)
+ }
+
+ // Expect a single line like this:
+ // 0::/user.slice/user-1000.slice/user@1000.service/app.slice/vte-spawn-891992a2-efbb-4f28-aedb-b24f9e706770.scope
+ // Otherwise it's either cgroup v1 or a hybrid hierarchy.
+ if bytes.Count(selfCg, []byte("\n")) > 1 {
+ t.Skip("cgroup v2 not available")
+ }
+ cg := bytes.TrimPrefix(selfCg, []byte("0::"))
+ if len(cg) == len(selfCg) { // No prefix found.
+ t.Skipf("cgroup v2 not available (/proc/self/cgroup contents: %q)", selfCg)
+ }
+
+ // Need clone3 with CLONE_INTO_CGROUP support.
+ _, err = syscall.ForkExec("!ex", []string{}, &syscall.ProcAttr{
+ Sys: &syscall.SysProcAttr{
+ UseCgroupFD: true,
+ CgroupFD: -1,
+ },
+ })
+ // EPERM can be returned if clone3 is not enabled by seccomp.
+ if err == syscall.ENOSYS || err == syscall.EPERM {
+ t.Skipf("clone3 with CLONE_INTO_CGROUP not available: %v", err)
+ }
+
+ // Need the ability to create a sub-cgroup.
+ subCgroup, err := os.MkdirTemp(prefix+string(bytes.TrimSpace(cg)), "subcg-")
+ if err != nil {
+ if os.IsPermission(err) {
+ t.Skip(err)
+ }
+ t.Fatal(err)
+ }
+ t.Cleanup(func() { syscall.Rmdir(subCgroup) })
+
+ cgroupFD, err := syscall.Open(subCgroup, O_PATH, 0)
+ if err != nil {
+ t.Fatal(&os.PathError{Op: "open", Path: subCgroup, Err: err})
+ }
+ t.Cleanup(func() { syscall.Close(cgroupFD) })
+
+ return cgroupFD, "/" + path.Base(subCgroup)
+}
+
+func TestUseCgroupFD(t *testing.T) {
+ fd, suffix := prepareCgroupFD(t)
+
+ cmd := exec.Command(os.Args[0], "-test.run=TestUseCgroupFDHelper")
+ cmd.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1")
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ UseCgroupFD: true,
+ CgroupFD: fd,
+ }
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ t.Fatalf("Cmd failed with err %v, output: %s", err, out)
+ }
+ // NB: this wouldn't work with cgroupns.
+ if !bytes.HasSuffix(bytes.TrimSpace(out), []byte(suffix)) {
+ t.Fatalf("got: %q, want: a line that ends with %q", out, suffix)
+ }
+}
+
+func TestUseCgroupFDHelper(*testing.T) {
+ if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
+ return
+ }
+ defer os.Exit(0)
+ // Read and print own cgroup path.
+ selfCg, err := os.ReadFile("/proc/self/cgroup")
+ if err != nil {
+ fmt.Fprintln(os.Stderr, err)
+ os.Exit(2)
+ }
+ fmt.Print(string(selfCg))
+}
+
type capHeader struct {
version uint32
pid int32
@@ -545,15 +688,30 @@
t.Skip("kernel prohibits unshare in unprivileged process, unless using user namespace")
}
- testAmbientCaps(t, false)
+ testAmbientCaps(t, false, false)
+}
+
+func TestAmbientCapsCgroupFD(t *testing.T) {
+ // Make sure we are running as root so we have permissions to use unshare
+ // and create a network namespace.
+ if os.Getuid() != 0 {
+ t.Skip("kernel prohibits unshare in unprivileged process, unless using user namespace")
+ }
+
+ testAmbientCaps(t, false, true)
}
func TestAmbientCapsUserns(t *testing.T) {
checkUserNS(t)
- testAmbientCaps(t, true)
+ testAmbientCaps(t, true, false)
}
-func testAmbientCaps(t *testing.T, userns bool) {
+func TestAmbientCapsUsernsCgroupFD(t *testing.T) {
+ checkUserNS(t)
+ testAmbientCaps(t, true, true)
+}
+
+func testAmbientCaps(t *testing.T, userns, cgroup bool) {
skipInContainer(t)
mustSupportAmbientCaps(t)
@@ -610,6 +768,11 @@
},
AmbientCaps: []uintptr{CAP_SYS_TIME, CAP_SYSLOG},
}
+ if cgroup {
+ fd, _ := prepareCgroupFD(t)
+ cmd.SysProcAttr.UseCgroupFD = true
+ cmd.SysProcAttr.CgroupFD = fd
+ }
if userns {
cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWUSER
const nobody = 65534
diff --git a/src/syscall/syscall_linux_386.go b/src/syscall/syscall_linux_386.go
index fc7df84..eade035 100644
--- a/src/syscall/syscall_linux_386.go
+++ b/src/syscall/syscall_linux_386.go
@@ -6,7 +6,10 @@
import "unsafe"
-const _SYS_setgroups = SYS_SETGROUPS32
+const (
+ _SYS_setgroups = SYS_SETGROUPS32
+ _SYS_clone3 = 435
+)
func setTimespec(sec, nsec int64) Timespec {
return Timespec{Sec: int32(sec), Nsec: int32(nsec)}
@@ -347,4 +350,4 @@
cmsg.Len = uint32(length)
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_amd64.go b/src/syscall/syscall_linux_amd64.go
index 0bcc664..4b32455 100644
--- a/src/syscall/syscall_linux_amd64.go
+++ b/src/syscall/syscall_linux_amd64.go
@@ -4,7 +4,10 @@
package syscall
-const _SYS_setgroups = SYS_SETGROUPS
+const (
+ _SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
+)
//sys Dup2(oldfd int, newfd int) (err error)
//sysnb EpollCreate(size int) (fd int, err error)
@@ -119,4 +122,4 @@
cmsg.Len = uint64(length)
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_arm.go b/src/syscall/syscall_linux_arm.go
index 9db7027..40225c4 100644
--- a/src/syscall/syscall_linux_arm.go
+++ b/src/syscall/syscall_linux_arm.go
@@ -6,7 +6,10 @@
import "unsafe"
-const _SYS_setgroups = SYS_SETGROUPS32
+const (
+ _SYS_setgroups = SYS_SETGROUPS32
+ _SYS_clone3 = 435
+)
func setTimespec(sec, nsec int64) Timespec {
return Timespec{Sec: int32(sec), Nsec: int32(nsec)}
@@ -199,4 +202,4 @@
cmsg.Len = uint32(length)
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_arm64.go b/src/syscall/syscall_linux_arm64.go
index ef935f3..d1be21d 100644
--- a/src/syscall/syscall_linux_arm64.go
+++ b/src/syscall/syscall_linux_arm64.go
@@ -6,7 +6,10 @@
import "unsafe"
-const _SYS_setgroups = SYS_SETGROUPS
+const (
+ _SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
+)
func EpollCreate(size int) (fd int, err error) {
if size <= 0 {
@@ -183,4 +186,4 @@
return err
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_loong64.go b/src/syscall/syscall_linux_loong64.go
index 99674b4..1ae43c3 100644
--- a/src/syscall/syscall_linux_loong64.go
+++ b/src/syscall/syscall_linux_loong64.go
@@ -6,7 +6,10 @@
import "unsafe"
-const _SYS_setgroups = SYS_SETGROUPS
+const (
+ _SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
+)
func EpollCreate(size int) (fd int, err error) {
if size <= 0 {
@@ -222,4 +225,4 @@
return err
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_mips64x.go b/src/syscall/syscall_linux_mips64x.go
index 258eb97..6a96774 100644
--- a/src/syscall/syscall_linux_mips64x.go
+++ b/src/syscall/syscall_linux_mips64x.go
@@ -6,7 +6,10 @@
package syscall
-const _SYS_setgroups = SYS_SETGROUPS
+const (
+ _SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 5435
+)
//sys Dup2(oldfd int, newfd int) (err error)
//sysnb EpollCreate(size int) (fd int, err error)
@@ -181,4 +184,4 @@
cmsg.Len = uint64(length)
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_mipsx.go b/src/syscall/syscall_linux_mipsx.go
index 5390277..2a88dfc 100644
--- a/src/syscall/syscall_linux_mipsx.go
+++ b/src/syscall/syscall_linux_mipsx.go
@@ -8,7 +8,10 @@
import "unsafe"
-const _SYS_setgroups = SYS_SETGROUPS
+const (
+ _SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 4435
+)
func Syscall9(trap, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err Errno)
@@ -192,4 +195,4 @@
cmsg.Len = uint32(length)
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_ppc64x.go b/src/syscall/syscall_linux_ppc64x.go
index 88ad8e4..2c706e1 100644
--- a/src/syscall/syscall_linux_ppc64x.go
+++ b/src/syscall/syscall_linux_ppc64x.go
@@ -6,7 +6,10 @@
package syscall
-const _SYS_setgroups = SYS_SETGROUPS
+const (
+ _SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
+)
//sys Dup2(oldfd int, newfd int) (err error)
//sysnb EpollCreate(size int) (fd int, err error)
@@ -89,7 +92,7 @@
cmsg.Len = uint64(length)
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
//sys syncFileRange2(fd int, flags int, off int64, n int64) (err error) = SYS_SYNC_FILE_RANGE2
diff --git a/src/syscall/syscall_linux_riscv64.go b/src/syscall/syscall_linux_riscv64.go
index 0ac4c54..cf9a751 100644
--- a/src/syscall/syscall_linux_riscv64.go
+++ b/src/syscall/syscall_linux_riscv64.go
@@ -6,7 +6,10 @@
import "unsafe"
-const _SYS_setgroups = SYS_SETGROUPS
+const (
+ _SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
+)
func EpollCreate(size int) (fd int, err error) {
if size <= 0 {
@@ -169,4 +172,4 @@
return err
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_s390x.go b/src/syscall/syscall_linux_s390x.go
index 46b252d..a28993b 100644
--- a/src/syscall/syscall_linux_s390x.go
+++ b/src/syscall/syscall_linux_s390x.go
@@ -6,7 +6,10 @@
import "unsafe"
-const _SYS_setgroups = SYS_SETGROUPS
+const (
+ _SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
+)
//sys Dup2(oldfd int, newfd int) (err error)
//sysnb EpollCreate(size int) (fd int, err error)
@@ -256,4 +259,4 @@
cmsg.Len = uint64(length)
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Tobias Klauser.
1 comment:
Patchset:
Can someone please run the try-bots (AFAIK I can't do it myself)? I'm not entirely sure about s390: it has weird syscall semantics (the first argument goes to R3, but the Syscall implementation says R2), so I would like the trybots to run these tests (this is also the reason I added so many test cases -- I will minimize them later).
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Kirill Kolyshkin, Tobias Klauser.
Patch set 1:Run-TryBot +1
1 comment:
Patchset:
TRY=s390x
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Kirill Kolyshkin, Tobias Klauser.
Kirill Kolyshkin uploaded patch set #2 to this change.
The following approvals got outdated and were removed: Run-TryBot+1 by Ian Lance Taylor, TryBot-Result-1 by Gopher Robot
syscall: add CgroupFD support for ForkExec on Linux
TODO: describe this, clean up tests.
Updates #51246.
Change-Id: Ifcd418ebead9257177338ffbcccd0bdecb94474e
---
M api/next/51246.txt
M src/syscall/asm_linux_386.s
M src/syscall/asm_linux_amd64.s
M src/syscall/asm_linux_arm.s
M src/syscall/asm_linux_arm64.s
M src/syscall/asm_linux_loong64.s
M src/syscall/asm_linux_mips64x.s
M src/syscall/asm_linux_mipsx.s
M src/syscall/asm_linux_ppc64x.s
M src/syscall/asm_linux_riscv64.s
M src/syscall/asm_linux_s390x.s
M src/syscall/exec_linux.go
M src/syscall/exec_linux_test.go
M src/syscall/syscall_linux_386.go
M src/syscall/syscall_linux_amd64.go
M src/syscall/syscall_linux_arm.go
M src/syscall/syscall_linux_arm64.go
M src/syscall/syscall_linux_loong64.go
M src/syscall/syscall_linux_mips64x.go
M src/syscall/syscall_linux_mipsx.go
M src/syscall/syscall_linux_ppc64x.go
M src/syscall/syscall_linux_riscv64.go
M src/syscall/syscall_linux_s390x.go
23 files changed, 352 insertions(+), 106 deletions(-)
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Tobias Klauser.
1 comment:
Patchset:
10 of 30 SlowBots failed. […]
I guess I fixed all these failures, need to run trybots again.
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Kirill Kolyshkin, Tobias Klauser.
Patch set 2:Run-TryBot +1
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Kirill Kolyshkin, Tobias Klauser.
Kirill Kolyshkin uploaded patch set #3 to this change.
The following approvals got outdated and were removed: Run-TryBot+1 by Ian Lance Taylor, TryBot-Result+1 by Gopher Robot
syscall: add CgroupFD support for ForkExec on Linux
Implement CLONE_INTO_CGROUP feature, allowing to put a child in a
specified cgroup in a clean and simple way. Note that the feature only
works for cgroup v2, and requires Linux kernel 5.7 or newer.
Add some tests for the new feature.
While at it, simplify the syscall calling code in forkAndExecInChild1,
which became complicated over time because:
1. It was using either rawVforkSyscall or RawSyscall6 depending on
whether CLONE_NEWUSER was set.
2. On Linux/s390, the first two arguments to clone(2) system call are
swapped (which deserved a mention in Linux ABI hall of shame). It
was worked around in rawVforkSyscall on s390, but had to be
implemented via a switch/case when using RawSyscall6.
To untangle this,
- modify rawVforkSyscall to have two arguments;
- remove the arguments workaround from s390 asm, instead implementing
arguments swap in the caller;
- use rawVforkSyscall in all cases (since it's a subset of RawSyscall6,
except for saving/restoring the return address before/after syscall
on some platforms, which doesn't change anything).
TODO: clean up tests.
23 files changed, 386 insertions(+), 112 deletions(-)
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Tobias Klauser.
Patch set 5:Run-TryBot +1
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Kirill Kolyshkin, Tobias Klauser.
Kirill Kolyshkin uploaded patch set #6 to this change.
The following approvals got outdated and were removed: Run-TryBot+1 by Kirill Kolyshkin, TryBot-Result+1 by Gopher Robot
syscall: add CgroupFD support for ForkExec on Linux
Implement CLONE_INTO_CGROUP feature, allowing to put a child in a
specified cgroup in a clean and simple way. Note that the feature only
works for cgroup v2, and requires Linux kernel 5.7 or newer.
Add some tests for the new feature.
While at it, simplify the syscall calling code in forkAndExecInChild1,
which became complicated over time because:
1. It was using either rawVforkSyscall or RawSyscall6 depending on
whether CLONE_NEWUSER was set.
2. On Linux/s390, the first two arguments to clone(2) system call are
swapped (which deserved a mention in Linux ABI hall of shame). It
was worked around in rawVforkSyscall on s390, but had to be
implemented via a switch/case when using RawSyscall6.
To untangle this,
- modify rawVforkSyscall to have two arguments;
- remove the arguments workaround from s390 asm, instead implementing
arguments swap in the caller;
- use rawVforkSyscall in all cases (since it's a subset of RawSyscall6,
except for saving/restoring the return address before/after syscall
on some platforms, which doesn't change anything).
23 files changed, 271 insertions(+), 90 deletions(-)
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Tobias Klauser.
Patch set 6:Run-TryBot +1
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Kirill Kolyshkin, Tobias Klauser.
5 comments:
File api/next/51246.txt:
Patch Set #6, Line 61: pkg syscall (linux-386), type SysProcAttr struct, CgroupFD int #51246
Let's sort these lines so that all of the linux-386 changes are together, and similarly for all targets.
File src/syscall/exec_linux.go:
Patch Set #6, Line 186: parentPID uint64 // Where to store child TID, in parent's memory (pid_t *)
Seems like this should be parentTID. The C name is parent_tid.
Patch Set #6, Line 278: a1 = uintptr(unsafe.Pointer(&cloneArgs{
This is not safe. The conversion from unsafe.Pointer to uintptr must occur in the system call itself. See https://pkg.go.dev/unsafe#Pointer.
File src/syscall/exec_linux_test.go:
Patch Set #6, Line 493: _, err = syscall.ForkExec("!ex", []string{}, &syscall.ProcAttr{
I don't understand the use of "!ex" here.
Pass the arguments as nil, not []string{}.
Patch Set #6, Line 505: subCgroup, err := os.MkdirTemp(prefix+string(bytes.TrimSpace(cg)), "subcg-")
Can you use t.TempDir rather than os.MkdirTemp?
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Kirill Kolyshkin, Tobias Klauser.
Kirill Kolyshkin uploaded patch set #7 to this change.
The following approvals got outdated and were removed: Run-TryBot+1 by Kirill Kolyshkin, TryBot-Result+1 by Gopher Robot
syscall: add CgroupFD support for ForkExec on Linux
Implement CLONE_INTO_CGROUP feature, allowing to put a child in a
specified cgroup in a clean and simple way. Note that the feature only
works for cgroup v2, and requires Linux kernel 5.7 or newer.
Using the feature requires a new syscall, clone3. Currently this is the
only reason to use clone3, but the code is structured in a way so that
other cases may be easily added in the future.
Add a test case.
While at it, try to simplify the syscall calling code in
forkAndExecInChild1, which became complicated over time because:
1. It was using either rawVforkSyscall or RawSyscall6 depending on
whether CLONE_NEWUSER was set.
2. On Linux/s390, the first two arguments to clone(2) system call are
swapped (which deserved a mention in Linux ABI hall of shame). It
was worked around in rawVforkSyscall on s390, but had to be
implemented via a switch/case when using RawSyscall6, making the code
less clear.
Let's
- modify rawVforkSyscall to have two arguments (which is also required
for clone3);
- remove the arguments workaround from s390 asm, instead implementing
arguments swap in the caller (which still looks ugly but at least
it's done once and is clearly documented now);
- use rawVforkSyscall for all cases (since it is essentially similar to
RawSyscall6, except for having fewer parameters, not returning r2, and
saving/restoring the return address before/after syscall on 386 and
amd64).
Updates #51246.
Change-Id: Ifcd418ebead9257177338ffbcccd0bdecb94474e
---
M api/next/51246.txt
M src/syscall/asm_linux_386.s
M src/syscall/asm_linux_amd64.s
M src/syscall/asm_linux_arm.s
M src/syscall/asm_linux_arm64.s
M src/syscall/asm_linux_loong64.s
M src/syscall/asm_linux_mips64x.s
M src/syscall/asm_linux_mipsx.s
M src/syscall/asm_linux_ppc64x.s
M src/syscall/asm_linux_riscv64.s
M src/syscall/asm_linux_s390x.s
M src/syscall/exec_linux.go
M src/syscall/exec_linux_test.go
M src/syscall/syscall_linux.go
M src/syscall/syscall_linux_386.go
M src/syscall/syscall_linux_amd64.go
M src/syscall/syscall_linux_arm.go
M src/syscall/syscall_linux_arm64.go
M src/syscall/syscall_linux_loong64.go
M src/syscall/syscall_linux_mips64x.go
M src/syscall/syscall_linux_mipsx.go
M src/syscall/syscall_linux_ppc64x.go
M src/syscall/syscall_linux_riscv64.go
M src/syscall/syscall_linux_s390x.go
24 files changed, 275 insertions(+), 99 deletions(-)
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Ian Lance Taylor, Joedian Reid, Tobias Klauser.
5 comments:
File api/next/51246.txt:
Patch Set #6, Line 61: pkg syscall (linux-386), type SysProcAttr struct, CgroupFD int #51246
Let's sort these lines so that all of the linux-386 changes are together, and similarly for all targ […]
Done
File src/syscall/exec_linux.go:
Patch Set #6, Line 186: parentPID uint64 // Where to store child TID, in parent's memory (pid_t *)
Seems like this should be parentTID. The C name is parent_tid.
Thank you for spotting this; fixed.
Patch Set #6, Line 278: a1 = uintptr(unsafe.Pointer(&cloneArgs{
This is not safe. The conversion from unsafe. […]
Right. I fixed this; as a result, the code is now slightly less elegant than it was (OTOH it uses fewer variables).
File src/syscall/exec_linux_test.go:
Patch Set #6, Line 493: _, err = syscall.ForkExec("!ex", []string{}, &syscall.ProcAttr{
I don't understand the use of "!ex" here. […]
This is an (untasteful) abbreviation for "non-existent binary". Will fix.
Patch Set #6, Line 505: subCgroup, err := os.MkdirTemp(prefix+string(bytes.TrimSpace(cg)), "subcg-")
Can you use t.TempDir rather than os. […]
I can't; the directory created should have a specific parent, and there is no way to specify it when using `t.TempDir`.
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Ian Lance Taylor, Joedian Reid, Tobias Klauser.
Patch set 7:Run-TryBot +1
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Ian Lance Taylor, Joedian Reid, Kirill Kolyshkin, Tobias Klauser.
Kirill Kolyshkin uploaded patch set #9 to this change.
syscall: add CgroupFD support for ForkExec on Linux
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Ian Lance Taylor, Joedian Reid, Tobias Klauser.
Patch set 9:Run-TryBot +1
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Ian Lance Taylor, Joedian Reid, Tobias Klauser.
1 comment:
File src/syscall/exec_linux_test.go:
Patch Set #6, Line 505: subCgroup, err := os.MkdirTemp(prefix+string(bytes.TrimSpace(cg)), "subcg-")
I can't; the directory created should have a specific parent, and there is no way to have specify it […]
Marking this as resolved since there's no way to use `t.TempDir`.
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Kirill Kolyshkin, Tobias Klauser.
Patch set 9:Run-TryBot +1Auto-Submit +1Code-Review +2
1 comment:
Patchset:
RELNOTE=yes
Thanks.
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Kirill Kolyshkin, Tobias Klauser.
1 comment:
Patchset:
One more thing -- I was not quite puzzled as to why rawVforkSyscall is almost identical to runtime/internal/syscall.Syscall (modulo extra arguments and r2), except for 386 and amd64 where it saved/restored the return address (which is on top of the stack) around the syscall in a register.
Turns out, all other platforms (that Go supports) have a dedicated register for the return address, so only 386 and amd64 use the stack for that.
That probably means we don't really need rawVforkSyscall for platforms other than 386 and amd64, and can reuse syscall.rawSyscall6 (aka runtime/internal/syscall.Syscall) for that purpose. The differences are:
- NOFRAME is missing from runtime/internal/syscall.Syscall for some platforms; I _guess_ this is not really needed since the compiler figures out this is a leaf function with the frame size of 0.
- added a3...a6 arguments and r2 return value, making the code slightly bigger (and using slightly more stack space). Most probably this is not worth optimizing out.
This is to be addressed as a follow up.
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Tobias Klauser.
1 comment:
Patchset:
I was not quite puzzled ...
Ughm, meant to say I was quite puzzled.
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Brad Fitzpatrick, Ian Lance Taylor, Joedian Reid, Kirill Kolyshkin, Tobias Klauser.
Patch set 9:Code-Review +1
Gopher Robot submitted this change.
Reviewed-on: https://go-review.googlesource.com/c/go/+/417695
Auto-Submit: Ian Lance Taylor <ia...@google.com>
Reviewed-by: Michael Knyszek <mkny...@google.com>
Reviewed-by: Ian Lance Taylor <ia...@google.com>
Run-TryBot: Ian Lance Taylor <ia...@google.com>
Run-TryBot: Kirill Kolyshkin <koly...@gmail.com>
TryBot-Result: Gopher Robot <go...@golang.org>
24 files changed, 282 insertions(+), 99 deletions(-)
diff --git a/api/next/51246.txt b/api/next/51246.txt
index ae583cf..b00f540 100644
--- a/api/next/51246.txt
+++ b/api/next/51246.txt
@@ -8,6 +8,8 @@
pkg syscall (linux-386), const CLONE_NEWTIME ideal-int #51246
pkg syscall (linux-386), const CLONE_PIDFD = 4096 #51246
pkg syscall (linux-386), const CLONE_PIDFD ideal-int #51246
+pkg syscall (linux-386), type SysProcAttr struct, CgroupFD int #51246
+pkg syscall (linux-386), type SysProcAttr struct, UseCgroupFD bool #51246
pkg syscall (linux-386-cgo), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
pkg syscall (linux-386-cgo), const CLONE_CLEAR_SIGHAND ideal-int #51246
pkg syscall (linux-386-cgo), const CLONE_INTO_CGROUP = 8589934592 #51246
@@ -18,6 +20,8 @@
pkg syscall (linux-386-cgo), const CLONE_NEWTIME ideal-int #51246
pkg syscall (linux-386-cgo), const CLONE_PIDFD = 4096 #51246
pkg syscall (linux-386-cgo), const CLONE_PIDFD ideal-int #51246
+pkg syscall (linux-386-cgo), type SysProcAttr struct, CgroupFD int #51246
+pkg syscall (linux-386-cgo), type SysProcAttr struct, UseCgroupFD bool #51246
pkg syscall (linux-amd64), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
pkg syscall (linux-amd64), const CLONE_CLEAR_SIGHAND ideal-int #51246
pkg syscall (linux-amd64), const CLONE_INTO_CGROUP = 8589934592 #51246
@@ -28,6 +32,8 @@
pkg syscall (linux-amd64), const CLONE_NEWTIME ideal-int #51246
pkg syscall (linux-amd64), const CLONE_PIDFD = 4096 #51246
pkg syscall (linux-amd64), const CLONE_PIDFD ideal-int #51246
+pkg syscall (linux-amd64), type SysProcAttr struct, CgroupFD int #51246
+pkg syscall (linux-amd64), type SysProcAttr struct, UseCgroupFD bool #51246
pkg syscall (linux-amd64-cgo), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
pkg syscall (linux-amd64-cgo), const CLONE_CLEAR_SIGHAND ideal-int #51246
pkg syscall (linux-amd64-cgo), const CLONE_INTO_CGROUP = 8589934592 #51246
@@ -38,6 +44,8 @@
pkg syscall (linux-amd64-cgo), const CLONE_NEWTIME ideal-int #51246
pkg syscall (linux-amd64-cgo), const CLONE_PIDFD = 4096 #51246
pkg syscall (linux-amd64-cgo), const CLONE_PIDFD ideal-int #51246
+pkg syscall (linux-amd64-cgo), type SysProcAttr struct, CgroupFD int #51246
+pkg syscall (linux-amd64-cgo), type SysProcAttr struct, UseCgroupFD bool #51246
pkg syscall (linux-arm), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
pkg syscall (linux-arm), const CLONE_CLEAR_SIGHAND ideal-int #51246
pkg syscall (linux-arm), const CLONE_INTO_CGROUP = 8589934592 #51246
@@ -48,6 +56,8 @@
pkg syscall (linux-arm), const CLONE_NEWTIME ideal-int #51246
pkg syscall (linux-arm), const CLONE_PIDFD = 4096 #51246
pkg syscall (linux-arm), const CLONE_PIDFD ideal-int #51246
+pkg syscall (linux-arm), type SysProcAttr struct, CgroupFD int #51246
+pkg syscall (linux-arm), type SysProcAttr struct, UseCgroupFD bool #51246
pkg syscall (linux-arm-cgo), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
pkg syscall (linux-arm-cgo), const CLONE_CLEAR_SIGHAND ideal-int #51246
pkg syscall (linux-arm-cgo), const CLONE_INTO_CGROUP = 8589934592 #51246
@@ -58,3 +68,5 @@
pkg syscall (linux-arm-cgo), const CLONE_NEWTIME ideal-int #51246
pkg syscall (linux-arm-cgo), const CLONE_PIDFD = 4096 #51246
pkg syscall (linux-arm-cgo), const CLONE_PIDFD ideal-int #51246
+pkg syscall (linux-arm-cgo), type SysProcAttr struct, CgroupFD int #51246
+pkg syscall (linux-arm-cgo), type SysProcAttr struct, UseCgroupFD bool #51246
diff --git a/src/syscall/asm_linux_s390x.s b/src/syscall/asm_linux_s390x.s
index c3631c1..41c34b1 100644
--- a/src/syscall/asm_linux_s390x.s
+++ b/src/syscall/asm_linux_s390x.s
@@ -8,10 +8,10 @@
// System calls for s390x, Linux
//
-// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
-TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
- MOVD $0, R2
- MOVD a1+8(FP), R3
+// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
+TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
+ MOVD a1+8(FP), R2
+ MOVD a2+16(FP), R3
MOVD $0, R4
MOVD $0, R5
MOVD $0, R6
@@ -20,13 +20,13 @@
SYSCALL
MOVD $0xfffffffffffff001, R8
CMPUBLT R2, R8, ok2
- MOVD $-1, r1+16(FP)
+ MOVD $-1, r1+24(FP)
NEG R2, R2
- MOVD R2, err+24(FP) // errno
+ MOVD R2, err+32(FP) // errno
RET
ok2:
- MOVD R2, r1+16(FP)
- MOVD $0, err+24(FP) // errno
+ MOVD R2, r1+24(FP)
+ MOVD $0, err+32(FP) // errno
RET
// func rawSyscallNoError(trap, a1, a2, a3 uintptr) (r1, r2 uintptr)
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go
index d9e9e6d..72b56f4 100644
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@@ -99,6 +99,8 @@
// users this should be set to false for mappings work.
GidMappingsEnableSetgroups bool
AmbientCaps []uintptr // Ambient capabilities (Linux only)
+ UseCgroupFD bool // Whether to make use of the CgroupFD field.
+ CgroupFD int // File descriptor of a cgroup to put the new process into.
}
var (
@@ -176,6 +178,21 @@
// See CAP_TO_MASK in linux/capability.h:
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
+// cloneArgs holds arguments for clone3 Linux syscall.
+type cloneArgs struct {
+ flags uint64 // Flags bit mask
+ pidFD uint64 // Where to store PID file descriptor (int *)
+ childTID uint64 // Where to store child TID, in child's memory (pid_t *)
+ parentTID uint64 // Where to store child TID, in parent's memory (pid_t *)
+ exitSignal uint64 // Signal to deliver to parent on child termination
+ stack uint64 // Pointer to lowest byte of stack
+ stackSize uint64 // Size of stack
+ tls uint64 // Location of new TLS
+ setTID uint64 // Pointer to a pid_t array (since Linux 5.5)
+ setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
+ cgroup uint64 // File descriptor for target cgroup of child (since Linux 5.7)
+}
+
// forkAndExecInChild1 implements the body of forkAndExecInChild up to
// the parent's post-fork path. This is a separate function so we can
// separate the child's and parent's stack frames if we're using
@@ -205,9 +222,10 @@
nextfd int
i int
caps caps
- fd1 uintptr
+ fd1, flags uintptr
puid, psetgroups, pgid []byte
uidmap, setgroups, gidmap []byte
+ clone3 *cloneArgs
)
if sys.UidMappings != nil {
@@ -252,17 +270,33 @@
}
}
+ flags = sys.Cloneflags
+ if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
+ flags |= CLONE_VFORK | CLONE_VM
+ }
+ // Whether to use clone3.
+ if sys.UseCgroupFD {
+ clone3 = &cloneArgs{
+ flags: uint64(flags) | CLONE_INTO_CGROUP,
+ exitSignal: uint64(SIGCHLD),
+ cgroup: uint64(sys.CgroupFD),
+ }
+ }
+
// About to call fork.
// No more allocation or calls of non-assembly functions.
runtime_BeforeFork()
locked = true
- switch {
- case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
- r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
- case runtime.GOARCH == "s390x":
- r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
- default:
- r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
+ if clone3 != nil {
+ r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3))
+ } else {
+ flags |= uintptr(SIGCHLD)
+ if runtime.GOARCH == "s390x" {
+ // On Linux/s390, the first two arguments of clone(2) are swapped.
+ r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags)
+ } else {
+ r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0)
+ }
}
if err1 != 0 || r1 != 0 {
// If we're in the parent, we must return immediately
diff --git a/src/syscall/exec_linux_test.go b/src/syscall/exec_linux_test.go
index 8a9258d..a035d41 100644
--- a/src/syscall/exec_linux_test.go
+++ b/src/syscall/exec_linux_test.go
@@ -7,6 +7,7 @@
package syscall_test
import (
+ "bytes"
"flag"
"fmt"
"internal/testenv"
@@ -14,6 +15,7 @@
"os"
"os/exec"
"os/user"
+ "path"
"path/filepath"
"runtime"
"strconv"
@@ -461,6 +463,96 @@
+ _, err = syscall.ForkExec("non-existent binary", nil, &syscall.ProcAttr{
diff --git a/src/syscall/syscall_linux.go b/src/syscall/syscall_linux.go
index c3038fc..bdee570 100644
--- a/src/syscall/syscall_linux.go
+++ b/src/syscall/syscall_linux.go
@@ -94,6 +94,7 @@
}
func rawSyscallNoError(trap, a1, a2, a3 uintptr) (r1, r2 uintptr)
+func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
/*
* Wrapped
diff --git a/src/syscall/syscall_linux_386.go b/src/syscall/syscall_linux_386.go
index 7602736..0c9c6aa 100644
--- a/src/syscall/syscall_linux_386.go
+++ b/src/syscall/syscall_linux_386.go
@@ -8,6 +8,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS32
+ _SYS_clone3 = 435
_SYS_faccessat2 = 439
)
@@ -348,5 +349,3 @@
func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint32(length)
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_amd64.go b/src/syscall/syscall_linux_amd64.go
index 02e4116..77e1393 100644
--- a/src/syscall/syscall_linux_amd64.go
+++ b/src/syscall/syscall_linux_amd64.go
@@ -6,6 +6,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
_SYS_faccessat2 = 439
)
@@ -120,5 +121,3 @@
func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint64(length)
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_arm.go b/src/syscall/syscall_linux_arm.go
index 1b5d639..f4740af 100644
--- a/src/syscall/syscall_linux_arm.go
+++ b/src/syscall/syscall_linux_arm.go
@@ -8,6 +8,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS32
+ _SYS_clone3 = 435
_SYS_faccessat2 = 439
)
@@ -200,5 +201,3 @@
func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint32(length)
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_arm64.go b/src/syscall/syscall_linux_arm64.go
index 3ce6849..f426862 100644
--- a/src/syscall/syscall_linux_arm64.go
+++ b/src/syscall/syscall_linux_arm64.go
@@ -8,6 +8,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
_SYS_faccessat2 = 439
)
@@ -182,5 +183,3 @@
_, err := ppoll(nil, 0, nil, nil)
return err
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_loong64.go b/src/syscall/syscall_linux_loong64.go
index 2cd3494..5a0fa08 100644
--- a/src/syscall/syscall_linux_loong64.go
+++ b/src/syscall/syscall_linux_loong64.go
@@ -8,6 +8,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
_SYS_faccessat2 = 439
)
@@ -217,5 +218,3 @@
_, err := ppoll(nil, 0, nil, nil)
return err
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_mips64x.go b/src/syscall/syscall_linux_mips64x.go
index 2d3784e..8a0aa5c 100644
--- a/src/syscall/syscall_linux_mips64x.go
+++ b/src/syscall/syscall_linux_mips64x.go
@@ -8,6 +8,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 5435
_SYS_faccessat2 = 5439
)
@@ -182,5 +183,3 @@
func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint64(length)
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_mipsx.go b/src/syscall/syscall_linux_mipsx.go
index 59825e4..c8468fb 100644
--- a/src/syscall/syscall_linux_mipsx.go
+++ b/src/syscall/syscall_linux_mipsx.go
@@ -10,6 +10,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 4435
_SYS_faccessat2 = 4439
)
@@ -193,5 +194,3 @@
func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint32(length)
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_ppc64x.go b/src/syscall/syscall_linux_ppc64x.go
index ba8f1e7..5c076d8 100644
--- a/src/syscall/syscall_linux_ppc64x.go
+++ b/src/syscall/syscall_linux_ppc64x.go
@@ -8,6 +8,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
_SYS_faccessat2 = 439
)
@@ -91,8 +92,6 @@
cmsg.Len = uint64(length)
}
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
-
//sys syncFileRange2(fd int, flags int, off int64, n int64) (err error) = SYS_SYNC_FILE_RANGE2
func SyncFileRange(fd int, off int64, n int64, flags int) error {
diff --git a/src/syscall/syscall_linux_riscv64.go b/src/syscall/syscall_linux_riscv64.go
index 82c4094..3bb5460 100644
--- a/src/syscall/syscall_linux_riscv64.go
+++ b/src/syscall/syscall_linux_riscv64.go
@@ -8,6 +8,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
_SYS_faccessat2 = 439
)
@@ -168,5 +169,3 @@
_, err := ppoll(nil, 0, nil, nil)
return err
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
diff --git a/src/syscall/syscall_linux_s390x.go b/src/syscall/syscall_linux_s390x.go
index fb97180..cb83697 100644
--- a/src/syscall/syscall_linux_s390x.go
+++ b/src/syscall/syscall_linux_s390x.go
@@ -8,6 +8,7 @@
const (
_SYS_setgroups = SYS_SETGROUPS
+ _SYS_clone3 = 435
_SYS_faccessat2 = 439
)
@@ -257,5 +258,3 @@
func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint64(length)
}
-
-func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
To view, visit change 417695. To unsubscribe, or for help writing mail filters, visit settings.