unix: add CPUSetDynamic for systems with more than 1024 CPUs
The existing CPUSet type is a fixed-size array limited to 1024 CPUs,
which makes it problematic to use for large systems (such as Google's
X4 instances with 1440 and 1920 vCPUs), see e.g.
https://github.com/opencontainers/runc/issues/5023.
Introduce CPUSetDynamic type and NewCPUSet constructor to support large
systems. The bit-managing routines (set/clear/isset/fill/count) are
separated and reused.
Add variants of SchedGetaffinity, SchedSetaffinity and SetMemPolicy
that accept the new type.
Amend the documentation for CPUSet.
Add tests for new functionality (mostly a copy of existing tests).
This is an alternative to CL 727540 / CL 727541.
diff --git a/unix/affinity_linux.go b/unix/affinity_linux.go
index 3ea4703..af5e6c8 100644
--- a/unix/affinity_linux.go
+++ b/unix/affinity_linux.go
@@ -13,11 +13,19 @@
const cpuSetSize = _CPU_SETSIZE / _NCPUBITS
-// CPUSet represents a CPU affinity mask.
+// CPUSet represents a bit mask of CPUs, to be used with [SchedGetaffinity], [SchedSetaffinity],
+// and [SetMemPolicy].
+//
+// Note that this is a fixed-size type that can represent CPU IDs 0 through 1023 only.
+// Use [CPUSetDynamic]/[NewCPUSet] instead to avoid this limit.
type CPUSet [cpuSetSize]cpuMask
-func schedAffinity(trap uintptr, pid int, set *CPUSet) error {
- _, _, e := RawSyscall(trap, uintptr(pid), uintptr(unsafe.Sizeof(*set)), uintptr(unsafe.Pointer(set)))
+// CPUSetDynamic represents a bit mask of CPUs, to be used with [SchedGetaffinityDynamic],
+// [SchedSetaffinityDynamic], and [SetMemPolicyDynamic]. Use [NewCPUSet] to allocate.
+type CPUSetDynamic []cpuMask
+
+func schedAffinity(trap uintptr, pid int, size uintptr, ptr unsafe.Pointer) error {
+ _, _, e := RawSyscall(trap, uintptr(pid), uintptr(size), uintptr(ptr))
if e != 0 {
return errnoErr(e)
}
@@ -27,13 +35,13 @@
// SchedGetaffinity gets the CPU affinity mask of the thread specified by pid.
// If pid is 0 the calling thread is used.
func SchedGetaffinity(pid int, set *CPUSet) error {
- return schedAffinity(SYS_SCHED_GETAFFINITY, pid, set)
+ return schedAffinity(SYS_SCHED_GETAFFINITY, pid, unsafe.Sizeof(*set), unsafe.Pointer(set))
}
// SchedSetaffinity sets the CPU affinity mask of the thread specified by pid.
// If pid is 0 the calling thread is used.
func SchedSetaffinity(pid int, set *CPUSet) error {
- return schedAffinity(SYS_SCHED_SETAFFINITY, pid, set)
+ return schedAffinity(SYS_SCHED_SETAFFINITY, pid, unsafe.Sizeof(*set), unsafe.Pointer(set))
}
// Zero clears the set s, so that it contains no CPUs.
@@ -45,9 +53,7 @@
// will silently ignore any invalid CPU bits in [CPUSet] so this is an
// efficient way of resetting the CPU affinity of a process.
func (s *CPUSet) Fill() {
- for i := range s {
- s[i] = ^cpuMask(0)
- }
+ cpuMaskFill(s[:])
}
func cpuBitsIndex(cpu int) int {
@@ -58,24 +64,27 @@
return cpuMask(1 << (uint(cpu) % _NCPUBITS))
}
-// Set adds cpu to the set s.
-func (s *CPUSet) Set(cpu int) {
+func cpuMaskFill(s []cpuMask) {
+ for i := range s {
+ s[i] = ^cpuMask(0)
+ }
+}
+
+func cpuMaskSet(s []cpuMask, cpu int) {
i := cpuBitsIndex(cpu)
if i < len(s) {
s[i] |= cpuBitsMask(cpu)
}
}
-// Clear removes cpu from the set s.
-func (s *CPUSet) Clear(cpu int) {
+func cpuMaskClear(s []cpuMask, cpu int) {
i := cpuBitsIndex(cpu)
if i < len(s) {
s[i] &^= cpuBitsMask(cpu)
}
}
-// IsSet reports whether cpu is in the set s.
-func (s *CPUSet) IsSet(cpu int) bool {
+func cpuMaskIsSet(s []cpuMask, cpu int) bool {
i := cpuBitsIndex(cpu)
if i < len(s) {
return s[i]&cpuBitsMask(cpu) != 0
@@ -83,11 +92,95 @@
return false
}
-// Count returns the number of CPUs in the set s.
-func (s *CPUSet) Count() int {
+func cpuMaskCount(s []cpuMask) int {
c := 0
for _, b := range s {
c += bits.OnesCount64(uint64(b))
}
return c
}
+
+// Set adds cpu to the set s. If cpu is out of bounds for s, no action is taken.
+func (s *CPUSet) Set(cpu int) {
+ cpuMaskSet(s[:], cpu)
+}
+
+// Clear removes cpu from the set s. If cpu is out of bounds for s, no action is taken.
+func (s *CPUSet) Clear(cpu int) {
+ cpuMaskClear(s[:], cpu)
+}
+
+// IsSet reports whether cpu is in the set s.
+func (s *CPUSet) IsSet(cpu int) bool {
+ return cpuMaskIsSet(s[:], cpu)
+}
+
+// Count returns the number of CPUs in the set s.
+func (s *CPUSet) Count() int {
+ return cpuMaskCount(s[:])
+}
+
+// NewCPUSet creates a dynamically-sized CPU affinity mask capable of
+// representing CPU IDs up to maxCPU (exclusive).
+func NewCPUSet(maxCPU int) CPUSetDynamic {
+ numMasks := (maxCPU + _NCPUBITS - 1) / _NCPUBITS
+ if numMasks == 0 {
+ numMasks = 1
+ }
+ return make(CPUSetDynamic, numMasks)
+}
+
+// Zero clears the set s, so that it contains no CPUs.
+func (s CPUSetDynamic) Zero() {
+ clear(s)
+}
+
+// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinity]
+// will silently ignore any invalid CPU bits in [CPUSet] so this is an
+// efficient way of resetting the CPU affinity of a process.
+func (s CPUSetDynamic) Fill() {
+ cpuMaskFill(s)
+}
+
+// Set adds cpu to the set s. If cpu is out of bounds for s, no action is taken.
+func (s CPUSetDynamic) Set(cpu int) {
+ cpuMaskSet(s, cpu)
+}
+
+// Clear removes cpu from the set s. If cpu is out of bounds for s, no action is taken.
+func (s CPUSetDynamic) Clear(cpu int) {
+ cpuMaskClear(s, cpu)
+}
+
+// IsSet reports whether cpu is in the set s.
+func (s CPUSetDynamic) IsSet(cpu int) bool {
+ return cpuMaskIsSet(s, cpu)
+}
+
+// Count returns the number of CPUs in the set s.
+func (s CPUSetDynamic) Count() int {
+ return cpuMaskCount(s)
+}
+
+func (s CPUSetDynamic) size() uintptr {
+ return uintptr(len(s)) * unsafe.Sizeof(cpuMask(0))
+}
+
+func (s CPUSetDynamic) pointer() unsafe.Pointer {
+ if len(s) == 0 {
+ return nil
+ }
+ return unsafe.Pointer(&s[0])
+}
+
+// SchedGetaffinityDynamic gets the CPU affinity mask of the thread specified by pid.
+// If pid is 0 the calling thread is used.
+func SchedGetaffinityDynamic(pid int, set CPUSetDynamic) error {
+ return schedAffinity(SYS_SCHED_GETAFFINITY, pid, set.size(), set.pointer())
+}
+
+// SchedSetaffinityDynamic sets the CPU affinity mask of the thread specified by pid.
+// If pid is 0 the calling thread is used.
+func SchedSetaffinityDynamic(pid int, set CPUSetDynamic) error {
+ return schedAffinity(SYS_SCHED_SETAFFINITY, pid, set.size(), set.pointer())
+}
diff --git a/unix/syscall_linux.go b/unix/syscall_linux.go
index 06c0eea..f7b82bc 100644
--- a/unix/syscall_linux.go
+++ b/unix/syscall_linux.go
@@ -2644,8 +2644,12 @@
//sys Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error)
//sys Mseal(b []byte, flags uint) (err error)
-//sys setMemPolicy(mode int, mask *CPUSet, size int) (err error) = SYS_SET_MEMPOLICY
+//sys setMemPolicy(mode int, mask unsafe.Pointer, size uintptr) (err error) = SYS_SET_MEMPOLICY
func SetMemPolicy(mode int, mask *CPUSet) error {
- return setMemPolicy(mode, mask, _CPU_SETSIZE)
+ return setMemPolicy(mode, unsafe.Pointer(mask), _CPU_SETSIZE)
+}
+
+func SetMemPolicyDynamic(mode int, mask CPUSetDynamic) error {
+ return setMemPolicy(mode, mask.pointer(), mask.size())
}
diff --git a/unix/syscall_linux_test.go b/unix/syscall_linux_test.go
index d3075ca..0baa1e8 100644
--- a/unix/syscall_linux_test.go
+++ b/unix/syscall_linux_test.go
@@ -19,6 +19,7 @@
"path/filepath"
"runtime"
"runtime/debug"
+ "slices"
"strconv"
"strings"
"syscall"
@@ -512,7 +513,12 @@
}
func TestSchedSetaffinity(t *testing.T) {
+ const maxcpus = 1024 // _CPU_SETSIZE
var newMask unix.CPUSet
+ newMask.Fill()
+ if count := newMask.Count(); count != maxcpus {
+ t.Errorf("Fill: got %d CPUs, want %d", count, maxcpus)
+ }
newMask.Zero()
if newMask.Count() != 0 {
t.Errorf("CpuZero: didn't zero CPU set: %v", newMask)
@@ -588,6 +594,89 @@
}
}
+func TestSchedSetaffinityDynamic(t *testing.T) {
+ const maxcpus = 4096
+
+ newMask := unix.NewCPUSet(maxcpus)
+ newMask.Fill()
+ if count := newMask.Count(); count != maxcpus {
+ t.Errorf("Fill: got %d CPUs, want %d", count, maxcpus)
+ }
+ newMask.Zero()
+ if newMask.Count() != 0 {
+ t.Errorf("Zero: didn't zero CPU set: %v", newMask)
+ }
+ cpu := 1
+ newMask.Set(cpu)
+ if newMask.Count() != 1 || !newMask.IsSet(cpu) {
+ t.Errorf("Set: didn't set CPU %d in set: %v", cpu, newMask)
+ }
+ cpu = 5
+ newMask.Set(cpu)
+ if newMask.Count() != 2 || !newMask.IsSet(cpu) {
+ t.Errorf("Set: didn't set CPU %d in set: %v", cpu, newMask)
+ }
+ newMask.Clear(cpu)
+ if newMask.Count() != 1 || newMask.IsSet(cpu) {
+ t.Errorf("Clear: didn't clear CPU %d in set: %v", cpu, newMask)
+ }
+
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
+
+ oldMask := unix.NewCPUSet(maxcpus)
+ err := unix.SchedGetaffinityDynamic(0, oldMask)
+ if err != nil {
+ t.Fatalf("SchedGetaffinityDynamic: %v", err)
+ }
+
+ if runtime.NumCPU() < 2 {
+ t.Skip("skipping setaffinity tests on single CPU system")
+ }
+ if runtime.GOOS == "android" {
+ t.Skip("skipping setaffinity tests on android")
+ }
+
+ // On a system like ppc64x where some cores can be disabled using ppc64_cpu,
+ // setaffinity should only be called with enabled cores. The valid cores
+ // are found from the oldMask, but if none are found then the setaffinity
+ // tests are skipped. Issue #27875.
+ cpu = 1
+ if !oldMask.IsSet(cpu) {
+ newMask.Zero()
+ for i := range len(oldMask) {
+ if oldMask.IsSet(i) {
+ newMask.Set(i)
+ break
+ }
+ }
+ if newMask.Count() == 0 {
+ t.Skip("skipping setaffinity tests if CPU not available")
+ }
+ }
+
+ err = unix.SchedSetaffinityDynamic(0, newMask)
+ if err != nil {
+ t.Fatalf("SchedSetaffinityDynamic: %v", err)
+ }
+
+ gotMask := unix.NewCPUSet(maxcpus)
+ err = unix.SchedGetaffinityDynamic(0, gotMask)
+ if err != nil {
+ t.Fatalf("SchedGetaffinityDynamic: %v", err)
+ }
+
+ if !slices.Equal(gotMask, newMask) {
+ t.Errorf("SchedSetaffinityDynamic: returned affinity mask does not match set affinity mask (%+v != %+v", gotMask, newMask)
+ }
+
+ // Restore old mask so it doesn't affect successive tests.
+ err = unix.SchedSetaffinityDynamic(0, oldMask)
+ if err != nil {
+ t.Fatalf("SchedSetaffinityDynamic: %v", err)
+ }
+}
+
func TestStatx(t *testing.T) {
var stx unix.Statx_t
err := unix.Statx(unix.AT_FDCWD, ".", 0, 0, &stx)
diff --git a/unix/zsyscall_linux.go b/unix/zsyscall_linux.go
index 8935d10..886f5de 100644
--- a/unix/zsyscall_linux.go
+++ b/unix/zsyscall_linux.go
@@ -2241,8 +2241,8 @@
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func setMemPolicy(mode int, mask *CPUSet, size int) (err error) {
- _, _, e1 := Syscall(SYS_SET_MEMPOLICY, uintptr(mode), uintptr(unsafe.Pointer(mask)), uintptr(size))
+func setMemPolicy(mode int, mask unsafe.Pointer, size uintptr) (err error) {
+ _, _, e1 := Syscall(SYS_SET_MEMPOLICY, uintptr(mode), uintptr(mask), uintptr(size))
if e1 != 0 {
err = errnoErr(e1)
}
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Note this is a replacement for CL 727540 / CL 727541.
I don't really like this approach, as it looks even worse than what we have in glibc, but apparently there's no better way as we used a fixed type from the beginning and have to maintain API compatibility.
Having said that, if you see a better way, please speak up.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Code-Review | +1 |
Note this is a replacement for CL 727540 / CL 727541.
I don't really like this approach, as it looks even worse than what we have in glibc, but apparently there's no better way as we used a fixed type from the beginning and have to maintain API compatibility.
Having said that, if you see a better way, please speak up.
I think this is not so bad. NewCPUSet is a good name. If you use that plus the methods you never actually see the CPUSetDynamic name.
Probably the most awkward part is SchedSetaffinityDynamic, since you actually have to see the "dynamic" part.
I don't really have better suggestions for "dynamic" either.
// If pid is 0 the calling thread is used.
On Linux at least, sched_getaffinity returns EINVAL if the cpu set is too small. I wonder if we should document that here and on SchedGetaffinity, since it is relevant to custom CPUSets?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
Michael Pratt
Note this is a replacement for CL 727540 / CL 727541.
I don't really like this approach, as it looks even worse than what we have in glibc, but apparently there's no better way as we used a fixed type from the beginning and have to maintain API compatibility.
Having said that, if you see a better way, please speak up.
I think this is not so bad. NewCPUSet is a good name. If you use that plus the methods you never actually see the CPUSetDynamic name.
Probably the most awkward part is SchedSetaffinityDynamic, since you actually have to see the "dynamic" part.
I don't really have better suggestions for "dynamic" either.
I guess the only way to avoid the "Dynamic" suffix is to make SchedGetaffinity, SchedSetaffinity and SetMemPolicy methods of CPUSetDynamic.
OTOH it'd be a bit messy since bit-manipulation routines and syscalls are all mixed together.
// If pid is 0 the calling thread is used.
On Linux at least, sched_getaffinity returns EINVAL if the cpu set is too small. I wonder if we should document that here and on SchedGetaffinity, since it is relevant to custom CPUSets?
In general, x/sys/unix does not duplicate man pages, but in this case it actually makes sense (and can actually be used to check if the mask size is adequate).
Added.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Anything I can do to move this forward?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Code-Review | +2 |
// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinity]
// will silently ignore any invalid CPU bits in [CPUSet] so this is an
Should this mention the *Dynamic variants instead?
```suggestion
// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinityDynamic]
// will silently ignore any invalid CPU bits in [CPUSetDynamic] so this is an
```
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinity]
// will silently ignore any invalid CPU bits in [CPUSet] so this is an
Should this mention the *Dynamic variants instead?
```suggestion
// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinityDynamic]
// will silently ignore any invalid CPU bits in [CPUSetDynamic] so this is an
```
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
3 is the latest approved patch-set.
The change was submitted with unreviewed changes in the following files:
```
The name of the file: unix/affinity_linux.go
Insertions: 2, Deletions: 2.
@@ -135,8 +135,8 @@
clear(s)
}
-// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinity]
-// will silently ignore any invalid CPU bits in [CPUSet] so this is an
+// Fill adds all possible CPU bits to the set s. On Linux, [SchedSetaffinityDynamic]
+// will silently ignore any invalid CPU bits in [CPUSetDynamic] so this is an
// efficient way of resetting the CPU affinity of a process.
func (s CPUSetDynamic) Fill() {
cpuMaskFill(s)
```
unix: add CPUSetDynamic for systems with more than 1024 CPUs
The existing CPUSet type is a fixed-size array limited to 1024 CPUs,
which makes it problematic to use for large systems (such as Google's
X4 instances with 1440 and 1920 vCPUs), see e.g.
https://github.com/opencontainers/runc/issues/5023.
Introduce CPUSetDynamic type and NewCPUSet constructor to support large
systems. The bit-managing routines (set/clear/isset/fill/count) are
separated and reused.
Add variants of SchedGetaffinity, SchedSetaffinity and SetMemPolicy
that accept the new type.
Amend the documentation for CPUSet.
Amend the existing TestSchedSetaffinity to:
- test set.Fill;
- use t.Cleanup to restore the affinity.
Add tests for new functionality (mostly a copy of existing tests).
This is an alternative to CL 727540 / CL 727541.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |