From: Waldemar Kozaczuk <
jwkoz...@gmail.com>
Committer: Waldemar Kozaczuk <
jwkoz...@gmail.com>
Branch: master
aarch64: implement clone/clone3 to run multi-threaded static apps
Just like the patch b3792dfa62149a0f8c5dd75d445dcf2266235de1, this one
implements the clone/clone3 system calls, but on aarch64. For more details
please read the code comments.
In addition, this patch refactors the clone code by extracting the common
logic into the sys_clone() function in linux.cc and leaving the arch-specific
code in clone_thread() found under arch/$(arch)/clone.cc
With this patch, one can run multi-threaded static executables and
dynamic ones with the Linux dynamic linker on OSv on aarch64.
./scripts/test.py --linux_ld -m modules/tests-with-linux-ld/usr.manifest \
-d java_no_wrapper \
-d tst-chmod \
-d tst-kill \
-d tst-remove \
-d tst-sigaction \
-d tst-sigwait \
-d tst-stdio-rofs \
-d tst-wctype
Please note that java_wrapper crashes because of the missing implementation
of AT_SYMLINK_NOFOLLOW in faccessat(), and tst-chmod and tst-remove fail
because of the missing fchmodat syscall.
Signed-off-by: Waldemar Kozaczuk <
jwkoz...@gmail.com>
---
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -977,6 +977,7 @@ objects += arch/$(arch)/cpuid.o
objects += arch/$(arch)/firmware.o
objects += arch/$(arch)/hypervisor.o
objects += arch/$(arch)/interrupt.o
+objects += arch/$(arch)/clone.o
ifeq ($(conf_drivers_pci),1)
objects += arch/$(arch)/pci.o
objects += arch/$(arch)/msi.o
@@ -1013,7 +1014,6 @@ objects += arch/x64/apic.o
objects += arch/x64/apic-clock.o
objects += arch/x64/entry-xen.o
objects += arch/x64/prctl.o
-objects += arch/x64/clone.o
objects += arch/x64/vmlinux.o
objects += arch/x64/vmlinux-boot64.o
objects += arch/x64/pvh-boot.o
diff --git a/arch/aarch64/clone.cc b/arch/aarch64/clone.cc
--- a/arch/aarch64/clone.cc
+++ b/arch/aarch64/clone.cc
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2023 Waldemar Kozaczuk
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include "arch.hh"
+#include <errno.h>
+#include <osv/sched.hh>
+
+#define CLONE_SETTLS 0x00080000
+
+static constexpr size_t CHILD_FRAME_OFFSET = 7*4096 + sizeof(exception_frame);
+static constexpr size_t PARENT_FRAME_OFFSET = sizeof(exception_frame);
+
+sched::thread *clone_thread(unsigned long flags, void *child_stack, unsigned long newtls)
+{ //Arch-specific part of clone: builds the child thread and returns it without starting it
+ //If the parent thread is pinned, the new thread inherits the same CPU
+ auto parent_pinned_cpu = sched::thread::current()->pinned() ? sched::cpu::current() : nullptr;
+ //
+ //Create new child thread; the lambda below is the child's entry point
+ auto t = sched::thread::make([=] {
+ //
+ //Switch to app TCB if one was specified (the frame's far field carries newtls, 0 means none)
+ auto frame_start_on_exception_stack = sched::thread::current()->get_exception_stack_top() - CHILD_FRAME_OFFSET;
+ exception_frame *child_frame = reinterpret_cast<exception_frame*>(frame_start_on_exception_stack);
+ if (child_frame->far) {
+ asm volatile ("msr tpidr_el0, %0; isb; " :: "r"(child_frame->far) : "memory");
+ }
+ //
+ //Restore registers from the exception stack and jump to the caller
+ //We are restoring the registers based on how they were saved
+ //on the exception stack of the parent
+ asm volatile
+ ("msr daifset, #2 \n\t" // Disable interrupts
+ "isb \n\t"
+ "mov sp, %0 \n\t" // Set child stack (current stack pointer, SP_EL0)
+ "msr spsel, #1 \n\t" // Switch to exception stack (SPSel=1 selects SP_EL1)
+ "mov sp, %1 \n\t" // Set stack to the beginning of the stack frame
+ "ldr x30, [sp, #256] \n\t" // Load x30 (link register) with elr_el1 (exception link register)
+ "ldp x0, x1, [sp], #16 \n\t"
+ "ldp x2, x3, [sp], #16 \n\t"
+ "ldp x4, x5, [sp], #16 \n\t"
+ "ldp x6, x7, [sp], #16 \n\t"
+ "ldp x8, x9, [sp], #16 \n\t"
+ "ldp x10, x11, [sp], #16 \n\t"
+ "ldp x12, x13, [sp], #16 \n\t"
+ "ldp x14, x15, [sp], #16 \n\t"
+ "ldp x16, x17, [sp], #16 \n\t"
+ "ldp x18, x19, [sp], #16 \n\t"
+ "ldp x20, x21, [sp], #16 \n\t"
+ "ldp x22, x23, [sp], #16 \n\t"
+ "ldp x24, x25, [sp], #16 \n\t"
+ "ldp x26, x27, [sp], #16 \n\t"
+ "ldp x28, x29, [sp], #16 \n\t"
+ "add sp, sp, #48 \n\t"
+ "add sp, sp, #28672 \n\t" // Move back 7*4096
+ "msr spsel, #0 \n\t" // Switch back to user stack (SPSel=0 selects SP_EL0, the child stack set above)
+ "msr daifclr, #2 \n\t" // Enable interrupts
+ "isb \n\t" : : "r"(child_frame->sp), "r"(frame_start_on_exception_stack));
+ }, sched::thread::attr().
+ stack(4096 * 4). //16K kernel stack should be large enough
+ pin(parent_pinned_cpu),
+ false,
+ true);
+ //
+ //Copy all saved registers from parent exception stack to the child exception stack
+ //so that they can be restored in the child thread in the inlined assembly above
+ auto frame_start_on_child_exception_stack = t->get_exception_stack_top() - CHILD_FRAME_OFFSET;
+ exception_frame *child_frame = reinterpret_cast<exception_frame*>(frame_start_on_child_exception_stack);
+ auto frame_start_on_parent_exception_stack = sched::thread::current()->get_exception_stack_top() - PARENT_FRAME_OFFSET;
+ exception_frame *parent_frame = reinterpret_cast<exception_frame*>(frame_start_on_parent_exception_stack);
+ memcpy(child_frame, parent_frame, sizeof(*parent_frame));
+ //
+ // Save child stack pointer and make the child see 0 in x0
+ child_frame->sp = reinterpret_cast<u64>(child_stack);
+ child_frame->regs[0] = 0;
+ //
+ // Stash app TCB in the far field if CLONE_SETTLS flag set (0 otherwise); read by the child above
+ if ((flags & CLONE_SETTLS)) {
+ child_frame->far = newtls;
+ } else {
+ child_frame->far = 0;
+ }
+
+ return t;
+}
diff --git a/arch/x64/clone.cc b/arch/x64/clone.cc
--- a/arch/x64/clone.cc
+++ b/arch/x64/clone.cc
@@ -10,40 +10,16 @@
#include <osv/sched.hh>
#include "tls-switch.hh"
-#define CLONE_THREAD 0x00010000
#define CLONE_SETTLS 0x00080000
-#define CLONE_CHILD_SETTID 0x01000000
-#define CLONE_PARENT_SETTID 0x00100000
-#define CLONE_CHILD_CLEARTID 0x00200000
static constexpr size_t CHILD_FRAME_OFFSET = 136;
static constexpr size_t PARENT_FRAME_OFFSET = 120;
static constexpr size_t FRAME_SIZE = 120;
static constexpr size_t RSP_OFFSET = 8;
static constexpr size_t RAX_OFFSET = 16;
-int sys_clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, unsigned long newtls)
+sched::thread *clone_thread(unsigned long flags, void *child_stack, unsigned long newtls)
{ //
- //We only support "cloning" of threads so fork() would fail but pthread_create() should
- //succeed
- if (!(flags & CLONE_THREAD)) {
- errno = ENOSYS;
- return -1;
- }
- //
- //Validate we have non-empty stack
- if (!child_stack) {
- errno = EINVAL;
- return -1;
- }
- //
- //Validate ptid and ctid which we would be setting down if requested by these flags
- if (((flags & CLONE_PARENT_SETTID) && !ptid) ||
- ((flags & CLONE_CHILD_SETTID) && !ctid) ||
- ((flags & CLONE_SETTLS) && !newtls)) {
- errno = EFAULT;
- return -1;
- }
//
//If the parent thread is pinned we should make new thread inherit this
auto parent_pinned_cpu = sched::thread::current()->pinned() ? sched::cpu::current() : nullptr;
@@ -91,23 +67,6 @@ int sys_clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, unsi
false,
true);
- //
- //Store the child thread ID at the location pointed to by ptid
- if ((flags & CLONE_PARENT_SETTID)) {
- *ptid = t->id();
- }
- //
- //Store the child thread ID at the location pointed to by ctid
- if ((flags & CLONE_CHILD_SETTID)) {
- *ctid = t->id();
- }
- //
- //Clear (zero) the child thread ID at the location pointed to by child_tid
- //in child memory when the child exits, and do a wakeup on the futex at that address
- //See thread::complete()
- if ((flags & CLONE_CHILD_CLEARTID)) {
- t->set_clear_id(ctid);
- }
//
//Copy all saved registers from parent syscall stack to the child syscall stack
//so that they can be restored in the child thread in the inlined assembly above
@@ -123,12 +82,6 @@ int sys_clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, unsi
if ((flags & CLONE_SETTLS)) {
t->set_app_tcb(newtls);
}
- t->start();
- //
- //The manual of sigprocmask has this to say about clone:
- //"Each of the threads in a process has its own signal mask.
- // A child created via fork(2) inherits a copy of its parent's
- // signal mask; the signal mask is preserved across execve(2)."
- //TODO: Does it mean new thread should inherit signal mask of the parent?
- return t->id();
+
+ return t;
}
diff --git a/core/elf.cc b/core/elf.cc
--- a/core/elf.cc
+++ b/core/elf.cc
@@ -535,9 +535,6 @@ void object::process_headers()
abort("Unknown p_type in executable %s: %d\n", pathname(), phdr.p_type);
}
}
- if (!is_core() && is_statically_linked_executable()) {
- std::cout << "WARNING: Statically linked executables are only supported to limited extent!\n";
- }
if (_is_dynamically_linked_executable && _tls_segment) {
auto app_tls_size = get_aligned_tls_size();
ulong pie_static_tls_maximum_size = &_pie_static_tls_end - &_pie_static_tls_start;
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -711,7 +711,10 @@ public:
bool unsafe_stop();
void setup_large_syscall_stack();
void free_tiny_syscall_stack();
+#ifdef __x86_64__
void* get_syscall_stack_top();
+#endif
+ void* get_exception_stack_top() { return _arch.exception_stack + sizeof(_arch.exception_stack); }
private:
static void wake_impl(detached_state* st,
unsigned allowed_initial_states_mask = 1 << unsigned(status::waiting));
diff --git a/linux.cc b/linux.cc
--- a/linux.cc
+++ b/linux.cc
@@ -441,9 +441,71 @@ static long sys_set_tid_address(int *tidptr)
return sched::thread::current()->id();
}
-#ifdef __x86_64__
+#define CLONE_THREAD 0x00010000
+#define CLONE_CHILD_SETTID 0x01000000
+#define CLONE_PARENT_SETTID 0x00100000
+#define CLONE_CHILD_CLEARTID 0x00200000
+
+extern sched::thread *clone_thread(unsigned long flags, void *child_stack, unsigned long newtls);
+
#define __NR_sys_clone __NR_clone
-extern int sys_clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, unsigned long newtls);
+#ifdef __x86_64__
+int sys_clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, unsigned long newtls)
+#endif
+#ifdef __aarch64__
+int sys_clone(unsigned long flags, void *child_stack, int *ptid, unsigned long newtls, int *ctid)
+#endif
+{ //Common clone logic: validate arguments, create the thread via clone_thread(), start it
+ //We only support "cloning" of threads so fork() would fail but pthread_create() should
+ //succeed
+ if (!(flags & CLONE_THREAD)) {
+ errno = ENOSYS;
+ return -1;
+ }
+ //
+ //Validate that a child stack pointer was supplied
+ if (!child_stack) {
+ errno = EINVAL;
+ return -1;
+ }
+ //
+ //Validate ptid, ctid and newtls: each must be non-null/non-zero when its flag is set
+ if (((flags & CLONE_PARENT_SETTID) && !ptid) ||
+ ((flags & CLONE_CHILD_SETTID) && !ctid) ||
+ ((flags & CLONE_SETTLS) && !newtls)) {
+ errno = EFAULT;
+ return -1;
+ }
+
+ sched::thread *t = clone_thread(flags, child_stack, newtls);
+
+ //
+ //Store the child thread ID at the location pointed to by ptid
+ if ((flags & CLONE_PARENT_SETTID)) {
+ *ptid = t->id();
+ }
+ //
+ //Store the child thread ID at the location pointed to by ctid
+ if ((flags & CLONE_CHILD_SETTID)) {
+ *ctid = t->id();
+ }
+ //
+ //Clear (zero) the child thread ID at the location pointed to by child_tid
+ //in child memory when the child exits, and do a wakeup on the futex at that address
+ //See thread::complete()
+ if ((flags & CLONE_CHILD_CLEARTID)) {
+ t->set_clear_id(ctid);
+ }
+ t->start();
+
+ //
+ //The manual of sigprocmask has this to say about clone:
+ //"Each of the threads in a process has its own signal mask.
+ // A child created via fork(2) inherits a copy of its parent's
+ // signal mask; the signal mask is preserved across execve(2)."
+ //TODO: Does it mean new thread should inherit signal mask of the parent?
+ return t->id();
+}
struct clone_args {
u64 flags;
@@ -463,10 +525,15 @@ static int sys_clone3(struct clone_args *args, size_t size)
args->flags,
reinterpret_cast<void*>(args->stack) + args->stack_size,
reinterpret_cast<int*>(args->parent_tid),
+#ifdef __x86_64__
reinterpret_cast<int*>(args->child_tid),
args->tls);
-}
#endif
+#ifdef __aarch64__
+ args->tls,
+ reinterpret_cast<int*>(args->child_tid));
+#endif
+}
#define __NR_sys_ioctl __NR_ioctl
//
@@ -592,7 +659,7 @@ extern int utimensat4(int dirfd, const char *pathname, const struct timespec tim
TRACEPOINT(trace_syscall_open, "%d <= \"%s\" 0x%x", int, const char *, int);
#endif
TRACEPOINT(trace_syscall_read, "0x%x <= %d %p 0x%x", ssize_t, int, char *, size_t);
-TRACEPOINT(trace_syscall_uname, "%d <= ", int, struct utsname *);
+TRACEPOINT(trace_syscall_uname, "%d <= %p", int, struct utsname *);
TRACEPOINT(trace_syscall_write, "0x%x <= %d %p 0x%x", ssize_t, int, const void *, size_t);
TRACEPOINT(trace_syscall_gettid, "%d <=", pid_t);
TRACEPOINT(trace_syscall_clock_gettime, "%d <= %d %p", int, clockid_t, struct timespec *);
@@ -654,7 +721,7 @@ TRACEPOINT(trace_syscall_nanosleep, "%d <= %p %p", int, const struct timespec*,
TRACEPOINT(trace_syscall_fstatat, "%d <= %d \"%s\" %p 0%0o", int, int, const char *, struct stat *, int);
TRACEPOINT(trace_syscall_sys_exit_group, "%d <= %d", int, int);
TRACEPOINT(trace_syscall_sys_getcwd, "%ld <= 0%0o %lu", long, char *, unsigned long);
-TRACEPOINT(trace_syscall_readlinkat, "%lu <= %d 0%0o 0x%x %lu", ssize_t, int, const char *, char *, size_t);
+TRACEPOINT(trace_syscall_readlinkat, "%lu <= %d %s 0x%x %lu", ssize_t, int, const char *, char *, size_t);
TRACEPOINT(trace_syscall_getpid, "%d <=", pid_t);
TRACEPOINT(trace_syscall_set_mempolicy, "%ld <= %d %p %lu", long, int, unsigned long *, unsigned long);
TRACEPOINT(trace_syscall_sys_sched_setaffinity, "%d <= %d %u %p", int, pid_t, unsigned, unsigned long *);
@@ -726,8 +793,11 @@ TRACEPOINT(trace_syscall_sys_set_robust_list, "%d <= %p %lu", long, struct robus
TRACEPOINT(trace_syscall_sys_set_tid_address, "%d <= %p", long, int *);
#ifdef __x86_64__
TRACEPOINT(trace_syscall_sys_clone, "%d <= 0x%x 0x%x %p %p %lu", int, unsigned long, void *, int *, int *, unsigned long);
-TRACEPOINT(trace_syscall_sys_clone3, "%d <= %p %lu", int, struct clone_args *, size_t);
#endif
+#ifdef __aarch64__ // on aarch64 newtls (unsigned long) precedes ctid (int *), hence "%p %lu %p"
+TRACEPOINT(trace_syscall_sys_clone, "%d <= 0x%x 0x%x %p %lu %p", int, unsigned long, void *, int *, unsigned long, int *);
+#endif
+TRACEPOINT(trace_syscall_sys_clone3, "%d <= %p %lu", int, struct clone_args *, size_t);
TRACEPOINT(trace_syscall_prlimit64, "%d <= %u %d %p %p", int, pid_t, int, const struct rlimit *, struct rlimit *);
TRACEPOINT(trace_syscall_msync, "%d <= 0x%x %lu %d", int, void *, size_t, int);
TRACEPOINT(trace_syscall_truncate, "%d <= %s %ld", int, const char *, off_t);
@@ -739,6 +809,7 @@ TRACEPOINT(trace_syscall_rt_sigtimedwait, "%d <= %p %p %p %lu", int, const sigse
TRACEPOINT(trace_syscall_getrlimit, "%d <= %d %p", int, int, struct rlimit *);
TRACEPOINT(trace_syscall_getpriority, "%d <= %d %d", int, int, int);
TRACEPOINT(trace_syscall_setpriority, "%d <= %d %d %d", int, int, int, int);
+TRACEPOINT(trace_syscall_ppoll, "%d <= %p %ld %p %p", int, struct pollfd *, nfds_t, const struct timespec *, const sigset_t *);
OSV_LIBC_API long syscall(long number, ...)
{
@@ -885,8 +956,11 @@ OSV_LIBC_API long syscall(long number, ...)
SYSCALL1(sys_set_tid_address, int *);
#ifdef __x86_64__
SYSCALL5(sys_clone, unsigned long, void *, int *, int *, unsigned long);
- SYSCALL2(sys_clone3, struct clone_args *, size_t);
#endif
+#ifdef __aarch64__
+ SYSCALL5(sys_clone, unsigned long, void *, int *, unsigned long, int *);
+#endif
+ SYSCALL2(sys_clone3, struct clone_args *, size_t);
SYSCALL4(prlimit64, pid_t, int, const struct rlimit *, struct rlimit *);
SYSCALL3(msync, void *, size_t, int);
SYSCALL2(truncate, const char *, off_t);
@@ -898,6 +972,7 @@ OSV_LIBC_API long syscall(long number, ...)
SYSCALL2(getrlimit, int, struct rlimit *);
SYSCALL2(getpriority, int, int);
SYSCALL3(setpriority, int, int, int);
+ SYSCALL4(ppoll, struct pollfd *, nfds_t, const struct timespec *, const sigset_t *);
}
debug_always("syscall(): unimplemented system call %d\n", number);