[COMMIT osv master] syscall: use GS register to store syscall stack address

7 views
Skip to first unread message

Commit Bot

unread,
Oct 12, 2023, 10:03:48 PM10/12/23
to osv...@googlegroups.com, Waldemar Kozaczuk
From: Waldemar Kozaczuk <jwkoz...@gmail.com>
Committer: WALDEMAR KOZACZUK <jwkoz...@gmail.com>
Branch: master

syscall: use GS register to store syscall stack address

The original syscall implementation utilized the TCB (Thread Control Block)
to store the syscall stack address. On each context switch, the
FS segment register would be reset to point to the thread-specific
TCB, from which the syscall handler would fetch its stack address.

In order to support statically linked executables, which allocate and use
their own TCB, OSv needs to be able to switch the FS register between
the user and kernel addresses when handling syscalls. The syscall handler
can no longer fetch its stack address from the kernel TCB because the FS
register points to the app TCB. In order to break this dependency,
this patch changes all relevant code to move the syscall stack address
and the syscall caller stack pointer to the per-CPU memory area addressed
by the GS segment register.

To that end, we define a new structure - syscall_stack_descriptor - that
describes a syscall stack: its top and the SYSCALL caller stack pointer.
Then, we add a new field '_syscall_stack_descriptor' to thread_state
to store information about each thread's allocated syscall stack.

In addition, we add a new field '_current_syscall_stack_descriptor' to
the per-cpu structure arch_cpu and initialize each CPU's GS register
to point to it. Finally, thread::switch_to() is changed
to update the stack_top and caller_stack_pointer fields of
'_current_syscall_stack_descriptor' on each context switch so that
the syscall handler can fetch the syscall stack address using the GS segment.

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>

---
diff --git a/arch/aarch64/arch-switch.hh b/arch/aarch64/arch-switch.hh
--- a/arch/aarch64/arch-switch.hh
+++ b/arch/aarch64/arch-switch.hh
@@ -164,6 +164,10 @@ void thread::free_tcb()
free(_tcb);
}

+void thread::free_syscall_stack()
+{
+}
+
void thread_main_c(thread* t)
{
arch::irq_enable();
diff --git a/arch/x64/arch-cpu.hh b/arch/x64/arch-cpu.hh
--- a/arch/x64/arch-cpu.hh
+++ b/arch/x64/arch-cpu.hh
@@ -13,6 +13,8 @@
#include "cpuid.hh"
#include "osv/pagealloc.hh"
#include <xmmintrin.h>
+#include "syscall.hh"
+#include "msr.hh"

struct init_stack {
char stack[4096] __attribute__((aligned(16)));
@@ -46,6 +48,13 @@ struct arch_cpu {
u32 apic_id;
u32 acpi_id;
u64 gdt[nr_gdt];
+ // This field holds a syscall stack descriptor of a current thread
+ // which is updated on every context switch (see arch-switch.hh).
+ // We keep this field in this per-cpu structure and initialize GS register
+ // of the corresponding cpu to point to it (see init_on_cpu() down below),
+ // in order to make it possible to access it in assembly code through
+ // a known offset at %gs:0.
+ syscall_stack_descriptor _current_syscall_stack_descriptor;
void init_on_cpu();
void set_ist_entry(unsigned ist, char* base, size_t size);
char* get_ist_entry(unsigned ist);
@@ -181,6 +190,8 @@ inline void arch_cpu::init_on_cpu()
processor::init_fpu();

processor::init_syscall();
+
+ processor::wrmsr(msr::IA32_GS_BASE, reinterpret_cast<u64>(&_current_syscall_stack_descriptor.stack_top));
}

struct exception_guard {
diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -37,10 +37,10 @@
#define LARGE_SYSCALL_STACK_DEPTH (LARGE_SYSCALL_STACK_SIZE - SYSCALL_STACK_RESERVED_SPACE_SIZE)

#define SET_SYSCALL_STACK_TYPE_INDICATOR(value) \
-*reinterpret_cast<long*>(_tcb->syscall_stack_top) = value;
+*reinterpret_cast<long*>(_state._syscall_stack_descriptor.stack_top) = value;

#define GET_SYSCALL_STACK_TYPE_INDICATOR() \
-*reinterpret_cast<long*>(_tcb->syscall_stack_top)
+*reinterpret_cast<long*>(_state._syscall_stack_descriptor.stack_top)

#define TINY_SYSCALL_STACK_INDICATOR 0l
#define LARGE_SYSCALL_STACK_INDICATOR 1l
@@ -88,8 +88,14 @@ void thread::switch_to()
barrier();
auto c = _detached_state->_cpu;
old->_state.exception_stack = c->arch.get_exception_stack();
+ // save the old thread SYSCALL caller stack pointer in the syscall stack descriptor
+ old->_state._syscall_stack_descriptor.caller_stack_pointer = c->arch._current_syscall_stack_descriptor.caller_stack_pointer;
c->arch.set_interrupt_stack(&_arch);
c->arch.set_exception_stack(_state.exception_stack);
+ // set this cpu current thread syscall stack descriptor to the values copied from the new thread syscall stack descriptor
+ // so that the syscall handler can reference the current thread syscall stack top using the GS register
+ c->arch._current_syscall_stack_descriptor.caller_stack_pointer = _state._syscall_stack_descriptor.caller_stack_pointer;
+ c->arch._current_syscall_stack_descriptor.stack_top = _state._syscall_stack_descriptor.stack_top;
auto fpucw = processor::fnstcw();
auto mxcsr = processor::stmxcsr();
asm volatile
@@ -161,6 +167,25 @@ void thread::init_stack()
_state.rip = reinterpret_cast<void*>(thread_main);
_state.rsp = stacktop;
_state.exception_stack = _arch.exception_stack + sizeof(_arch.exception_stack);
+
+ if (is_app()) {
+ //
+ // Allocate TINY syscall call stack
+ void* tiny_syscall_stack_begin = malloc(TINY_SYSCALL_STACK_SIZE);
+ assert(tiny_syscall_stack_begin);
+ //
+ // The top of the stack needs to be 16 bytes lower to make space for
+ // OSv syscall stack type indicator and extra 8 bytes to make it 16-bytes aligned
+ _state._syscall_stack_descriptor.stack_top = tiny_syscall_stack_begin + TINY_SYSCALL_STACK_DEPTH;
+ SET_SYSCALL_STACK_TYPE_INDICATOR(TINY_SYSCALL_STACK_INDICATOR);
+ //
+ // Set a canary value at the bottom of the tiny stack to catch potential overflow
+ // caused by setup_large_syscall_stack()
+ *reinterpret_cast<u64*>(tiny_syscall_stack_begin) = STACK_CANARY;
+ }
+ else {
+ _state._syscall_stack_descriptor.stack_top = 0;
+ }
}

void thread::setup_tcb()
@@ -247,25 +272,6 @@ void thread::setup_tcb()
_tcb = static_cast<thread_control_block*>(p + total_tls_size);
_tcb->self = _tcb;
_tcb->tls_base = p + user_tls_size;
-
- if (is_app()) {
- //
- // Allocate TINY syscall call stack
- void* tiny_syscall_stack_begin = malloc(TINY_SYSCALL_STACK_SIZE);
- assert(tiny_syscall_stack_begin);
- //
- // The top of the stack needs to be 16 bytes lower to make space for
- // OSv syscall stack type indicator and extra 8 bytes to make it 16-bytes aligned
- _tcb->syscall_stack_top = tiny_syscall_stack_begin + TINY_SYSCALL_STACK_DEPTH;
- SET_SYSCALL_STACK_TYPE_INDICATOR(TINY_SYSCALL_STACK_INDICATOR);
- //
- // Set a canary value at the bottom of the tiny stack to catch potential overflow
- // caused by setup_large_syscall_stack()
- *reinterpret_cast<u64*>(tiny_syscall_stack_begin) = STACK_CANARY;
- }
- else {
- _tcb->syscall_stack_top = 0;
- }
}

void thread::setup_large_syscall_stack()
@@ -287,20 +293,23 @@ void thread::setup_large_syscall_stack()
// We could have copied only last 128 (registers) + 16 bytes (2 fields) instead
// of all of the stack but copying 1024 is simpler and happens
// only once per thread.
- void* tiny_syscall_stack_top = _tcb->syscall_stack_top;
+ void* tiny_syscall_stack_top = _state._syscall_stack_descriptor.stack_top;
memcpy(large_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH,
tiny_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH, TINY_SYSCALL_STACK_SIZE);
//
// Check if the tiny stack has not been overflowed
- assert(*reinterpret_cast<u64*>(_tcb->syscall_stack_top - TINY_SYSCALL_STACK_DEPTH) == STACK_CANARY);
+ assert(*reinterpret_cast<u64*>(_state._syscall_stack_descriptor.stack_top - TINY_SYSCALL_STACK_DEPTH) == STACK_CANARY);
//
// Save beginning of tiny stack at the bottom of LARGE stack so
// that we can deallocate it in free_tiny_syscall_stack
*((void**)large_syscall_stack_begin) = tiny_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH;
//
// Switch syscall stack address value in TCB to the top of the LARGE one
- _tcb->syscall_stack_top = large_syscall_stack_top;
+ _state._syscall_stack_descriptor.stack_top = large_syscall_stack_top;
SET_SYSCALL_STACK_TYPE_INDICATOR(LARGE_SYSCALL_STACK_INDICATOR);
+ //
+ // Switch what GS points to
+ _detached_state->_cpu->arch._current_syscall_stack_descriptor.stack_top = large_syscall_stack_top;
}

void thread::free_tiny_syscall_stack()
@@ -312,7 +321,7 @@ void thread::free_tiny_syscall_stack()
assert(is_app());
assert(GET_SYSCALL_STACK_TYPE_INDICATOR() == LARGE_SYSCALL_STACK_INDICATOR);

- void* large_syscall_stack_top = _tcb->syscall_stack_top;
+ void* large_syscall_stack_top = _state._syscall_stack_descriptor.stack_top;
void* large_syscall_stack_begin = large_syscall_stack_top - LARGE_SYSCALL_STACK_DEPTH;
//
// Lookup address of tiny stack saved by setup_large_syscall_stack()
@@ -329,11 +338,14 @@ void thread::free_tcb()
} else {
free(_tcb->tls_base);
}
+}

- if (_tcb->syscall_stack_top) {
+void thread::free_syscall_stack()
+{
+ if (_state._syscall_stack_descriptor.stack_top) {
void* syscall_stack_begin = GET_SYSCALL_STACK_TYPE_INDICATOR() == TINY_SYSCALL_STACK_INDICATOR ?
- _tcb->syscall_stack_top - TINY_SYSCALL_STACK_DEPTH :
- _tcb->syscall_stack_top - LARGE_SYSCALL_STACK_DEPTH;
+ _state._syscall_stack_descriptor.stack_top - TINY_SYSCALL_STACK_DEPTH :
+ _state._syscall_stack_descriptor.stack_top - LARGE_SYSCALL_STACK_DEPTH;
free(syscall_stack_begin);
}
}
diff --git a/arch/x64/arch-thread-state.hh b/arch/x64/arch-thread-state.hh
--- a/arch/x64/arch-thread-state.hh
+++ b/arch/x64/arch-thread-state.hh
@@ -8,11 +8,16 @@
#ifndef ARCH_THREAD_STATE_HH_
#define ARCH_THREAD_STATE_HH_

+#include "syscall.hh"
+
struct thread_state {
char *exception_stack;
void* rsp;
void* rbp;
void* rip;
+ // The descriptor of the syscall stack intended to be used when handling
+ // SYSCALL instruction on this specific thread
+ syscall_stack_descriptor _syscall_stack_descriptor;
};

#endif /* ARCH_THREAD_STATE_HH_ */
diff --git a/arch/x64/arch-tls.hh b/arch/x64/arch-tls.hh
--- a/arch/x64/arch-tls.hh
+++ b/arch/x64/arch-tls.hh
@@ -13,23 +13,6 @@
struct thread_control_block {
thread_control_block* self;
void* tls_base;
- //
- // This field, a per-thread stack for SYSCALL instruction, is used in
- // arch/x64/entry.S for %fs's offset. We currently keep this field in the TCB
- // to make it easier to access in assembly code through a known offset at %fs:16.
- // But with more effort, we could have used an ordinary thread-local variable
- // instead and avoided extending the TCB here.
- //
- // The 8 bytes at the top of the syscall stack are used to identify if
- // the stack is tiny (0) or large (1). So the size of the syscall stack is in
- // reality smaller by 16 bytes from what was originally allocated because we need
- // to make it 16-bytes aligned.
- void* syscall_stack_top;
- //
- // This field is used to store the syscall caller stack pointer (value of RSP when
- // SYSCALL was called) so that it can be restored when syscall completed.
- // Same as above this field could be an ordinary thread-local variable.
- void* syscall_caller_stack_pointer;
};

#endif /* ARCH_TLS_HH */
diff --git a/arch/x64/msr.hh b/arch/x64/msr.hh
--- a/arch/x64/msr.hh
+++ b/arch/x64/msr.hh
@@ -62,6 +62,7 @@ enum class msr : uint32_t {
IA32_LSTAR = 0xc0000082,
IA32_FMASK = 0xc0000084,
IA32_FS_BASE = 0xc0000100,
+ IA32_GS_BASE = 0xc0000101,

KVM_WALL_CLOCK = 0x11,
KVM_SYSTEM_TIME = 0x12,
diff --git a/arch/x64/syscall.S b/arch/x64/syscall.S
--- a/arch/x64/syscall.S
+++ b/arch/x64/syscall.S
@@ -19,14 +19,13 @@ syscall_entry:
.cfi_register rflags, r11 # r11 took previous rflags value
# There is no ring transition and rflags are left unchanged.
#
- # Unfortunately the mov instruction cannot be used to dereference an address
- # on syscall stack pointed by address in TCB (%fs:16) - double memory dereference.
- # Therefore we are forced to save caller stack address in a field in TCB.
- movq %rsp, %fs:24 # syscall_caller_stack_pointer
+ # Save the caller stack address in the field "caller_stack_pointer" of
+ # the per-cpu _current_syscall_stack_descriptor structure referenced by GS.
+ movq %rsp, %gs:8 # syscall_caller_stack_pointer
#
# Switch stack to "tiny" syscall stack that should be large
# enough to setup "large" syscall stack (only when first SYSCALL on this thread)
- movq %fs:16, %rsp
+ movq %gs:0, %rsp

# Skip large syscall stack setup if it has been already setup
cmpq $0, (%rsp) // Check if we are on tiny or large stack
@@ -56,7 +55,7 @@ syscall_entry:
# This function does not take any arguments nor returns anything.
# It ends up allocating large stack and storing its address in tcb
callq setup_large_syscall_stack
- movq %fs:16, %rsp // Switch stack to large stack
+ movq %gs:0, %rsp // Switch stack to large stack
subq $128, %rsp // Skip 128 bytes of large stack so that we can restore all registers saved above (16 pushes).
// Please note that these 128 bytes have been copied by setup_large_syscall_stack function
// so that we do not have to pop and then push same registers again.
@@ -95,7 +94,7 @@ syscall_entry:
# We do this just so we can refer to it with CFI and help gdb's DWARF
# stack unwinding. This saving not otherwise needed for correct operation
# (we anyway restore it below by undoing all our modifications).
- pushq %fs:24
+ pushq %gs:8

.cfi_adjust_cfa_offset 8
.cfi_rel_offset %rsp, 0
@@ -172,7 +171,7 @@ syscall_entry:
popfq

# Restore caller stack pointer
- movq %fs:24, %rsp
+ movq %gs:8, %rsp

# jump to rcx where the syscall instruction put rip
# (sysret would leave rxc cloberred so we have nothing to do to restore it)
diff --git a/arch/x64/syscall.hh b/arch/x64/syscall.hh
--- a/arch/x64/syscall.hh
+++ b/arch/x64/syscall.hh
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2023 Waldemar Kozaczuk
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#ifndef SYSCALL_HH_
+#define SYSCALL_HH_
+
+//This structure "describes" a per-thread syscall stack used when handling
+//the SYSCALL instruction. There are 2 places it is used:
+// - thread_state - holds information about this thread syscall stack
+// - arch_cpu - holds information about this cpu CURRENT thread syscall stack
+struct syscall_stack_descriptor {
+ // The address of the top of the syscall stack.
+ //
+ // The 8 bytes at the top of the stack are used to identify if the stack
+ // is tiny (0) or large (1). Therefore the size of the syscall stack is
+ // in reality smaller by 16 bytes from what was originally allocated because we need
+ // to make it 16-bytes aligned.
+ void* stack_top;
+ //
+ // This field is used to store the syscall caller stack pointer (value of RSP when
+ // SYSCALL was called) so that it can be restored when syscall completes.
+ void* caller_stack_pointer;
+};
+
+
+#endif
diff --git a/core/sched.cc b/core/sched.cc
--- a/core/sched.cc
+++ b/core/sched.cc
@@ -1221,6 +1221,7 @@ thread::~thread()
delete[] _tls[i];
}
free_tcb();
+ free_syscall_stack();
rcu_dispose(_detached_state.release());
}

diff --git a/include/osv/sched.hh b/include/osv/sched.hh
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -707,6 +707,7 @@ private:
void init_stack();
void setup_tcb();
void free_tcb();
+ void free_syscall_stack();
void complete() __attribute__((__noreturn__));
template <class Action>
inline void do_wake_with(Action action, unsigned allowed_initial_states_mask);
Reply all
Reply to author
Forward
0 new messages