Code Review - Add exception table kernel fixup (revised)


Davide Libenzi

Oct 15, 2015, 8:43:26 PM
to aka...@googlegroups.com


The following changes since commit fad2f1e22bbe4a7c8d9f5fb9df7ea4f5b563bd9b:

  Allow freeaddrinfo(NULL) (XCC) (2015-10-14 17:14:45 -0400)

are available in the git repository at:

  g...@github.com:dlibenzi/akaros user_memory_access

for you to fetch changes up to 1e38a5455115b6ef1dd1b2505d38182b9d9adc75:

  Added new kernel test case (2015-10-15 17:34:50 -0700)

----------------------------------------------------------------
Davide Libenzi (5):
      Added heapsort utility function to the lib/ framework.
      Added safe user memory access APIs
      Plugged the exception handling code
      Fixed a warning in the copy to/from user code.
      Added new kernel test case

 kern/arch/x86/trap.c            |  33 ++++++++-
 kern/arch/x86/uaccess.h         | 149 ++++++++++++++++++++++++++++++++++++++++
 kern/include/ex_table.h         |   9 +++
 kern/include/sort.h             |   9 +++
 kern/lib/Kbuild                 |   1 +
 kern/lib/sort.c                 |  41 +++++++++++
 kern/src/Kbuild                 |   1 +
 kern/src/ex_table.c             |  62 +++++++++++++++++
 kern/src/init.c                 |   2 +
 kern/src/ktest/Kconfig.postboot |   5 ++
 kern/src/ktest/pb_ktests.c      |  17 +++++
 11 files changed, 326 insertions(+), 3 deletions(-)
 create mode 100644 kern/arch/x86/uaccess.h
 create mode 100644 kern/include/ex_table.h
 create mode 100644 kern/include/sort.h
 create mode 100644 kern/lib/sort.c
 create mode 100644 kern/src/ex_table.c

diff --git a/kern/arch/x86/trap.c b/kern/arch/x86/trap.c
index e31b559..4b65310 100644
--- a/kern/arch/x86/trap.c
+++ b/kern/arch/x86/trap.c
@@ -16,6 +16,7 @@
 #include <syscall.h>
 #include <kdebug.h>
 #include <kmalloc.h>
+#include <ex_table.h>
 #include <arch/mptables.h>
 
 taskstate_t ts;
@@ -29,6 +30,18 @@ pseudodesc_t idt_pd;
 struct irq_handler *irq_handlers[NUM_IRQS];
 spinlock_t irq_handler_wlock = SPINLOCK_INITIALIZER_IRQSAVE;
 
+static bool try_handle_exception_fixup(struct hw_trapframe *hw_tf)
+{
+ uintptr_t fixup_ip;
+
+ if (in_kernel(hw_tf) && ((fixup_ip = get_fixup_ip(hw_tf->tf_rip)) != 0)) {
+ hw_tf->tf_rip = fixup_ip;
+ return true;
+ }
+
+ return false;
+}
+
 const char *x86_trapname(int trapno)
 {
  static const char *const excnames[] = {
@@ -229,6 +242,9 @@ static bool __handle_page_fault(struct hw_trapframe *hw_tf, unsigned long *aux)
  enable_irq();
 
  if (!pcpui->cur_proc) {
+ if (try_handle_exception_fixup(hw_tf))
+ return TRUE;
+
  /* still catch KPFs */
  assert((hw_tf->tf_cs & 3) == 0);
  print_trapframe(hw_tf);
@@ -248,6 +264,9 @@ static bool __handle_page_fault(struct hw_trapframe *hw_tf, unsigned long *aux)
  if (in_kernel(hw_tf))
  pcpui->__lock_checking_enabled++;
  if (err) {
+ if (try_handle_exception_fixup(hw_tf))
+ return TRUE;
+
  if (in_kernel(hw_tf)) {
  print_trapframe(hw_tf);
  backtrace_kframe(hw_tf);
@@ -285,6 +304,8 @@ static void trap_dispatch(struct hw_trapframe *hw_tf)
  struct per_cpu_info *pcpui;
  bool handled = TRUE;
  unsigned long aux = 0;
+ uintptr_t fixup_ip;
+
  // Handle processor exceptions.
  switch(hw_tf->tf_trapno) {
  case T_NMI:
@@ -342,7 +363,8 @@ static void trap_dispatch(struct hw_trapframe *hw_tf)
  handled = __handle_page_fault(hw_tf, &aux);
  break;
  case T_FPERR:
- handle_fperr(hw_tf);
+ if (!(handled = try_handle_exception_fixup(hw_tf)))
+ handle_fperr(hw_tf);
  break;
  case T_SYSCALL:
  enable_irq();
@@ -362,8 +384,13 @@ static void trap_dispatch(struct hw_trapframe *hw_tf)
  handled = FALSE;
  }
  }
- if (!handled)
- reflect_unhandled_trap(hw_tf->tf_trapno, hw_tf->tf_err, aux);
+
+ if (!handled) {
+ if ((fixup_ip = get_fixup_ip(hw_tf->tf_rip)) != 0)
+ hw_tf->tf_rip = fixup_ip;
+ else
+ reflect_unhandled_trap(hw_tf->tf_trapno, hw_tf->tf_err, aux);
+ }
 }
 
 /* Helper.  For now, this copies out the TF to pcpui.  Eventually, we should
diff --git a/kern/arch/x86/uaccess.h b/kern/arch/x86/uaccess.h
new file mode 100644
index 0000000..472d065
--- /dev/null
+++ b/kern/arch/x86/uaccess.h
@@ -0,0 +1,149 @@
+#ifndef ROS_INC_UACCESS_H
+#define ROS_INC_UACCESS_H
+
+#include <ros/errno.h>
+#include <stdint.h>
+
+#define ASM_STAC
+#define ASM_CLAC
+#define __m(x) *(x)
+
+struct fixup {
+ uint64_t insn;
+ uint64_t fixup;
+};
+
+#define _ASM_EXTABLE_INIT() \
+ asm volatile( \
+ " .pushsection \"__ex_table\",\"a\"\n" \
+ " .balign 16\n" \
+ " .popsection\n" \
+ : :)
+
+#define _ASM_EXTABLE(from,to) \
+ " .pushsection \"__ex_table\",\"a\"\n" \
+ " .balign 16\n" \
+ " .quad (" #from ") - .\n" \
+ " .quad (" #to ") - .\n" \
+ " .popsection\n"
+
+#define _ASM_EXTABLE_EX(from,to) \
+ " .pushsection \"__ex_table_ex\",\"a\"\n" \
+ " .balign 16\n" \
+ " .quad (" #from ") - .\n" \
+ " .quad (" #to ") - . + 0x7ffffff0\n" \
+ " .popsection\n"
+
+#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret)       \
+ asm volatile(ASM_STAC "\n" \
+ "1:        mov"itype" %"rtype"1,%2\n" \
+             "2: " ASM_CLAC "\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3:        mov %3,%0\n" \
+ "  jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r"(err) \
+ : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
+
+#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile(ASM_STAC "\n"                                      \
+ "1:        mov"itype" %2,%"rtype"1\n"              \
+ "2: " ASM_CLAC "\n"                                \
+ ".section .fixup,\"ax\"\n" \
+ "3:        mov %3,%0\n"                            \
+ "  xor"itype" %"rtype"1,%"rtype"1\n"               \
+ "  jmp 2b\n"                                       \
+ ".previous\n"                                      \
+ _ASM_EXTABLE(1b, 3b)                               \
+ : "=r" (err), ltype(x)                             \
+ : "m" (__m(addr)), "i" (errret), "0" (err))
+
+#define __user_memcpy(dst, src, count, err, errret) \
+ asm volatile(ASM_STAC "\n" \
+ "1:        rep movsb\n" \
+             "2: " ASM_CLAC "\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3:        mov %4,%0\n" \
+ "  jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r"(err) \
+ : "D" (dst), "S" (src), "c" (count), "i" (errret), "0" (err))
+
+static inline int copy_to_user(void *dst, const void *src, unsigned int count)
+{
+ int err = 0;
+
+ if (!__builtin_constant_p(count))
+ __user_memcpy(dst, src, count, err, -EFAULT);
+ else {
+ switch (count) {
+ case 1:
+ __put_user_asm(*(const uint8_t *) src, (uint8_t *) dst, err, "b",
+   "b", "iq", -EFAULT);
+ break;
+ case 2:
+ __put_user_asm(*(const uint16_t *) src, (uint16_t *) dst, err, "w",
+   "w", "ir", -EFAULT);
+ break;
+ case 4:
+ __put_user_asm(*(const uint32_t *) src, (uint32_t *) dst, err, "l",
+   "k", "ir", -EFAULT);
+ break;
+ case 8:
+ __put_user_asm(*(const uint64_t *) src, (uint64_t *) dst, err, "q",
+   "", "er", -EFAULT);
+ break;
+ default:
+ __user_memcpy(dst, src, count, err, -EFAULT);
+ }
+ }
+
+ return err;
+}
+
+static inline int copy_from_user(void *dst, const void *src,
+ unsigned int count)
+{
+ int err = 0;
+
+ if (!__builtin_constant_p(count))
+ __user_memcpy(dst, src, count, err, -EFAULT);
+ else {
+ switch (count) {
+ case 1:
+ __get_user_asm(*(uint8_t *) dst, (const uint8_t *) src, err, "b",
+   "b", "=q", -EFAULT);
+ break;
+ case 2:
+ __get_user_asm(*(uint16_t *) dst, (const uint16_t *) src, err, "w",
+   "w", "=r", -EFAULT);
+ break;
+ case 4:
+ __get_user_asm(*(uint32_t *) dst, (const uint32_t *) src, err, "l",
+   "k", "=r", -EFAULT);
+ break;
+ case 8:
+ __get_user_asm(*(uint64_t *) dst, (const uint64_t *) src, err, "q",
+   "", "=r", -EFAULT);
+ break;
+ default:
+ __user_memcpy(dst, src, count, err, -EFAULT);
+ }
+ }
+
+ return err;
+}
+
+static inline uintptr_t ex_insn_addr(const struct fixup *x)
+{
+ return (uintptr_t) &x->insn + x->insn;
+}
+
+static inline uintptr_t ex_fixup_addr(const struct fixup *x)
+{
+ return (uintptr_t) &x->fixup + x->fixup;
+}
+
+#endif /* !ROS_INC_UACCESS_H */
diff --git a/kern/include/ex_table.h b/kern/include/ex_table.h
new file mode 100644
index 0000000..8ef2885
--- /dev/null
+++ b/kern/include/ex_table.h
@@ -0,0 +1,9 @@
+#ifndef ROS_INC_EX_TABLE_H
+#define ROS_INC_EX_TABLE_H
+
+#include <stdint.h>
+
+void exception_table_init(void);
+uintptr_t get_fixup_ip(uintptr_t xip);
+
+#endif /* ROS_INC_EX_TABLE_H */
diff --git a/kern/include/sort.h b/kern/include/sort.h
new file mode 100644
index 0000000..11b98b6
--- /dev/null
+++ b/kern/include/sort.h
@@ -0,0 +1,9 @@
+#ifndef ROS_INC_LIB_SORT_H
+#define ROS_INC_LIB_SORT_H
+
+#include <stdio.h>
+
+void sort(void *base, size_t count, size_t size,
+          int (*cmp)(const void *, const void *));
+
+#endif /* ROS_INC_LIB_SORT_H */
diff --git a/kern/lib/Kbuild b/kern/lib/Kbuild
index 1cb408f..aa4a004 100644
--- a/kern/lib/Kbuild
+++ b/kern/lib/Kbuild
@@ -1,2 +1,3 @@
+obj-y += sort.o
 obj-y += zlib_deflate/
 obj-y += zlib_inflate/
diff --git a/kern/lib/sort.c b/kern/lib/sort.c
new file mode 100644
index 0000000..ec448df
--- /dev/null
+++ b/kern/lib/sort.c
@@ -0,0 +1,41 @@
+
+#include <stdio.h>
+
+static void mem_swap(void *a, void *b, size_t size)
+{
+ for (; size > 0; size--, a++, b++) {
+ char tmp = *(char*) a;
+
+ *(char *) a = *(char *) b;
+ *(char *) b = tmp;
+ }
+}
+
+void sort(void *base, size_t count, size_t size,
+          int (*cmp)(const void *, const void *))
+{
+ ssize_t n = count * size, c;
+
+ for (ssize_t i = (count / 2 - 1) * size; i >= 0; i -= size) {
+ for (ssize_t r = i; r * 2 + size < n; r = c) {
+ c = r * 2 + size;
+ if (c < n - size && cmp(base + c, base + c + size) < 0)
+ c += size;
+ if (cmp(base + r, base + c) >= 0)
+ break;
+ mem_swap(base + r, base + c, size);
+ }
+ }
+
+ for (ssize_t i = n - size; i > 0; i -= size) {
+ mem_swap(base, base + i, size);
+ for (ssize_t r = 0; r * 2 + size < i; r = c) {
+ c = r * 2 + size;
+ if (c < i - size && cmp(base + c, base + c + size) < 0)
+ c += size;
+ if (cmp(base + r, base + c) >= 0)
+ break;
+ mem_swap(base + r, base + c, size);
+ }
+ }
+}
diff --git a/kern/src/Kbuild b/kern/src/Kbuild
index 0981c66..d2a9d18 100644
--- a/kern/src/Kbuild
+++ b/kern/src/Kbuild
@@ -26,6 +26,7 @@ obj-y += error.o
 obj-$(CONFIG_ETH_AUDIO) += eth_audio.o
 obj-y += event.o
 obj-y += ext2fs.o
+obj-y += ex_table.o
 obj-y += fdtap.o
 obj-y += find_next_bit.o
 obj-y += find_last_bit.o
diff --git a/kern/src/ex_table.c b/kern/src/ex_table.c
new file mode 100644
index 0000000..8f060a7
--- /dev/null
+++ b/kern/src/ex_table.c
@@ -0,0 +1,62 @@
+
+#include <ex_table.h>
+#include <arch/uaccess.h>
+#include <sort.h>
+
+extern struct fixup __start___ex_table;
+extern struct fixup __stop___ex_table;
+
+static int fixup_cmp(const void *f1, const void *f2)
+{
+ return ((const struct fixup *) f1)->insn <
+ ((const struct fixup *) f2)->insn ? -1: +1;
+}
+
+void exception_table_init(void)
+{
+ struct fixup *first = &__start___ex_table;
+ struct fixup *last = &__stop___ex_table;
+ uint64_t offset = 0;
+
+ for (struct fixup *fx = first; fx < last; fx++) {
+ fx->insn += offset;
+ offset += sizeof(fx->insn);
+ fx->fixup += offset;
+ offset += sizeof(fx->fixup);
+ }
+
+ sort(first, last - first, sizeof(*first), fixup_cmp);
+
+ offset = 0;
+ for (struct fixup *fx = first; fx < last; fx++) {
+ fx->insn -= offset;
+ offset += sizeof(fx->insn);
+ fx->fixup -= offset;
+ offset += sizeof(fx->fixup);
+ }
+
+ /* Avoid undefined __start___ex_table and __stop___ex_table errors when
+ * no code is using the exception table facility.
+ */
+ _ASM_EXTABLE_INIT();
+}
+
+uintptr_t get_fixup_ip(uintptr_t xip)
+{
+ const struct fixup *first = &__start___ex_table;
+ const struct fixup *last = &__stop___ex_table;
+
+ while (first <= last) {
+ const struct fixup *x = first + ((last - first) >> 1);
+ uintptr_t insn = ex_insn_addr(x);
+
+ if (insn < xip)
+ first = x + 1;
+ else if (insn > xip)
+ last = x - 1;
+ else
+ return (uintptr_t) ex_fixup_addr(x);
+ }
+
+ return 0;
+}
diff --git a/kern/src/init.c b/kern/src/init.c
index a795119..ba88555 100644
--- a/kern/src/init.c
+++ b/kern/src/init.c
@@ -29,6 +29,7 @@
 #include <radix.h>
 #include <mm.h>
 #include <frontend.h>
+#include <ex_table.h>
 
 #include <arch/init.h>
 #include <bitmask.h>
@@ -64,6 +65,7 @@ void kernel_init(multiboot_info_t *mboot_info)
  cons_init();
  print_cpuinfo();
 
+ exception_table_init();
  cache_init(); // Determine systems's cache properties
  pmem_init(multiboot_kaddr);
  kmem_cache_init();              // Sets up slab allocator
diff --git a/kern/src/ktest/Kconfig.postboot b/kern/src/ktest/Kconfig.postboot
index 90b61ca..70ef774 100644
--- a/kern/src/ktest/Kconfig.postboot
+++ b/kern/src/ktest/Kconfig.postboot
@@ -238,3 +238,8 @@ config TEST_u16pool
     depends on PB_KTESTS
     bool "u16 pool"
     default n
+
+config TEST_uaccess
+    depends on PB_KTESTS
+    bool "Tests user memory access fault trapping"
+    default y
diff --git a/kern/src/ktest/pb_ktests.c b/kern/src/ktest/pb_ktests.c
index 5dfd003..1866b6e 100644
--- a/kern/src/ktest/pb_ktests.c
+++ b/kern/src/ktest/pb_ktests.c
@@ -6,6 +6,7 @@
 
 #include <arch/mmu.h>
 #include <arch/arch.h>
+#include <arch/uaccess.h>
 #include <bitmask.h>
 #include <smp.h>
 
@@ -2085,6 +2086,21 @@ bool test_u16pool(void)
  return FALSE;
 }
 
+bool test_uaccess(void)
+{
+ char buf[128] = { 0 };
+ char buf2[128] = { 0 };
+
+ if (copy_to_user((void *) UDATA, buf, sizeof(buf)) == 0)
+ return false;
+ if (copy_from_user(buf, (const void *) UDATA, sizeof(buf)) == 0)
+ return false;
+ if (copy_from_user(buf, buf2, sizeof(buf)) != 0)
+ return false;
+
+ return true;
+}
+
 static struct ktest ktests[] = {
 #ifdef CONFIG_X86
  KTEST_REG(ipi_sending,        CONFIG_TEST_ipi_sending),
@@ -2124,6 +2140,7 @@ static struct ktest ktests[] = {
  KTEST_REG(alarm,              CONFIG_TEST_alarm),
  KTEST_REG(kmalloc_incref,     CONFIG_TEST_kmalloc_incref),
  KTEST_REG(u16pool,            CONFIG_TEST_u16pool),
+ KTEST_REG(uaccess,            CONFIG_TEST_uaccess),
 };
 static int num_ktests = sizeof(ktests) / sizeof(struct ktest);
 linker_func_1(register_pb_ktests)

Kevin Klues

Oct 16, 2015, 1:26:53 AM
to Akaros
On Thursday, October 15, 2015 at 5:43:26 PM UTC-7, Davide Libenzi wrote:

A better url to use to look at this (which is derived from the sha1s below) is:

Notice that the link is to brho/akaros, not dlibenzi/akaros (it actually doesn't matter which remote you use since the shas are shared for all forks of the same repo).  Not that I'm promoting this, but if you comment on these commits when the url has brho/akaros in it, github will generate notifications back to the list... They will not be in response to this thread though, which is a bummer.  Each commit will start its own thread, but at least we found a way to get it to work if we want to... 
 
The following changes since commit fad2f1e22bbe4a7c8d9f5fb9df7ea4f5b563bd9b:

  Allow freeaddrinfo(NULL) (XCC) (2015-10-14 17:14:45 -0400)

are available in the git repository at:

  g...@github.com:dlibenzi/akaros user_memory_access

for you to fetch changes up to 1e38a5455115b6ef1dd1b2505d38182b9d9adc75:

  Added new kernel test case (2015-10-15 17:34:50 -0700)


In the function below, I'd rework the declaration for fixup_ip and the if statement to read:

uintptr_t fixup_ip = get_fixup_ip(hw_tf->tf_rip);

if (in_kernel(hw_tf) && (fixup_ip != 0)) {
    ...
}
 
+static bool try_handle_exception_fixup(struct hw_trapframe *hw_tf)
+{
+ uintptr_t fixup_ip;
 
+
+ if (in_kernel(hw_tf) && ((fixup_ip = get_fixup_ip(hw_tf->tf_rip)) != 0)) {
 
+ hw_tf->tf_rip = fixup_ip;
+ return true;
+ }
+
+ return false;
+}
+


...
 
@@ -285,6 +304,8 @@ static void trap_dispatch(struct hw_trapframe *hw_tf)
  struct per_cpu_info *pcpui;
  bool handled = TRUE;
  unsigned long aux = 0;

Likewise, below, I'd do:

uintptr_t fixup_ip = get_fixup_ip(hw_tf->tf_rip);
 
+ uintptr_t fixup_ip; 
+
  // Handle processor exceptions.
  switch(hw_tf->tf_trapno) {
  case T_NMI:

... 

@@ -362,8 +384,13 @@ static void trap_dispatch(struct hw_trapframe *hw_tf)
  handled = FALSE;
  }
  }
- if (!handled)
- reflect_unhandled_trap(hw_tf->tf_trapno, hw_tf->tf_err, aux);
+
+ if (!handled) {

And avoid the assignment in the if statement below...
 
+ if ((fixup_ip = get_fixup_ip(hw_tf->tf_rip)) != 0)
+ hw_tf->tf_rip = fixup_ip;
+ else
+ reflect_unhandled_trap(hw_tf->tf_trapno, hw_tf->tf_err, aux);
+ }
 }
 

 
The struct name below seems unnecessarily vague if we are going to pollute the namespace of any file that #includes <uaccess.h>.  Maybe insn_fixup?

+struct fixup {
+ uint64_t insn;
+ uint64_t fixup;
+};
+

I've been guilty of this myself in the past, but the '\' characters below are all misaligned.  It's probably an artifact of my mail client using 8-space tabs, though. I assume it lines up when using 4-space tabs?
 
+#define _ASM_EXTABLE_INIT() \
+ asm volatile( \
+ " .pushsection \"__ex_table\",\"a\"\n" \
+ " .balign 16\n" \
+ " .popsection\n" \
+ : :)

...
 
+#endif /* ROS_INC_LIB_SORT_H */
diff --git a/kern/lib/Kbuild b/kern/lib/Kbuild
index 1cb408f..aa4a004 100644
--- a/kern/lib/Kbuild
+++ b/kern/lib/Kbuild
@@ -1,2 +1,3 @@

I'm fine with either way, but when Barret mentioned mimicking the way we did things for zlib*, he may have meant putting sort.c in a subfolder called sort.  Seems a bit like overkill for a single file though (unless we plan to eventually implement multiple sorting strategies and extend sort.h accordingly).
 
+obj-y += sort.o
 obj-y += zlib_deflate/
 obj-y += zlib_inflate/

...

Again, I'd use insn_fixup or something more descriptive than just "fixup" in the names of stuff here.  Though this function is static, so it's only a minor issue here...
 
+extern struct fixup __start___ex_table;
+extern struct fixup __stop___ex_table;
+
+static int fixup_cmp(const void *f1, const void *f2)
+{
+ return ((const struct fixup *) f1)->insn <
+ ((const struct fixup *) f2)->insn ? -1: +1;
+}
+

...
 
I'd probably use the ktest asserts to print out descriptive errors instead of simply returning false on failure in the function below. 
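Something like this, for example (just a quick sketch, assuming the KT_ASSERT_M macro from the ktest framework; pick whatever messages you like):

bool test_uaccess(void)
{
    char buf[128] = { 0 };

    /* UDATA should not be mapped here, so both copies must fault */
    KT_ASSERT_M("copy_to_user() to an unmapped user address should fail",
                copy_to_user((void *) UDATA, buf, sizeof(buf)) != 0);
    KT_ASSERT_M("copy_from_user() from an unmapped user address should fail",
                copy_from_user(buf, (const void *) UDATA, sizeof(buf)) != 0);

    return true;
}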

Davide Libenzi

Oct 16, 2015, 9:42:18 AM
to Akaros
On Thu, Oct 15, 2015 at 10:26 PM, Kevin Klues <klu...@gmail.com> wrote:
On Thursday, October 15, 2015 at 5:43:26 PM UTC-7, Davide Libenzi wrote:

A better url to use to look at this (which is derived from the sha1s below is:

Notice that the link is to brho/akaros, not dlibenzi/akaros (it actually doesn't matter which remote you use since the shas are shared for all forks of the same repo).  Not that I'm promoting this, but if you comment on these commits when the url has brho/akaros in it, github will generate notifications back to the list... They will not be in response to this thread though, which is a bummer.  Each commit will start it's own thread, but at least we found a way to get it to work if we want to... 

How did you generate that link?

 
 
The following changes since commit fad2f1e22bbe4a7c8d9f5fb9df7ea4f5b563bd9b:

  Allow freeaddrinfo(NULL) (XCC) (2015-10-14 17:14:45 -0400)

are available in the git repository at:

  g...@github.com:dlibenzi/akaros user_memory_access

for you to fetch changes up to 1e38a5455115b6ef1dd1b2505d38182b9d9adc75:

  Added new kernel test case (2015-10-15 17:34:50 -0700)


In the function below, I'd rework the declaration for fixup_ip and the if statement to read:

uintptr_t fixup_ip = get_fixup_ip(hw_tf->tf_rip);

if (in_kernel(hw_tf) && (fixup_ip != 0)) {
    ...
}

Yeah, forgot. I would have gotten the same comment if this was a Linux patch ☺
Will fix that.
The code above is actually gone. I generated the email/patch before some further fixups.

 
 
The struct name below seems unnecessarily vague if we are going to pollute the namespace of any file that #includes <uaccess.h>.  Maybe insn_fixup?

Agreed. I'd make it even more explicit, like extable_ip_fixup?


 

+struct fixup {
+ uint64_t insn;
+ uint64_t fixup;
+};
+

I've been guilty of this myself in the past, but the '\' characters below are all misaligned.  It's probably an artifact of my mail client using 8-space tabs, though. I assume it lines up when using 4-space tabs?

It looks fine in my emacs configured with 4-space tabs.
I noticed myself that in the email it's all screwed up 😐

 
 
+#define _ASM_EXTABLE_INIT() \
+ asm volatile( \
+ " .pushsection \"__ex_table\",\"a\"\n" \
+ " .balign 16\n" \
+ " .popsection\n" \
+ : :)

...
 
+#endif /* ROS_INC_LIB_SORT_H */
diff --git a/kern/lib/Kbuild b/kern/lib/Kbuild
index 1cb408f..aa4a004 100644
--- a/kern/lib/Kbuild
+++ b/kern/lib/Kbuild
@@ -1,2 +1,3 @@

I'm fine with either way, but when Barret mentioned mimicking the way we did things for zlib*, he may have meant putting sort.c in a subfolder called sort.  Seems a bit like overkill for a single file though (unless we plan to eventually implement multiple sorting strategies and extend sort.h accordingly).

I can do it, but it seems a little bit excessive (a folder for 30 lines of code), as we aren't libsort 😀
I do not expect many different (if any) sort implementations here.

 
 
+obj-y += sort.o
 obj-y += zlib_deflate/
 obj-y += zlib_inflate/

...

Again, I'd use insn_fixup or something more descriptive than just "fixup" in the names of stuff here.  Though this function is static, so it's only a minor issue here...
 
+extern struct fixup __start___ex_table;
+extern struct fixup __stop___ex_table;
+
+static int fixup_cmp(const void *f1, const void *f2)
+{
+ return ((const struct fixup *) f1)->insn <
+ ((const struct fixup *) f2)->insn ? -1: +1;
+}
+

...
 
I'd probably use the ktest asserts to print out descriptive errors instead of simply returning false on failure in the function below. 

Agreed. I also need to check if the user address is within valid user address ranges.
Is there a function for that?

 

+bool test_uaccess(void)
+{
+ char buf[128] = { 0 };
+ char buf2[128] = { 0 };
+
+ if (copy_to_user((void *) UDATA, buf, sizeof(buf)) == 0)
+ return false;
+ if (copy_from_user(buf, (const void *) UDATA, sizeof(buf)) == 0)
+ return false;
+ if (copy_from_user(buf, buf2, sizeof(buf)) != 0)
+ return false;
+
+ return true;
+}
+

--
You received this message because you are subscribed to the Google Groups "Akaros" group.
To unsubscribe from this group and stop receiving emails from it, send an email to akaros+un...@googlegroups.com.
To post to this group, send email to aka...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Davide Libenzi

Oct 16, 2015, 10:13:53 AM
to Akaros

Agreed. I also need to check if the user address is within valid user address ranges.
Is there a function for that?

Never mind. Found it.

Barret Rhoden

Oct 16, 2015, 10:24:31 AM
to aka...@googlegroups.com
On 2015-10-16 at 06:42 "'Davide Libenzi' via Akaros"
<aka...@googlegroups.com> wrote:
> >> https://github.com/dlibenzi/akaros/commits/user_memory_access
> >>

What's the overall intent of this patch set? I'm a little worried
about where you're going with this. I had mentioned before wanting to
use error() for handling kernel page faults. I don't know that using
Linux's approach to user memory accesses is the way Akaros should do it.

> The code above is actually gone. I generate the email/patch before
> some further fixup.

I'll take a look at the next version then.

> >> +#endif /* ROS_INC_LIB_SORT_H */
> >> diff --git a/kern/lib/Kbuild b/kern/lib/Kbuild
> >> index 1cb408f..aa4a004 100644
> >> --- a/kern/lib/Kbuild
> >> +++ b/kern/lib/Kbuild
> >> @@ -1,2 +1,3 @@
> >>
> >
> > I'm fine with either way, but when Barret mentioned mimicing the
> > way we did things for zlib*, he may have meant putting the sort.c
> > in a subfolder called sort. Seems a bit like overkill though for a
> > single file (unless we plan to eventually implement multiple
> > sorting strategies and we extend sort.h accordingly).
> >
>
> I can do it, but it seems a little bit excessive (a folder for 30
> lines of code), as we aren't libsort 😀
> I do not expect many different (if at all) sort implementation here.

I think leaving it in kern/lib/ is fine. If we have a bunch of other
sorts, like bubblesort!, then we can make a sort directory. =)

> Agreed. I also need to check if the user address is within valid user
> address ranges.
> Is there a function for that?

kern/include/umem.h:

static inline bool is_user_raddr(void *addr, size_t len)
static inline bool is_user_rwaddr(void *addr, size_t len)
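A call site would look something like this (uptr and len being whatever user pointer and length you are about to touch):

        if (!is_user_rwaddr((void *) uptr, len))
                return -EFAULT;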

Barret

Davide Libenzi

Oct 16, 2015, 10:32:21 AM
to Akaros
On Fri, Oct 16, 2015 at 7:24 AM, Barret Rhoden <br...@cs.berkeley.edu> wrote:
On 2015-10-16 at 06:42 "'Davide Libenzi' via Akaros"
<aka...@googlegroups.com> wrote:
> >> https://github.com/dlibenzi/akaros/commits/user_memory_access
> >>

What's the overall intent of this patch set?  I'm a little worried
about where you're going with this.  I had mentioned before wanting to
use error() for handling kernel page faults.  I don't know that using
Linux's approach to user memory accesses is the way Akaros should do it.

Looking at how we copy memory from/to user in umem.c (directly walking page tables), the Linux way is quite a bit faster.
You simply just access memory.
If you access user memory which requires a page fault, that would happen automatically.
If it goes well, you won't even notice; otherwise the fixup will get you into the ASM label where you return -EFAULT.

Nothing prevents us from having an error()-throwing version, wrapping the core one which returns -EFAULT.
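Something along these lines, just to sketch it (the helper name is made up, and the exact error()/set_errno() calls would need to match whatever the current error API looks like):

static void copy_from_user_or_throw(void *dst, const void *src, unsigned int count)
{
    if (copy_from_user(dst, src, count) != 0) {
        /* unwinds to the nearest waserror() frame */
        set_errno(EFAULT);
        error("invalid user address");
    }
}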
There are many places in the kernel where we cannot "throw", because locks are not protected by waserror()s.
Yep, thanks, that's where my eye fell ☺
 

Barret

Barret Rhoden

Oct 16, 2015, 10:51:41 AM
to aka...@googlegroups.com
On 2015-10-16 at 07:32 "'Davide Libenzi' via Akaros"
<aka...@googlegroups.com> wrote:
> Looking at how we copy memory from/to user in umem.c (directly
> walking page tables), the Linux way is quite a bit faster.
> You simply just access memory.

That code is quite old. We also just access user memory. There are
some old places where we still do memcpy_from_user, and those are
horrid. The newer style is to access user memory and deal with the
page fault after the fact - similar to Linux.

A lot of our process management and event code directly accesses user
memory. For instance, the ucqs are working on user memory for many of
its accesses.  In the common case, we don't grab locks while accessing
user memory, but it can happen. (grep "ultimate_fallback" in
k/s/event.c, spam_vcore() usually calls send_ucq_msg, and the entire
ucq struct is a user pointer.).

> If you access user memory which require page fault, that would happen
> automatically.
> If it goes well, you won't even notice, otherwise the fixup will get
> you into the ASM label where you return -EFAULT.

I need to spend a little time reading up on the exception tables. I
don't know if marking all of the places where we access user memory is
viable (assuming that's how the tables are built).

> Nothing prevents us to have an error() throwing version, wrapping the
> core one which returns -EFAULT.
> There are many places in the kernel where we cannot "throw", because
> locks are not protected by waserror()s.

My intent was to protect many of those places with some form of
waserror. Originally, before we had waserror, I was planning on
building a form of unwind based on function pointers and void *s. If
you happened to have a lock grabbed (even a spinlock), or some other
state that needed to be unwound, you'd have to push a cleanup in
anticipation of that. We'd have a stack of them in some cases. At the
root, there'd be a "kill the process" handler (if the user gave us a
bad or unmapped address, that is a critical fault on their part -
that's an Akaros thing).
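Purely as an illustration of that old idea (none of these names exist in the tree):

struct cleanup {
    void (*func)(void *arg);
    void *arg;
};

/* push_cleanup()/pop_cleanup() would keep a per-kthread stack of these; on a
 * fault while accessing user memory, the kernel would run the stack top-down,
 * ending with the root "kill the process" handler. */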

Anyway, waserror() came along and seemed like it would do the trick.
Perhaps the exception tables would also, but I'm not sure about it, and
I'd prefer to go slowly in critical areas like this.

Barret

Davide Libenzi

Oct 16, 2015, 11:28:47 AM
to Akaros
On Fri, Oct 16, 2015 at 7:51 AM, Barret Rhoden <br...@cs.berkeley.edu> wrote:
On 2015-10-16 at 07:32 "'Davide Libenzi' via Akaros"
<aka...@googlegroups.com> wrote:
> Looking at how we copy memory from/to user in umem.c (directly
> walking page tables), the Linux way is quite a bit faster.
> You simply just access memory.

That code is quite old.  We also just access user memory.  There are
some old places where we still do memcpy_from_user, and those are
horrid.  The newer style is to access user memory and deal with the
page fault after the fact - similar to Linux.

A lot of our process management and event code directly accesses user
memory.  For instance, the ucqs are working on user memory for many of
it's accesses.  In the common case, we don't grab locks while accessing
user memory, but it can happen.  (grep "ultimate_fallback" in
k/s/event.c, spam_vcore() usually calls send_ucq_msg, and the entire
ucq struct is a user pointer.).

> If you access user memory which require page fault, that would happen
> automatically.
> If it goes well, you won't even notice, otherwise the fixup will get
> you into the ASM label where you return -EFAULT.

I need to spend a little time reading up on the exception tables.  I
don't know if marking all of the places where we access user memory is
viable (assuming that's how the tables are built).

Having dedicated functions to access user memory would be nice, to avoid spreading pte checks and user address checks all over.
In Linux, and many other OSs, user memory access is limited to system call time, but I see that in Akaros event delivery you need to touch user memory when there is no user context up the stack.
For example, in event delivery you would simply do a switch-to and a copy-to-user.
Is it OK for Akaros to drop the event (at least, this is what seems to happen) in case there is something wrong with the user memory?
Shouldn't it be getting some EFAULT event as a consequence?


Anyway, waserror() came along and seemed like it would do the trick.
Perhaps the exception tables would also, but I'm not sure about it, and
I'd prefer to go slowly in critical areas like this.

IMO they are not mutually exclusive. I don't think you can safely error() from a page fault handler either.
This is where the exception tables give you the choice of getting an -EFAULT back at the memory access call site, and eventually error()-ing from there.



Kevin Klues

Oct 16, 2015, 7:07:35 PM
to aka...@googlegroups.com
From Davide's updated branch:

-------
I've updated the dlibenzi:user_memory_access branch to incorporate
Kevin's comments (hey Kev, I'm getting good with rebase -i ).

https://github.com/brho/akaros/compare/master...dlibenzi:user_memory_access?diff=unified&ts=4

It also removed a fix-previous-commit change, and added a small change
to umem.h memory range test API.

I need to add positive test cases. Unfortunately, in order to do that,
I need valid RW memory mapped at user address ranges.
------------

This function looks much cleaner now, thanks.

+static bool try_handle_exception_fixup(struct hw_trapframe *hw_tf)
+{
+ if (in_kernel(hw_tf)) {
+ uintptr_t fixup_ip = get_fixup_ip(hw_tf->tf_rip);
+
+ if (fixup_ip != 0) {
+ hw_tf->tf_rip = fixup_ip;
+ return true;
+ }
+ }
+
+ return false;
+}
+

Again, I would separate out the assignment from the if statement below:

+ if (!(handled = try_handle_exception_fixup(hw_tf)))
+ handle_fperr(hw_tf);

With a tab size of 4, I still see some misalignment of '\' at the
end of the lines. However, that seems to just be an artifact of the
'+' at the beginning of the line when I do a diff. When I open the
file directly, everything looks aligned.

The struct naming below is much better...

+struct extable_ip_fixup {
+ uint64_t insn;
+ uint64_t fixup;
+};

The new changes below LGTM:

-static inline bool __is_user_addr(void *addr, size_t len, uintptr_t lim)
+static inline bool __is_user_addr(const void *addr, size_t len, uintptr_t lim)
{
- if ((MMAP_LOWEST_VA <= (uintptr_t)addr) &&
- ((uintptr_t)addr < lim) &&
- ((uintptr_t)addr + len <= lim))
- return TRUE;
- else
- return FALSE;
+ if (unlikely((uintptr_t) addr < MMAP_LOWEST_VA))
+ return false;
+ if (unlikely((uintptr_t) addr >= lim))
+ return false;
+ if (unlikely(lim - (uintptr_t) addr < len))
+ return false;
+
+ return true;
}

The test in:

kern/src/ktest/pb_ktests.c

Looks better now using the asserts. We should consider backporting
some of the macro changes I made to the utest macros to ktest (this
won't change anything you have currently though).

https://github.com/brho/akaros/commit/8cdb64f7a3d00ce370e4eebb3a093bc9d7018772


On Fri, Oct 16, 2015 at 8:28 AM, 'Davide Libenzi' via Akaros
<aka...@googlegroups.com> wrote:
>
> On Fri, Oct 16, 2015 at 7:51 AM, Barret Rhoden <br...@cs.berkeley.edu> wrote:
>>
>> On 2015-10-16 at 07:32 "'Davide Libenzi' via Akaros"
>> <aka...@googlegroups.com> wrote:
>> > Looking at how we copy memory from/to user in umem.c (directly
>> > walking page tables), the Linux way is quite a bit faster.
>> > You simply just access memory.
>>
>> That code is quite old. We also just access user memory. There are
>> some old places where we still do memcpy_from_user, and those are
>> horrid. The newer style is to access user memory and deal with the
>> page fault after the fact - similar to Linux.
>>
>> A lot of our process management and event code directly accesses user
>> memory. For instance, the ucqs are working on user memory for many of
>> it's accesses. In the common case, we don't grab locks while accessing
>> user memory, but it can happen. (grep "ultimate_fallback" in
>> k/s/event.c, spam_vcore() usually calls send_ucq_msg, and the entire
>> ucq struct is a user pointer.).

What happens if the user memory being touched gets unmapped out from
under the kernel (e.g. by another core running the user process)?  As
things stand now, wouldn't that cause a kernel panic (i.e. handled ==
0 in arch/x86/trap.c)?  I think Davide's macros are designed so we can
write defensive kernel code that will let us fault gracefully, so we
can return to the call site and recover appropriately.

>>
>> > If you access user memory which require page fault, that would happen
>> > automatically.
>> > If it goes well, you won't even notice, otherwise the fixup will get
>> > you into the ASM label where you return -EFAULT.
>>
>> I need to spend a little time reading up on the exception tables. I
>> don't know if marking all of the places where we access user memory is
>> viable (assuming that's how the tables are built).

That is what would happen. Everything is done via macros, so every
call site would get its own entry in the table.

> Having dedicated functions to access user memory would be nice, to avoid
> spread pte checks and user address checks all over.
> In Linux, and many other OSs, user memory access in only limited at system
> call time, but I see that in Akaros event delivery you need to touch user
> memory when there is not a user context up the stack.
> Example, in the event delivery, you would simply do a switch-to, and
> copy-to-user.
> Is that OK for Akaros to drop the event (at least, this is what seem to
> happen) in case there is something wrong with user memory?
> Shouldn't it be getting some EFAULT event as consequence?

I'll defer this question to Barret. In any case, I still don't see
how we can avoid potentially faulting even if we check the ranges (I
guess we could lock on the pte so an munmap from another core would
fail or stall, but this seems unnecessarily pricey).

>
>> Anyway, waserror() came along and seemed like it would do the trick.
>> Perhaps the exception tables would also, but I'm not sure about it, and
>> I'd prefer to go slowly in critical areas like this.

Even with waserror(), it's not clear to me how we would recover if a
trap actually occurs. Would the idea be to try and protect user memory
such that a trap would *never* occur (by locking on the pte)?

> IMO they are not exclusive. I don't think you can safely error() from a page
> fault handler also.
> This is where the exception tables give you the choice of getting an -EFAULT
> into memory access location call site, and eventually error() from there.

I'm leaning towards agreement with Davide on this (but maybe because I
don't understand very well how the waserror() stuff would work).  I
can clearly see how the exception table stuff would work though (with
the added benefit of avoiding a lock on the user pte).


--
~Kevin

Davide Libenzi

Oct 16, 2015, 7:43:41 PM
to Akaros
On Fri, Oct 16, 2015 at 4:06 PM, Kevin Klues <klu...@gmail.com> wrote:
From Davide's updated branch:

-------
I've updated the dlibenzi:user_memory_access branch to incorporate
Kevin's comments (hey Kev, I'm getting good with rebase -i ).

https://github.com/brho/akaros/compare/master...dlibenzi:user_memory_access?diff=unified&ts=4

It also removed a fix-previous-commit change, and added a small change
to umem.h memory range test API.

I need to add positive test cases. Unfortunately, in order to do that,
I need valid RW memory mapped at user address ranges.
------------

Again, I would separate out the assignment from the if statement below:

+                       if (!(handled = try_handle_exception_fixup(hw_tf)))
+                               handle_fperr(hw_tf);

Sorry, I missed that. Fixed now.


>> > If you access user memory which require page fault, that would happen
>> > automatically.
>> > If it goes well, you won't even notice, otherwise the fixup will get
>> > you into the ASM label where you return -EFAULT.
>>
>> I need to spend a little time reading up on the exception tables.  I
>> don't know if marking all of the places where we access user memory is
>> viable (assuming that's how the tables are built).

That is what would happen. Everything is done via macros, so every
call site would get its own entry in the table.

Yes, you do not have to manually mark all the places.
You'd just use the two APIs to copy to/from user memory, which either succeed and return 0,
or fail and return -EFAULT.
In case it is not clear, this is how it works, in pseudo code (example of a one-byte copy):

int copy(char *dst, const char *src)
{
INSN:
    *dst = *src;
    return 0;
FIXUP:
    return -EFAULT;

    ASM_TABLE(__ex_table) += { INSN, FIXUP };
}

At init we sort the __ex_table ELF section by INSN address.
If an invalid operation happens at the INSN address, we end up calling, within
the trap.c handler, the fixup lookup API, which either returns zero or a fixup IP.
In the latter case, we fix up the return IP so that the code resumes at FIXUP
and ends up returning -EFAULT.



> Having dedicated functions to access user memory would be nice, to avoid
> spread pte checks and user address checks all over.
> In Linux, and many other OSs, user memory access in only limited at system
> call time, but I see that in Akaros event delivery you need to touch user
> memory when there is not a user context up the stack.
> Example, in the event delivery, you would simply do a switch-to, and
> copy-to-user.
> Is that OK for Akaros to drop the event (at least, this is what seem to
> happen) in case there is something wrong with user memory?
> Shouldn't it be getting some EFAULT event as consequence?

I'll defer this question to Barret.  In any case, I still don't see
how we can avoid potentially faulting even if we check the ranges (I
guess we could lock on the pte so an munmap from another core would
fail or stall, but this seems unnecessarily pricey).

One thing I noticed is that an event delivery triggers a CR3 reload, am I right?
This is because we need to set the target process context so the VM maps to the
target process.
If we had a kernel API to allocate event buffers, we could play VM tricks and map
the same event buffer pages both in user and kernel space.
Then event delivery would not trigger CR3 reloads.


Barret Rhoden

Oct 30, 2015, 4:15:12 PM
to aka...@googlegroups.com
In answer to this from a couple weeks ago:

On 2015-10-16 at 16:43 "'Davide Libenzi' via Akaros"
<aka...@googlegroups.com> wrote:
> > > Is that OK for Akaros to drop the event (at least, this is what
> > > seem to happen) in case there is something wrong with user memory?
> > > Shouldn't it be getting some EFAULT event as consequence?

For the most part, if the kernel fails in delivering an event to the
user because of a bad pointer, that is considered the user misbehaving,
and the kernel can kill the process. In that case, it is fine to drop
the event since the process isn't around to receive it.

> > I'll defer this question to Barret. In any case, I still don't see
> > how we can avoid potentially faulting even if we check the ranges (I
> > guess we could lock on the pte so an munmap from another core would
> > fail or stall, but this seems unnecessarily pricey).
> >
>
> One thing I noticed, is that an event delivery triggers a CR3 reload,
> am I right?
> This because we need to set the target process context so VM maps to
> target process.

If we aren't in the CR3 of the process already, then yes, we'll reload.

> If we had a kernel APIs to allocate event buffers, we could play VM
> tricks, and map
> the same event buffer pages, both in user and kernel space.
> So event delivery would not trigger CR3 reloads.

It's not that easy, since there is a lot of pointer chasing. For the
first part, the kernel follows a pointer to an event queue structure.
Then we go into the evq's mailbox. Based on the type of mailbox, we
might follow more pointers from there.

I also want to avoid dynamically mapping / unmapping memory into the
kernel's address space. That adds a layer of complexity to the whole
system. Right now, once the boot_pgdir is set (which is before any
processes exist), the kernel's part of the address space never changes.

Barret



Davide Libenzi

Oct 30, 2015, 8:02:36 PM
to Akaros

On Fri, Oct 30, 2015 at 1:15 PM, Barret Rhoden <br...@cs.berkeley.edu> wrote:
> If we had a kernel APIs to allocate event buffers, we could play VM
> tricks, and map
> the same event buffer pages, both in user and kernel space.
> So event delivery would not trigger CR3 reloads.

It's not that easy, since there is a lot of pointer chasing.  For the
first part, the kernel follows a pointer to an event queue structure.
Then we go into the evq's mailbox.  Based on the type of mailbox, we
might follow more pointers from there.

I also want to avoid dynamically mapping / unmapping memory into the
kernel's address space.  That adds a layer of complexity to the whole
system.  Right now, once the boot_pgdir is set (which is before any
processes exist), the kernel's part of the address space never changes.

To me, it can be a pretty big win to avoid CR3 reloads on high frequency event deliveries like network, for example.
If we have only one major application using the NIC, we could route IRQs to the cores owned by the application, so that when an IRQ triggers an event delivery, the switch_to already finds the target task as "current", so no CR3 reload happens.
But if that's not the case, you can have thousands of CR3 reloads per second on whatever process happens to be running on the IRQ target core.

Maybe the structure of the event queue could be made simpler?
From the user POV, I guess this is already abstracted by APIs, right?



Kevin Klues

Nov 3, 2015, 12:11:30 PM
to Akaros
Merged on a different branch as described in this thread:

Barret Rhoden

Nov 5, 2015, 1:46:20 PM
to aka...@googlegroups.com
On 2015-10-30 at 17:02 "'Davide Libenzi' via Akaros"
<aka...@googlegroups.com> wrote:
> To me, it can be a pretty big win to avoid CR3 reloads on high
> frequency event deliveries like network, for example.

Yeah, that's a good point. One of the things we do in a couple places
is keep a cr3 around for a while even if it's not being used (or at
least I used to do this, I'd have to look around). The kernel can use
any process's address space, so as an optimization, we can be lazy
about clearing the cr3 (and thus 'current') if we're asked to
switch_back to 0. The tradeoff is that processes may sit around in the
DYING state for a while (the only cost is memory, I think).

Another mitigating factor is that there might not be exactly one event
for every packet. For most of the events that we send, we're already
in the address space of the process. Specifically, our most common
event (I think!) is the "blocked syscall has completed" event. This is
done by a kthread that was already in the process's address space, so
the switch_to is mostly harmless in those cases.

> If we have only one major application using the NIC, we could route
> IRQs to the cores owned by the application, so that when an IRQ
> triggers an event delivery, the switch_to already finds the target
> task as "current", so not CR3 reload happen.

This is part of the long range plan for network intensive workloads.
The rule for MCPs is that there are no *unexpected* interrupts. A
process (in theory, not in the current code) can request to have
particular IRQs serviced on its cores. Right now, we mostly just
service them on Core 0, though the code exists to route_irqs() - we
just haven't designed or built the user interfaces right yet.

> Maybe the structure of the event queue could be made simpler?
> From the user POV, I guess this is already abstracted by APIs, right?

For an application, there is an API to extract a message from the
mailbox, and in general the address space issues don't affect them
much.

From the kernel's perspective, one idea I had was to use uva2kva to
translate user pointers to kernel pointers. That basically is a page
table walk in software. The danger is that the user's address space
changes after we've walked the page tables. The solution there is to
use some form of deferred-destruction when it comes to munmap (the
main concern is that the page isn't *reused* after being munmapped
while the kernel thinks it's still the user's memory).  As usual, I have
some notes on that too, which we can pull out if the switch_tos pop up
as a pain point for an application's performance.

Barret

Davide Libenzi

Nov 5, 2015, 5:20:03 PM
to Akaros
On Thu, Nov 5, 2015 at 10:46 AM, Barret Rhoden <br...@cs.berkeley.edu> wrote:
> Maybe the structure of the event queue could be made simpler?
> From the user POV, I guess this is already abstracted by APIs, right?

For an application, there is an API to extract a message from the
mailbox, and in general the address space issues don't affect them
much.

From the kernel's perspective, one idea I had was to use uva2kva to
translate user pointers to kernel pointers.  That basically is a page
table walk in software.  The danger is that the user's address space
changes after we've walked the page tables.  The solution there is to
use some form of deferred-destruction when it comes to munmap (the
main concern is that the page isn't *reused* after being munmapped
while the kernel thinks its still the user's memory).  As usual, I have
some notes on that too, which we can pull out if the switch_tos pop up
as a pain point for an application's performance.

Here is what I was thinking.
There is a VM area of N pages, at KMSG within the kernel-only area, and at UMSG in the user VM space.
The user VM space [UMSG, UMSG+N] is out of mmap() reach.
Userspace has an API we give them to create a mailbox, which turns into a syscall.
The syscall would get the necessary pages, find a free spot X in [KMSG, KMSG+N], and map them there, as well as at UMSG + (X - KMSG).
To deliver an event to a user mailbox pointer P, the kernel will use (P - UMSG + KMSG).
No need to swap CR3.
The mailbox destruction would undo the mapping, and the struct proc will have to track the mappings too, so that process destruction can undo them as well.
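In code, the kernel side translation would be trivial (sketch; UMSG and KMSG here are just the hypothetical base addresses from above, not existing symbols):

static inline void *umsg_to_kmsg(void *uptr)
{
    return (void *) ((uintptr_t) uptr - UMSG + KMSG);
}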




Barret Rhoden

Nov 6, 2015, 5:37:05 PM
to aka...@googlegroups.com
On 2015-11-05 at 14:20 "'Davide Libenzi' via Akaros"
<aka...@googlegroups.com> wrote:
> Here is what I was thinking.
> There is an VM area of N pages, at KMSG within kernel only area, and
> at UMSG at user VM space.
> The user VM space [UMSG, UMSG+N] is out of mmap() reach.
> Userspace has an API we give them to create a mailbox, which turns
> into a syscall.
> The syscall would get the necessary pages, find a free spot X in
> [KMSG, KMSG+N], map them there, as well as at UMSG + (X - KMSG).
> To deliver an event to user mailbox pointer P, the kernel will use (P
> - UMSG + KMSG).
> No need to swap CR3.
> The mailbox destruction would undo the mapping, and the struct proc
> will have to track the mappings as well, so that a process
> destruction can undo them as well.

I think there are a couple difficulties with this. One is that
some mailboxes consist of pointers to other user memory (requiring more
mappings), and those grow and shrink over time. Also, some mailboxes
are statically declared and exist in procdata, and there might be a
bootstrap issue with dynamically allocating them (though since they
are in procdata, we might be able to get by with having the kernel use
another mapping).

Another issue is that we don't just deref the memory for the mailbox.
We also look at the event_queue * for its flags and whatnot.

Barret

Davide Libenzi

Nov 6, 2015, 7:43:37 PM
to Akaros
Yes, clearly what I described above requires a change in the layout of the data structure.
But the kernel could stay out of it.  Meaning, the syscall to create a mailbox could simply be asking for a chunk of specially mapped pages.
All the inter-pointer setup can be done by the userspace interface.
The kernel would just find pointers to where to deliver, but how/where those pointers point is up to the userspace handling of the mapped area.
No?


