[PATCH 1/3] x86/insn-eval: Add support for 64-bit kernel mode

7 views
Skip to first unread message

Jann Horn

unread,
Nov 12, 2019, 4:10:14 PM11/12/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org
To support evaluating 64-bit kernel mode instructions:

Replace existing checks for user_64bit_mode() with a new helper that
checks whether code is being executed in either 64-bit kernel mode or
64-bit user mode.

Select the GS base depending on whether the instruction is being
evaluated in kernel mode.

Signed-off-by: Jann Horn <ja...@google.com>
---
arch/x86/include/asm/ptrace.h | 13 +++++++++++++
arch/x86/lib/insn-eval.c | 26 +++++++++++++++-----------
2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5057a8ed100b..ac45b06941a5 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -159,6 +159,19 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#endif
}

+/*
+ * Determine whether the register set came from any context that is running in
+ * 64-bit mode.
+ */
+static inline bool any_64bit_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+ return !user_mode(regs) || user_64bit_mode(regs);
+#else
+ return false;
+#endif
+}
+
#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 306c3a0902ba..31600d851fd8 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -155,7 +155,7 @@ static bool check_seg_overrides(struct insn *insn, int regoff)
*/
static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
{
- if (user_64bit_mode(regs))
+ if (any_64bit_mode(regs))
return INAT_SEG_REG_IGNORE;
/*
* Resolve the default segment register as described in Section 3.7.4
@@ -266,7 +266,7 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
* which may be invalid at this point.
*/
if (regoff == offsetof(struct pt_regs, ip)) {
- if (user_64bit_mode(regs))
+ if (any_64bit_mode(regs))
return INAT_SEG_REG_IGNORE;
else
return INAT_SEG_REG_CS;
@@ -289,7 +289,7 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
* In long mode, segment override prefixes are ignored, except for
* overrides for FS and GS.
*/
- if (user_64bit_mode(regs)) {
+ if (any_64bit_mode(regs)) {
if (idx != INAT_SEG_REG_FS &&
idx != INAT_SEG_REG_GS)
idx = INAT_SEG_REG_IGNORE;
@@ -646,23 +646,27 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
*/
return (unsigned long)(sel << 4);

- if (user_64bit_mode(regs)) {
+ if (any_64bit_mode(regs)) {
/*
* Only FS or GS will have a base address, the rest of
* the segments' bases are forced to 0.
*/
unsigned long base;

- if (seg_reg_idx == INAT_SEG_REG_FS)
+ if (seg_reg_idx == INAT_SEG_REG_FS) {
rdmsrl(MSR_FS_BASE, base);
- else if (seg_reg_idx == INAT_SEG_REG_GS)
+ } else if (seg_reg_idx == INAT_SEG_REG_GS) {
/*
* swapgs was called at the kernel entry point. Thus,
* MSR_KERNEL_GS_BASE will have the user-space GS base.
*/
- rdmsrl(MSR_KERNEL_GS_BASE, base);
- else
+ if (user_mode(regs))
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ rdmsrl(MSR_GS_BASE, base);
+ } else {
base = 0;
+ }
return base;
}

@@ -703,7 +707,7 @@ static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx)
if (sel < 0)
return 0;

- if (user_64bit_mode(regs) || v8086_mode(regs))
+ if (any_64bit_mode(regs) || v8086_mode(regs))
return -1L;

if (!sel)
@@ -948,7 +952,7 @@ static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
* following instruction.
*/
if (*regoff == -EDOM) {
- if (user_64bit_mode(regs))
+ if (any_64bit_mode(regs))
tmp = regs->ip + insn->length;
else
tmp = 0;
@@ -1250,7 +1254,7 @@ static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs)
* After computed, the effective address is treated as an unsigned
* quantity.
*/
- if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit))
+ if (!any_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit))
goto out;

/*
--
2.24.0.432.g9d3f5f5b63-goog

Jann Horn

unread,
Nov 12, 2019, 4:10:18 PM11/12/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org
A frequent cause of #GP exceptions are memory accesses to non-canonical
addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
the kernel doesn't currently print the fault address for #GP.
Luckily, we already have the necessary infrastructure for decoding X86
instructions and computing the memory address that is being accessed;
hook it up to the #GP handler so that we can figure out whether the #GP
looks like it was caused by a non-canonical address, and if so, print
that address.

While it is already possible to compute the faulting address manually by
disassembling the opcode dump and evaluating the instruction against the
register dump, this should make it slightly easier to identify crashes
at a glance.

Signed-off-by: Jann Horn <ja...@google.com>
---
arch/x86/kernel/traps.c | 45 +++++++++++++++++++++++++++++++++++++++--
1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c90312146da0..479cfc6e9507 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -56,6 +56,8 @@
#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -509,6 +511,42 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
}

+/*
+ * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
+ * address, print that address.
+ */
+static void print_kernel_gp_address(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+ u8 insn_bytes[MAX_INSN_SIZE];
+ struct insn insn;
+ unsigned long addr_ref;
+
+ if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
+ return;
+
+ kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
+ insn_get_modrm(&insn);
+ insn_get_sib(&insn);
+ addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
+
+ /*
+ * If insn_get_addr_ref() failed or we got a canonical address in the
+ * kernel half, bail out.
+ */
+ if ((addr_ref | __VIRTUAL_MASK) == ~0UL)
+ return;
+ /*
+ * For the user half, check against TASK_SIZE_MAX; this way, if the
+ * access crosses the canonical address boundary, we don't miss it.
+ */
+ if (addr_ref <= TASK_SIZE_MAX)
+ return;
+
+ pr_alert("dereferencing non-canonical address 0x%016lx\n", addr_ref);
+#endif
+}
+
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
@@ -547,8 +585,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
return;

if (notify_die(DIE_GPF, desc, regs, error_code,
- X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- die(desc, regs, error_code);
+ X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
+ return;
+
+ print_kernel_gp_address(regs);
+ die(desc, regs, error_code);
return;
}

--
2.24.0.432.g9d3f5f5b63-goog

Jann Horn

unread,
Nov 12, 2019, 4:10:21 PM11/12/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org
Make #GP exceptions caused by out-of-bounds KASAN shadow accesses easier
to understand by computing the address of the original access and
printing that. More details are in the comments in the patch.

This turns an error like this:

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
traps: dereferencing non-canonical address 0xe017577ddf75b7dd
general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI

into this:

traps: dereferencing non-canonical address 0xe017577ddf75b7dd
kasan: maybe dereferencing invalid pointer in range
[0x00badbeefbadbee8-0x00badbeefbadbeef]
general protection fault: 0000 [#3] PREEMPT SMP KASAN PTI
[...]

Signed-off-by: Jann Horn <ja...@google.com>
---
arch/x86/include/asm/kasan.h | 6 +++++
arch/x86/kernel/traps.c | 2 ++
arch/x86/mm/kasan_init_64.c | 52 +++++++++++++++++++++++++-----------
3 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 13e70da38bed..eaf624a758ed 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -25,6 +25,12 @@

#ifndef __ASSEMBLY__

+#ifdef CONFIG_KASAN_INLINE
+void kasan_general_protection_hook(unsigned long addr);
+#else
+static inline void kasan_general_protection_hook(unsigned long addr) { }
+#endif
+
#ifdef CONFIG_KASAN
void __init kasan_early_init(void);
void __init kasan_init(void);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 479cfc6e9507..e271a5a1ddd4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -58,6 +58,7 @@
#include <asm/umip.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/kasan.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -544,6 +545,7 @@ static void print_kernel_gp_address(struct pt_regs *regs)
return;

pr_alert("dereferencing non-canonical address 0x%016lx\n", addr_ref);
+ kasan_general_protection_hook(addr_ref);
#endif
}

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 296da58f3013..9ef099309489 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -246,20 +246,44 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
}

#ifdef CONFIG_KASAN_INLINE
-static int kasan_die_handler(struct notifier_block *self,
- unsigned long val,
- void *data)
+/*
+ * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
+ * canonical half of the address space) cause out-of-bounds shadow memory reads
+ * before the actual access. For addresses in the low canonical half of the
+ * address space, as well as most non-canonical addresses, that out-of-bounds
+ * shadow memory access lands in the non-canonical part of the address space,
+ * causing #GP to be thrown.
+ * Help the user figure out what the original bogus pointer was.
+ */
+void kasan_general_protection_hook(unsigned long addr)
{
- if (val == DIE_GPF) {
- pr_emerg("CONFIG_KASAN_INLINE enabled\n");
- pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
- }
- return NOTIFY_OK;
-}
+ unsigned long orig_addr;
+ const char *addr_type;
+
+ if (addr < KASAN_SHADOW_OFFSET)
+ return;

-static struct notifier_block kasan_die_notifier = {
- .notifier_call = kasan_die_handler,
-};
+ orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT;
+ /*
+ * For faults near the shadow address for NULL, we can be fairly certain
+ * that this is a KASAN shadow memory access.
+ * For faults that correspond to shadow for low canonical addresses, we
+ * can still be pretty sure - that shadow region is a fairly narrow
+ * chunk of the non-canonical address space.
+ * But faults that look like shadow for non-canonical addresses are a
+ * really large chunk of the address space. In that case, we still
+ * print the decoded address, but make it clear that this is not
+ * necessarily what's actually going on.
+ */
+ if (orig_addr < PAGE_SIZE)
+ addr_type = "dereferencing kernel NULL pointer";
+ else if (orig_addr < TASK_SIZE_MAX)
+ addr_type = "probably dereferencing invalid pointer";
+ else
+ addr_type = "maybe dereferencing invalid pointer";
+ pr_alert("%s in range [0x%016lx-0x%016lx]\n", addr_type,
+ orig_addr, orig_addr + (1 << KASAN_SHADOW_SCALE_SHIFT) - 1);
+}
#endif

void __init kasan_early_init(void)
@@ -298,10 +322,6 @@ void __init kasan_init(void)
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;

-#ifdef CONFIG_KASAN_INLINE
- register_die_notifier(&kasan_die_notifier);
-#endif
-
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));

/*
--
2.24.0.432.g9d3f5f5b63-goog

Dmitry Vyukov

unread,
Nov 13, 2019, 5:11:46 AM11/13/19
to Jann Horn, Andrey Konovalov, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML
On Tue, Nov 12, 2019 at 10:10 PM 'Jann Horn' via kasan-dev
<kasa...@googlegroups.com> wrote:
>
> Make #GP exceptions caused by out-of-bounds KASAN shadow accesses easier
> to understand by computing the address of the original access and
> printing that. More details are in the comments in the patch.
>
> This turns an error like this:
>
> kasan: CONFIG_KASAN_INLINE enabled
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> traps: dereferencing non-canonical address 0xe017577ddf75b7dd
> general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI
>
> into this:
>
> traps: dereferencing non-canonical address 0xe017577ddf75b7dd
> kasan: maybe dereferencing invalid pointer in range
> [0x00badbeefbadbee8-0x00badbeefbadbeef]
> general protection fault: 0000 [#3] PREEMPT SMP KASAN PTI
> [...]

Nice!

+Andrey, do you see any issues for TAGS mode? Or, Jann, did you test
it by any chance?
Thinking how much sense it makes to compare addr with KASAN_SHADOW_END...
If the addr is > KASAN_SHADOW_END, we know it's not a KASAN access,
but do we ever get GP on canonical addresses?

>
> -static struct notifier_block kasan_die_notifier = {
> - .notifier_call = kasan_die_handler,
> -};
> + orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT;
> + /*
> + * For faults near the shadow address for NULL, we can be fairly certain
> + * that this is a KASAN shadow memory access.
> + * For faults that correspond to shadow for low canonical addresses, we
> + * can still be pretty sure - that shadow region is a fairly narrow
> + * chunk of the non-canonical address space.
> + * But faults that look like shadow for non-canonical addresses are a
> + * really large chunk of the address space. In that case, we still
> + * print the decoded address, but make it clear that this is not
> + * necessarily what's actually going on.
> + */
> + if (orig_addr < PAGE_SIZE)
> + addr_type = "dereferencing kernel NULL pointer";
> + else if (orig_addr < TASK_SIZE_MAX)
> + addr_type = "probably dereferencing invalid pointer";

This is access to user memory, right? In outline mode we call it
"user-memory-access". We could say about "user" part here as well.

> + else
> + addr_type = "maybe dereferencing invalid pointer";
> + pr_alert("%s in range [0x%016lx-0x%016lx]\n", addr_type,
> + orig_addr, orig_addr + (1 << KASAN_SHADOW_SCALE_SHIFT) - 1);

"(1 << KASAN_SHADOW_SCALE_SHIFT) - 1)" part may be replaced with
KASAN_SHADOW_MASK.
Overall it can make sense to move this to mm/kasan/report.c b/c we are
open-coding a number of things here (e.g. reverse address mapping). If
another arch will do the same, it will need all of this code too (?).

But in general I think it's a very good usability improvement for KASAN.

> +}
> #endif
>
> void __init kasan_early_init(void)
> @@ -298,10 +322,6 @@ void __init kasan_init(void)
> int i;
> void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
>
> -#ifdef CONFIG_KASAN_INLINE
> - register_die_notifier(&kasan_die_notifier);
> -#endif
> -
> memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
>
> /*
> --
> 2.24.0.432.g9d3f5f5b63-goog
>
> --
> You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20191112211002.128278-3-jannh%40google.com.

Andrey Konovalov

unread,
Nov 13, 2019, 10:19:14 AM11/13/19
to Dmitry Vyukov, Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML
On Wed, Nov 13, 2019 at 11:11 AM 'Dmitry Vyukov' via kasan-dev
<kasa...@googlegroups.com> wrote:
>
> On Tue, Nov 12, 2019 at 10:10 PM 'Jann Horn' via kasan-dev
> <kasa...@googlegroups.com> wrote:
> >
> > Make #GP exceptions caused by out-of-bounds KASAN shadow accesses easier
> > to understand by computing the address of the original access and
> > printing that. More details are in the comments in the patch.
> >
> > This turns an error like this:
> >
> > kasan: CONFIG_KASAN_INLINE enabled
> > kasan: GPF could be caused by NULL-ptr deref or user memory access
> > traps: dereferencing non-canonical address 0xe017577ddf75b7dd
> > general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI
> >
> > into this:
> >
> > traps: dereferencing non-canonical address 0xe017577ddf75b7dd
> > kasan: maybe dereferencing invalid pointer in range
> > [0x00badbeefbadbee8-0x00badbeefbadbeef]
> > general protection fault: 0000 [#3] PREEMPT SMP KASAN PTI
> > [...]

Would it make sense to use the common "BUG: KASAN: <bug-type>" report
format here? Something like:

BUG: KASAN: invalid-ptr-deref in range ...

Otherwise this looks amazing, distinguishing NULL pointer accesses
from wild memory accesses is much more convenient with this. Thanks
Jann!

>
> Nice!
>
> +Andrey, do you see any issues for TAGS mode? Or, Jann, did you test
> it by any chance?

Hm, this looks like x86-specific change, so I don't think it
interferes with the TAGS mode.
I think we should use the same naming scheme here as in
get_wild_bug_type(): null-ptr-deref, user-memory-access and
wild-memory-access.
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/CACT4Y%2BaojSsss3%2BY2FB9Rw%3DOPxXgsFrGF0YiAJ9eo2wJM0ruWg%40mail.gmail.com.

Dmitry Vyukov

unread,
Nov 13, 2019, 10:43:56 AM11/13/19
to Andrey Konovalov, Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML
On Wed, Nov 13, 2019 at 4:19 PM Andrey Konovalov <andre...@google.com> wrote:
>
> On Wed, Nov 13, 2019 at 11:11 AM 'Dmitry Vyukov' via kasan-dev
> <kasa...@googlegroups.com> wrote:
> >
> > On Tue, Nov 12, 2019 at 10:10 PM 'Jann Horn' via kasan-dev
> > <kasa...@googlegroups.com> wrote:
> > >
> > > Make #GP exceptions caused by out-of-bounds KASAN shadow accesses easier
> > > to understand by computing the address of the original access and
> > > printing that. More details are in the comments in the patch.
> > >
> > > This turns an error like this:
> > >
> > > kasan: CONFIG_KASAN_INLINE enabled
> > > kasan: GPF could be caused by NULL-ptr deref or user memory access
> > > traps: dereferencing non-canonical address 0xe017577ddf75b7dd
> > > general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI
> > >
> > > into this:
> > >
> > > traps: dereferencing non-canonical address 0xe017577ddf75b7dd
> > > kasan: maybe dereferencing invalid pointer in range
> > > [0x00badbeefbadbee8-0x00badbeefbadbeef]
> > > general protection fault: 0000 [#3] PREEMPT SMP KASAN PTI
> > > [...]
>
> Would it make sense to use the common "BUG: KASAN: <bug-type>" report
> format here? Something like:
>
> BUG: KASAN: invalid-ptr-deref in range ...


Currently this line is not the official bug title. The official bug
title is "general protection fault:" line that follows.
If we add "BUG: KASAN:" before that we need to be super careful wrt
effect on syzbot bug parsing/reporting.

Jann Horn

unread,
Nov 14, 2019, 10:09:48 AM11/14/19
to Dmitry Vyukov, Andrey Konovalov, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML
On Wed, Nov 13, 2019 at 11:11 AM Dmitry Vyukov <dvy...@google.com> wrote:
> On Tue, Nov 12, 2019 at 10:10 PM 'Jann Horn' via kasan-dev
> <kasa...@googlegroups.com> wrote:
> > Make #GP exceptions caused by out-of-bounds KASAN shadow accesses easier
> > to understand by computing the address of the original access and
> > printing that. More details are in the comments in the patch.
[...]
> +Andrey, do you see any issues for TAGS mode? Or, Jann, did you test
> it by any chance?

No, I didn't - I don't have anything set up for upstream arm64 testing here.

> > +void kasan_general_protection_hook(unsigned long addr)
> > {
> > - if (val == DIE_GPF) {
> > - pr_emerg("CONFIG_KASAN_INLINE enabled\n");
> > - pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
> > - }
> > - return NOTIFY_OK;
> > -}
> > + unsigned long orig_addr;
> > + const char *addr_type;
> > +
> > + if (addr < KASAN_SHADOW_OFFSET)
> > + return;
>
> Thinking how much sense it makes to compare addr with KASAN_SHADOW_END...
> If the addr is > KASAN_SHADOW_END, we know it's not a KASAN access,
> but do we ever get GP on canonical addresses?

#GP can occur for various reasons, but on x86-64, if it occurs because
of an invalid address, as far as I know it's always non-canonical. The
#GP handler I wrote will check the address and only call the KASAN
hook if the address is noncanonical (because otherwise the #GP
occurred for some other reason).

> > -static struct notifier_block kasan_die_notifier = {
> > - .notifier_call = kasan_die_handler,
> > -};
> > + orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT;
> > + /*
> > + * For faults near the shadow address for NULL, we can be fairly certain
> > + * that this is a KASAN shadow memory access.
> > + * For faults that correspond to shadow for low canonical addresses, we
> > + * can still be pretty sure - that shadow region is a fairly narrow
> > + * chunk of the non-canonical address space.
> > + * But faults that look like shadow for non-canonical addresses are a
> > + * really large chunk of the address space. In that case, we still
> > + * print the decoded address, but make it clear that this is not
> > + * necessarily what's actually going on.
> > + */
> > + if (orig_addr < PAGE_SIZE)
> > + addr_type = "dereferencing kernel NULL pointer";
> > + else if (orig_addr < TASK_SIZE_MAX)
> > + addr_type = "probably dereferencing invalid pointer";
>
> This is access to user memory, right? In outline mode we call it
> "user-memory-access". We could say about "user" part here as well.

Okay, I'll copy that naming.

> > + else
> > + addr_type = "maybe dereferencing invalid pointer";
> > + pr_alert("%s in range [0x%016lx-0x%016lx]\n", addr_type,
> > + orig_addr, orig_addr + (1 << KASAN_SHADOW_SCALE_SHIFT) - 1);
>
> "(1 << KASAN_SHADOW_SCALE_SHIFT) - 1)" part may be replaced with
> KASAN_SHADOW_MASK.
> Overall it can make sense to move this mm/kasan/report.c b/c we are
> open-coding a number of things here (e.g. reverse address mapping). If
> another arch will do the same, it will need all of this code too (?).

Alright, I'll try to move it over.

Sean Christopherson

unread,
Nov 14, 2019, 12:46:31 PM11/14/19
to Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, linux-...@vger.kernel.org
Any objection to open coding the upper bound instead of using
TASK_SIZE_MAX to make the threshold more obvious?

> + return;
> +
> + pr_alert("dereferencing non-canonical address 0x%016lx\n", addr_ref);

Printing the raw address will confuse users in the case where the access
straddles the lower canonical boundary. Maybe combine this with open
coding the straddle case? With a rough heuristic to hedge a bit for
instructions whose operand size isn't accurately reflected in opnd_bytes.

if (addr_ref > __VIRTUAL_MASK)
pr_alert("dereferencing non-canonical address 0x%016lx\n", addr_ref);
else if ((addr_ref + insn->opnd_bytes - 1) > __VIRTUAL_MASK)
pr_alert("straddling non-canonical boundary 0x%016lx - 0x%016lx\n",
addr_ref, addr_ref + insn->opnd_bytes - 1);
else if ((addr_ref + PAGE_SIZE - 1) > __VIRTUAL_MASK)
pr_alert("potentially straddling non-canonical boundary 0x%016lx - 0x%016lx\n",
addr_ref, addr_ref + PAGE_SIZE - 1);

> +#endif
> +}
> +
> dotraplinkage void
> do_general_protection(struct pt_regs *regs, long error_code)
> {
> @@ -547,8 +585,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
> return;
>
> if (notify_die(DIE_GPF, desc, regs, error_code,
> - X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
> - die(desc, regs, error_code);
> + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
> + return;
> +
> + print_kernel_gp_address(regs);

This can be conditional on '!error_code', non-canonical faults on the
direct access always have zero for the error code. Doubt it will matter
in practice, but far calls and other silly segment instructions can
generate non-zero error codes on #GP in 64-bit mode.

Andy Lutomirski

unread,
Nov 14, 2019, 1:00:50 PM11/14/19
to Sean Christopherson, Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, X86 ML, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, LKML
This is unnecessarily complicated, and I suspect that Jann had the
right idea but just didn't quite explain it enough. The secret here
is that TASK_SIZE_MAX is a full page below the canonical boundary
(thanks, Intel, for screwing up SYSRET), so, if we get #GP for an
address above TASK_SIZE_MAX, then it's either a #GP for a different
reason or it's a genuine non-canonical access.

So I think that just a comment about this would be enough.

*However*, the printout should at least hedge a bit and say something
like "probably dereferencing non-canonical address", since there are
plenty of ways to get #GP with an operand that is nominally
non-canonical but where the actual cause of #GP is different. And I
think this code should be skipped entirely if error_code != 0.

--Andy

Borislav Petkov

unread,
Nov 14, 2019, 1:09:04 PM11/14/19
to Andy Lutomirski, Sean Christopherson, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, X86 ML, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, LKML
On Thu, Nov 14, 2019 at 10:00:35AM -0800, Andy Lutomirski wrote:
> And I think this code should be skipped entirely if error_code != 0.

... or say that the #GP is happening due to a segment descriptor access.
Would make the figuring out why it happens a bit simpler as to where to
look.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Sean Christopherson

unread,
Nov 14, 2019, 1:20:44 PM11/14/19
to Andy Lutomirski, Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, X86 ML, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, LKML
On Thu, Nov 14, 2019 at 10:00:35AM -0800, Andy Lutomirski wrote:
Ya, I followed all that. My point is that if "addr_ref + insn->opnd_bytes"
straddles the boundary then it's extremely likely the #GP is due to a
non-canonical access, i.e. the pr_alert() doesn't have to hedge (as much).

> then it's either a #GP for a different
> reason or it's a genuine non-canonical access.

Heh, "canonical || !canonical" would be the options :-D

Andy Lutomirski

unread,
Nov 14, 2019, 1:41:21 PM11/14/19
to Sean Christopherson, Andy Lutomirski, Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, X86 ML, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, LKML
On Thu, Nov 14, 2019 at 10:20 AM Sean Christopherson
I suppose. But I don't think we have a real epidemic of failed
accesses to user memory between TASK_SIZE_MAX and the actual boundary
that get #GP instead of #PF but fail for a reason other than
non-canonicality :)

I think we should just go back in time and fix x86_64 to either give
#PF or at least give some useful page fault for a non-canonical
address. The only difficulties I'm aware of is that Intel CPUs would
either need to be redesigned better or would have slightly odd
semantics for jumps to non-canonical addresses -- #PF in Intel's model
of "RIP literally *can't* have a non-canonical value" would be a bit
strange. Also, my time machine is out of commission.

--Andy

Sean Christopherson

unread,
Nov 14, 2019, 1:54:24 PM11/14/19
to Andy Lutomirski, Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, X86 ML, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, LKML
No argument there.

> I think we should just go back in time and fix x86_64 to either give
> #PF or at least give some useful page fault for a non-canonical
> address. The only difficulties I'm aware of is that Intel CPUs would
> either need to be redesigned better or would have slightly odd
> semantics for jumps to non-canonical addresses -- #PF in Intel's model
> of "RIP literally *can't* have a non-canonical value" would be a bit
> strange. Also, my time machine is out of commission.

If you happen to fix your time machine, just go back a bit further and
change protected mode to push the faulting address onto the stack.

Jann Horn

unread,
Nov 14, 2019, 3:03:40 PM11/14/19
to Andy Lutomirski, Sean Christopherson, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, X86 ML, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, LKML
On Thu, Nov 14, 2019 at 7:00 PM Andy Lutomirski <lu...@kernel.org> wrote:
> On Thu, Nov 14, 2019 at 9:46 AM Sean Christopherson
> <sean.j.chr...@intel.com> wrote:
> > On Tue, Nov 12, 2019 at 10:10:01PM +0100, Jann Horn wrote:
> > > A frequent cause of #GP exceptions are memory accesses to non-canonical
> > > addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
> > > the kernel doesn't currently print the fault address for #GP.
> > > Luckily, we already have the necessary infrastructure for decoding X86
> > > instructions and computing the memory address that is being accessed;
> > > hook it up to the #GP handler so that we can figure out whether the #GP
> > > looks like it was caused by a non-canonical address, and if so, print
> > > that address.
[...]
Ah, I didn't realize that insn->opnd_bytes exists. Since I already
have that available, I guess using that is cleaner than being clever
with TASK_SIZE_MAX.

> *However*, the printout should at least hedge a bit and say something
> like "probably dereferencing non-canonical address", since there are
> plenty of ways to get #GP with an operand that is nominally
> non-canonical but where the actual cause of #GP is different.

Ah, yeah, I'll change that.

> And I think this code should be skipped entirely if error_code != 0.

Makes sense. As Borislav suggested, I'll add some code to
do_general_protection() to instead print a hint about it being a
segment-related problem.

Jann Horn

unread,
Nov 15, 2019, 2:17:40 PM11/15/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
To support evaluating 64-bit kernel mode instructions:

Replace existing checks for user_64bit_mode() with a new helper that
checks whether code is being executed in either 64-bit kernel mode or
64-bit user mode.

Select the GS base depending on whether the instruction is being
evaluated in kernel mode.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
v2:
no changes

arch/x86/include/asm/ptrace.h | 13 +++++++++++++
arch/x86/lib/insn-eval.c | 26 +++++++++++++++-----------
2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5057a8ed100b..ac45b06941a5 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -159,6 +159,19 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#endif
}

+/*
+ * Determine whether the register set came from any context that is running in
+ * 64-bit mode.
+ */
+static inline bool any_64bit_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64

Jann Horn

unread,
Nov 15, 2019, 2:17:45 PM11/15/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
A frequent cause of #GP exceptions are memory accesses to non-canonical
addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
the kernel doesn't currently print the fault address for #GP.
Luckily, we already have the necessary infrastructure for decoding X86
instructions and computing the memory address that is being accessed;
hook it up to the #GP handler so that we can figure out whether the #GP
looks like it was caused by a non-canonical address, and if so, print
that address.

While it is already possible to compute the faulting address manually by
disassembling the opcode dump and evaluating the instruction against the
register dump, this should make it slightly easier to identify crashes
at a glance.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
v2:
- print different message for segment-related GP (Borislav)
- rewrite check for non-canonical address (Sean)
- make it clear we don't know for sure why the GP happened (Andy)

arch/x86/kernel/traps.c | 45 +++++++++++++++++++++++++++++++++++++++--
1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c90312146da0..12d42697a18e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -56,6 +56,8 @@
#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -509,6 +511,38 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
}

+/*
+ * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
+ * address, print that address.
+ */
+static void print_kernel_gp_address(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+ u8 insn_bytes[MAX_INSN_SIZE];
+ struct insn insn;
+ unsigned long addr_ref;
+
+ if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
+ return;
+
+ kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
+ insn_get_modrm(&insn);
+ insn_get_sib(&insn);
+ addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
+
+ /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
+ if (addr_ref >= ~__VIRTUAL_MASK)
+ return;
+
+ /* Bail out if the entire operand is in the canonical user half. */
+ if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
+ return;
+
+ pr_alert("probably dereferencing non-canonical address 0x%016lx\n",
+ addr_ref);
+#endif
+}
+
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
@@ -547,8 +581,15 @@ do_general_protection(struct pt_regs *regs, long error_code)
return;

if (notify_die(DIE_GPF, desc, regs, error_code,
- X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- die(desc, regs, error_code);
+ X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
+ return;
+
+ if (error_code)
+ pr_alert("GPF is segment-related (see error code)\n");
+ else
+ print_kernel_gp_address(regs);
+

Jann Horn

unread,
Nov 15, 2019, 2:17:48 PM11/15/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
Make #GP exceptions caused by out-of-bounds KASAN shadow accesses easier
to understand by computing the address of the original access and
printing that. More details are in the comments in the patch.

This turns an error like this:

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
traps: probably dereferencing non-canonical address 0xe017577ddf75b7dd
general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI

into this:

traps: dereferencing non-canonical address 0xe017577ddf75b7dd
traps: probably dereferencing non-canonical address 0xe017577ddf75b7dd
KASAN: maybe wild-memory-access in range
[0x00badbeefbadbee8-0x00badbeefbadbeef]
general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI

The hook is placed in architecture-independent code, but is currently
only wired up to the X86 exception handler because I'm not sufficiently
familiar with the address space layout and exception handling mechanisms
on other architectures.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
v2:
- move to mm/kasan/report.c (Dmitry)
- change hook name to be more generic
- use TASK_SIZE instead of TASK_SIZE_MAX for compiling on non-x86
- don't open-code KASAN_SHADOW_MASK (Dmitry)
- add "KASAN: " prefix, but not "BUG: " (Andrey, Dmitry)
- use same naming scheme as get_wild_bug_type (Andrey)

arch/x86/kernel/traps.c | 2 ++
arch/x86/mm/kasan_init_64.c | 21 -------------------
include/linux/kasan.h | 6 ++++++
mm/kasan/report.c | 40 +++++++++++++++++++++++++++++++++++++
4 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 12d42697a18e..87b52682a37a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,6 +37,7 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>
+#include <linux/kasan.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
@@ -540,6 +541,7 @@ static void print_kernel_gp_address(struct pt_regs *regs)

pr_alert("probably dereferencing non-canonical address 0x%016lx\n",
addr_ref);
+ kasan_non_canonical_hook(addr_ref);
#endif
}

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 296da58f3013..69c437fb21cc 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -245,23 +245,6 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
} while (pgd++, addr = next, addr != end);
}

-#ifdef CONFIG_KASAN_INLINE
-static int kasan_die_handler(struct notifier_block *self,
- unsigned long val,
- void *data)
-{
- if (val == DIE_GPF) {
- pr_emerg("CONFIG_KASAN_INLINE enabled\n");
- pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kasan_die_notifier = {
- .notifier_call = kasan_die_handler,
-};
-#endif
-
void __init kasan_early_init(void)
{
int i;
@@ -298,10 +281,6 @@ void __init kasan_init(void)
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;

-#ifdef CONFIG_KASAN_INLINE
- register_die_notifier(&kasan_die_notifier);
-#endif
-
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));

/*
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index cc8a03cc9674..7305024b44e3 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -194,4 +194,10 @@ static inline void *kasan_reset_tag(const void *addr)

#endif /* CONFIG_KASAN_SW_TAGS */

+#ifdef CONFIG_KASAN_INLINE
+void kasan_non_canonical_hook(unsigned long addr);
+#else /* CONFIG_KASAN_INLINE */
+static inline void kasan_non_canonical_hook(unsigned long addr) { }
+#endif /* CONFIG_KASAN_INLINE */
+
#endif /* LINUX_KASAN_H */
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 621782100eaa..5ef9f24f566b 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -512,3 +512,43 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon

end_report(&flags);
}
+
+#ifdef CONFIG_KASAN_INLINE
+/*
+ * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
+ * canonical half of the address space) cause out-of-bounds shadow memory reads
+ * before the actual access. For addresses in the low canonical half of the
+ * address space, as well as most non-canonical addresses, that out-of-bounds
+ * shadow memory access lands in the non-canonical part of the address space.
+ * Help the user figure out what the original bogus pointer was.
+ */
+void kasan_non_canonical_hook(unsigned long addr)
+{
+ unsigned long orig_addr;
+ const char *bug_type;
+
+ if (addr < KASAN_SHADOW_OFFSET)
+ return;
+
+ orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT;
+ /*
+ * For faults near the shadow address for NULL, we can be fairly certain
+ * that this is a KASAN shadow memory access.
+ * For faults that correspond to shadow for low canonical addresses, we
+ * can still be pretty sure - that shadow region is a fairly narrow
+ * chunk of the non-canonical address space.
+ * But faults that look like shadow for non-canonical addresses are a
+ * really large chunk of the address space. In that case, we still
+ * print the decoded address, but make it clear that this is not
+ * necessarily what's actually going on.
+ */
+ if (orig_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if (orig_addr < TASK_SIZE)
+ bug_type = "probably user-memory-access";
+ else
+ bug_type = "maybe wild-memory-access";
+ pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type,
+ orig_addr, orig_addr + KASAN_SHADOW_MASK);
+}
+#endif
--
2.24.0.432.g9d3f5f5b63-goog

Dmitry Vyukov

unread,
Nov 18, 2019, 3:36:16 AM11/18/19
to Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
Reviewed-by: Dmitry Vyukov <dvy...@google.com>

Thanks!

Borislav Petkov

unread,
Nov 18, 2019, 9:21:49 AM11/18/19
to Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
On Fri, Nov 15, 2019 at 08:17:27PM +0100, Jann Horn wrote:
> dotraplinkage void
> do_general_protection(struct pt_regs *regs, long error_code)
> {
> @@ -547,8 +581,15 @@ do_general_protection(struct pt_regs *regs, long error_code)
> return;
>
> if (notify_die(DIE_GPF, desc, regs, error_code,
> - X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
> - die(desc, regs, error_code);
> + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
> + return;
> +
> + if (error_code)
> + pr_alert("GPF is segment-related (see error code)\n");
> + else
> + print_kernel_gp_address(regs);
> +
> + die(desc, regs, error_code);

Right, this way, those messages appear before the main "general
protection ..." message:

[ 2.434372] traps: probably dereferencing non-canonical address 0xdfff000000000001
[ 2.442492] general protection fault: 0000 [#1] PREEMPT SMP

Can we glue/merge them together? Or is this going to confuse tools too much:

[ 2.542218] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP

(and that sentence could be shorter too:

"general protection fault for non-canonical address 0xdfff000000000001"

looks ok to me too.)

Here's a dirty diff together with a reproducer ontop of yours:

---
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf796f8c9998..dab702ba28a6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -515,7 +515,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
* On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
* address, print that address.
*/
-static void print_kernel_gp_address(struct pt_regs *regs)
+static unsigned long get_kernel_gp_address(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
u8 insn_bytes[MAX_INSN_SIZE];
@@ -523,7 +523,7 @@ static void print_kernel_gp_address(struct pt_regs *regs)
unsigned long addr_ref;

if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
- return;
+ return 0;

kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
insn_get_modrm(&insn);
@@ -532,22 +532,22 @@ static void print_kernel_gp_address(struct pt_regs *regs)

/* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
if (addr_ref >= ~__VIRTUAL_MASK)
- return;
+ return 0;

/* Bail out if the entire operand is in the canonical user half. */
if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
- return;
+ return 0;

- pr_alert("probably dereferencing non-canonical address 0x%016lx\n",
- addr_ref);
+ return addr_ref;
#endif
}

+#define GPFSTR "general protection fault"
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
- const char *desc = "general protection fault";
struct task_struct *tsk;
+ char desc[90];

RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
@@ -584,12 +584,18 @@ do_general_protection(struct pt_regs *regs, long error_code)
X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
return;

- if (error_code)
- pr_alert("GPF is segment-related (see error code)\n");
- else
- print_kernel_gp_address(regs);
+ if (error_code) {
+ snprintf(desc, 90, "segment-related " GPFSTR);
+ } else {
+ unsigned long addr_ref = get_kernel_gp_address(regs);
+
+ if (addr_ref)
+ snprintf(desc, 90, GPFSTR " while derefing a non-canonical address 0x%lx", addr_ref);
+ else
+ snprintf(desc, 90, GPFSTR);
+ }

- die(desc, regs, error_code);
+ die((const char *)desc, regs, error_code);
return;
}

diff --git a/init/main.c b/init/main.c
index 91f6ebb30ef0..7acc7e660be9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1124,6 +1124,9 @@ static int __ref kernel_init(void *unused)

rcu_end_inkernel_boot();

+ asm volatile("mov $0xdfff000000000001, %rax\n\t"
+ "jmpq *%rax\n\t");
+
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)

Dmitry Vyukov

unread,
Nov 18, 2019, 11:03:11 AM11/18/19
to Borislav Petkov, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
This exact form will confuse syzkaller crash parsing for Linux kernel:
https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L1347
It expects a "general protection fault:" line for these crashes.

A graceful way to update kernel crash messages would be to add more
tests with the new format here:
https://github.com/google/syzkaller/tree/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/testdata/linux/report
Update parsing code. Roll out new version. Update all other testing
systems that detect and parse kernel crashes. Then commit kernel
changes.

An unfortunate consequence of offloading testing to third-party systems...
> --
> You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20191118142144.GC6363%40zn.tnic.

Jann Horn

unread,
Nov 18, 2019, 11:20:11 AM11/18/19
to Dmitry Vyukov, Borislav Petkov, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
So for syzkaller, it'd be fine as long as we keep the colon there?
Something like:

general protection fault: derefing non-canonical address
0xdfff000000000001: 0000 [#1] PREEMPT SMP

And it looks like the 0day test bot doesn't have any specific pattern
for #GP, it seems to just look for the panic triggered by
panic-on-oops as far as I can tell (oops=panic in lkp-exec/qemu, no
"general protection fault" in etc/dmesg-kill-pattern).

> An unfortunate consequence of offloading testing to third-party systems...

And of not having a standard way to signal "this line starts something
that should be reported as a bug"? Maybe as a longer-term idea, it'd
help to have some sort of extra prefix byte that the kernel can print
to say "here comes a bug report, first line should be the subject", or
something like that, similar to how we have loglevels...

Dmitry Vyukov

unread,
Nov 18, 2019, 11:29:55 AM11/18/19
to Jann Horn, Borislav Petkov, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
Probably. Tests help a lot to answer such questions ;) But presumably
it should not break parsing.

> And it looks like the 0day test bot doesn't have any specific pattern
> for #GP, it seems to just look for the panic triggered by
> panic-on-oops as far as I can tell (oops=panic in lkp-exec/qemu, no
> "general protection fault" in etc/dmesg-kill-pattern).
>
> > An unfortunate consequence of offloading testing to third-party systems...
>
> And of not having a standard way to signal "this line starts something
> that should be reported as a bug"? Maybe as a longer-term idea, it'd
> help to have some sort of extra prefix byte that the kernel can print
> to say "here comes a bug report, first line should be the subject", or
> something like that, similar to how we have loglevels...

This would be great.
Also a way to denote crash end.
However we have lots of special logic for subjects, not sure if kernel
could provide good subject:
https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L537-L1588
Probably it could, but it won't be completely trivial. E.g. if there
is a stall inside of a timer function, it should give the name of the
actual timer callback as identity ("stall in timer_subsystem_foo"). Or
for syscalls we use more disambiguation b/c "in sys_ioctl" is not much
different than saying "there is a bug in kernel" :)

Jann Horn

unread,
Nov 18, 2019, 11:41:22 AM11/18/19
to Dmitry Vyukov, Borislav Petkov, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
On Mon, Nov 18, 2019 at 5:29 PM Dmitry Vyukov <dvy...@google.com> wrote:
> On Mon, Nov 18, 2019 at 5:20 PM 'Jann Horn' via kasan-dev
> <kasa...@googlegroups.com> wrote:
> > On Mon, Nov 18, 2019 at 5:03 PM Dmitry Vyukov <dvy...@google.com> wrote:
> > > This exact form will confuse syzkaller crash parsing for Linux kernel:
> > > https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L1347
> > > It expects a "general protection fault:" line for these crashes.
> > >
> > > A graceful way to update kernel crash messages would be to add more
> > > tests with the new format here:
> > > https://github.com/google/syzkaller/tree/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/testdata/linux/report
> > > Update parsing code. Roll out new version. Update all other testing
> > > systems that detect and parse kernel crashes. Then commit kernel
> > > changes.
[...]
> > > An unfortunate consequence of offloading testing to third-party systems...
> >
> > And of not having a standard way to signal "this line starts something
> > that should be reported as a bug"? Maybe as a longer-term idea, it'd
> > help to have some sort of extra prefix byte that the kernel can print
> > to say "here comes a bug report, first line should be the subject", or
> > something like that, similar to how we have loglevels...
>
> This would be great.
> Also a way to denote crash end.
> However we have lots of special logic for subjects, not sure if kernel
> could provide good subject:
> https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L537-L1588
> Probably it could, but it won't be completely trivial. E.g. if there
> is a stall inside of a timer function, it should give the name of the
> actual timer callback as identity ("stall in timer_subsystem_foo"). Or
> for syscalls we use more disambiguation b/c "in sys_ioctl" is not much
> different than saying "there is a bug in kernel" :)

Maybe I'm overthinking things, and maybe this is too much effort
relative to the benefit it brings, but here's a crazy idea:

For the specific case of stalls, it might help if the kernel could put
markers on the stack on the first stall warning (e.g. assuming that
ORC is enabled, by walking the stack and replacing all saved
instruction pointers with a pointer to some magic trampoline that
jumps back to the original caller using some sort of shadow stack),
then wait a few seconds, and then check how far on the stack the
markers have been cleared. Then hopefully you'd know exactly in which
function you're looping.

Borislav Petkov

unread,
Nov 18, 2019, 11:44:13 AM11/18/19
to Dmitry Vyukov, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
On Mon, Nov 18, 2019 at 05:29:42PM +0100, Dmitry Vyukov wrote:
> > And of not having a standard way to signal "this line starts something
> > that should be reported as a bug"? Maybe as a longer-term idea, it'd
> > help to have some sort of extra prefix byte that the kernel can print
> > to say "here comes a bug report, first line should be the subject", or
> > something like that, similar to how we have loglevels...
>
> This would be great.
> Also a way to denote crash end.
> However we have lots of special logic for subjects, not sure if kernel
> could provide good subject:
> https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L537-L1588
> Probably it could, but it won't be completely trivial. E.g. if there
> is a stall inside of a timer function, it should give the name of the
> actual timer callback as identity ("stall in timer_subsystem_foo"). Or
> for syscalls we use more disambiguation b/c "in sys_ioclt" is not much
> different than saying "there is a bug in kernel" :)

While external tools are fine and cool, they can't really block kernel
development and printk strings format is not an ABI. And yeah, we have
this discussion each time someone proposes changes to those "magic"
strings but I guess it is about time to fix this in a way that any
future changes don't break tools.

And so I like the idea of marking *only* the first splat with some small
prefix char as that first splat is the special and very important one.
I.e., the one where die_counter is 0.

So I could very well imagine something like:

...
[ 2.523708] Write protecting the kernel read-only data: 16384k
[ 2.524729] Freeing unused kernel image (text/rodata gap) memory: 2040K
[ 2.525594] Freeing unused kernel image (rodata/data gap) memory: 368K
[ 2.541414] x86/mm: Checked W+X mappings: passed, no W+X pages found.

<--- important first splat starts here:

[ 2.542218] [*] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
[ 2.543343] [*] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
[ 2.544138] [*] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
[ 2.545120] [*] RIP: 0010:kernel_init+0x58/0x107
[ 2.546055] [*] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89
[ 2.550242] [*] RSP: 0018:ffffc90000013f50 EFLAGS: 00010246
[ 2.551691] [*] RAX: dfff000000000001 RBX: ffffffff817b7ac9 RCX: 0000000000000040
[ 2.553435] [*] RDX: 0000000000000030 RSI: ffff88807da2f170 RDI: 000000000002f170
[ 2.555169] [*] RBP: 0000000000000000 R08: 00000000000001a6 R09: 00000000ad55ad55
[ 2.556393] [*] R10: 0000000000000000 R11: 0000000000000002 R12: 0000000000000000
[ 2.557268] [*] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 2.558417] [*] FS: 0000000000000000(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
[ 2.559370] [*] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2.560138] [*] CR2: 0000000000000000 CR3: 0000000002009000 CR4: 00000000003406f0
[ 2.561013] [*] Call Trace:
[ 2.561506] [*] ret_from_fork+0x22/0x40
[ 2.562080] [*] Modules linked in:
[ 2.583706] [*] ---[ end trace 8ceb5a62d3ebbfa6 ]---
[ 2.584384] [*] RIP: 0010:kernel_init+0x58/0x107
[ 2.584999] [*] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89
[ 2.591746] [*] RSP: 0018:ffffc90000013f50 EFLAGS: 00010246
[ 2.593175] [*] RAX: dfff000000000001 RBX: ffffffff817b7ac9 RCX: 0000000000000040
[ 2.594892] [*] RDX: 0000000000000030 RSI: ffff88807da2f170 RDI: 000000000002f170
[ 2.599706] [*] RBP: 0000000000000000 R08: 00000000000001a6 R09: 00000000ad55ad55
[ 2.600585] [*] R10: 0000000000000000 R11: 0000000000000002 R12: 0000000000000000
[ 2.601433] [*] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 2.602283] [*] FS: 0000000000000000(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
[ 2.603207] [*] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2.607706] [*] CR2: 0000000000000000 CR3: 0000000002009000 CR4: 00000000003406f0
[ 2.608565] [*] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
[ 2.609600] [*] Kernel Offset: disabled
[ 2.610168] [*] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---

<--- and ends here.

to denote that first splat. And the '*' thing is just an example - it
can be any char - whatever's easier to grep for.

Thx.

Borislav Petkov

unread,
Nov 18, 2019, 12:38:56 PM11/18/19
to Dmitry Vyukov, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
On Mon, Nov 18, 2019 at 05:44:07PM +0100, Borislav Petkov wrote:
> [ 2.523708] Write protecting the kernel read-only data: 16384k
> [ 2.524729] Freeing unused kernel image (text/rodata gap) memory: 2040K
> [ 2.525594] Freeing unused kernel image (rodata/data gap) memory: 368K
> [ 2.541414] x86/mm: Checked W+X mappings: passed, no W+X pages found.
>
> <--- important first splat starts here:
>
> [ 2.542218] [*] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
^

Btw, tglx just suggested on IRC to simply slap the die_counter number here so
that you have

[ 2.543343] [1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
[ 2.544138] [1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
...

which also tells you to which splat the line belongs to.

Also useful.

Andi Kleen

unread,
Nov 19, 2019, 11:25:35 PM11/19/19
to Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
Jann Horn <ja...@google.com> writes:

> +
> + if (error_code)
> + pr_alert("GPF is segment-related (see error code)\n");
> + else
> + print_kernel_gp_address(regs);

Is this really correct? There are a lot of instructions that can do #GP
(it's the CPU's equivalent of EINVAL) and I'm pretty sure many of them
don't set an error code, and many don't have operands either.

You would need to make sure the instruction decoder handles these
cases correctly, and ideally that you detect it instead of printing
a bogus address.

-Andi

Jann Horn

unread,
Nov 20, 2019, 5:32:14 AM11/20/19
to Andi Kleen, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
Is there a specific concern you have about the instruction decoder? As
far as I can tell, all the paths of insn_get_addr_ref() only work if
the instruction has a mod R/M byte according to the instruction
tables, and then figures out the address based on that. While that
means that there's a wide variety of cases in which we won't be able
to figure out the address, I'm not aware of anything specific that is
likely to lead to false positives.

But Andy did suggest that we hedge a bit in the error message because
even if the address passed to the instruction is non-canonical, we
don't know for sure whether that's actually the reason why things
failed, and that's why it says "probably" in the message about the
address now.

Jann Horn

unread,
Nov 20, 2019, 5:36:39 AM11/20/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
To support evaluating 64-bit kernel mode instructions:

Replace existing checks for user_64bit_mode() with a new helper that
checks whether code is being executed in either 64-bit kernel mode or
64-bit user mode.

Select the GS base depending on whether the instruction is being
evaluated in kernel mode.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
v2:
no changes
v3:
no changes

arch/x86/include/asm/ptrace.h | 13 +++++++++++++
arch/x86/lib/insn-eval.c | 26 +++++++++++++++-----------
2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5057a8ed100b..ac45b06941a5 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -159,6 +159,19 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#endif
}

+/*
+ * Determine whether the register set came from any context that is running in
+ * 64-bit mode.
+ */
+static inline bool any_64bit_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64

Jann Horn

unread,
Nov 20, 2019, 5:36:43 AM11/20/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
A frequent cause of #GP exceptions are memory accesses to non-canonical
addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
the kernel doesn't currently print the fault address for #GP.
Luckily, we already have the necessary infrastructure for decoding X86
instructions and computing the memory address that is being accessed;
hook it up to the #GP handler so that we can figure out whether the #GP
looks like it was caused by a non-canonical address, and if so, print
that address.

While it is already possible to compute the faulting address manually by
disassembling the opcode dump and evaluating the instruction against the
register dump, this should make it slightly easier to identify crashes
at a glance.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
v2:
- print different message for segment-related GP (Borislav)
- rewrite check for non-canonical address (Sean)
- make it clear we don't know for sure why the GP happened (Andy)
v3:
- change message format to one line (Borislav)

I have already sent a patch to syzkaller that relaxes their parsing of GPF
messages (https://github.com/google/syzkaller/commit/432c7650) such that
changes like the one in this patch don't break it.
That patch has already made its way into syzbot's syzkaller instances
according to <https://syzkaller.appspot.com/upstream>.

arch/x86/kernel/traps.c | 56 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 53 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c90312146da0..19afedcd6f4e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -56,6 +56,8 @@
#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -509,11 +511,45 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
}

+/*
+ * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
+ * address, return that address.
+ */
+static unsigned long get_kernel_gp_address(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+ u8 insn_bytes[MAX_INSN_SIZE];
+ struct insn insn;
+ unsigned long addr_ref;
+
+ if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
+ return 0;
+
+ kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
+ insn_get_modrm(&insn);
+ insn_get_sib(&insn);
+ addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
+
+ /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
+ if (addr_ref >= ~__VIRTUAL_MASK)
+ return 0;
+
+ /* Bail out if the entire operand is in the canonical user half. */
+ if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
+ return 0;
+
+ return addr_ref;
+#else
+ return 0;
+#endif
+}
+
+#define GPFSTR "general protection fault"
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
- const char *desc = "general protection fault";
struct task_struct *tsk;
+ char desc[90] = GPFSTR;

RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
@@ -531,6 +567,8 @@ do_general_protection(struct pt_regs *regs, long error_code)

tsk = current;
if (!user_mode(regs)) {
+ unsigned long non_canonical_addr = 0;
+
if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
return;

@@ -547,8 +585,20 @@ do_general_protection(struct pt_regs *regs, long error_code)
return;

if (notify_die(DIE_GPF, desc, regs, error_code,
- X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- die(desc, regs, error_code);
+ X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
+ return;
+
+ if (error_code)
+ snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
+ else
+ non_canonical_addr = get_kernel_gp_address(regs);
+
+ if (non_canonical_addr)
+ snprintf(desc, sizeof(desc),
+ GPFSTR " probably for non-canonical address 0x%lx",
+ non_canonical_addr);
+
+ die(desc, regs, error_code);
return;
}

--
2.24.0.432.g9d3f5f5b63-goog

Jann Horn

unread,
Nov 20, 2019, 5:36:46 AM11/20/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
Split __die() into __die_header() and __die_body(). This allows callers to
insert extra information below the header line that initiates the bug
report.

This can e.g. be used by __die() callers to allow KASAN to print additional
information below the header line of the bug report.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
I think that it's nicer to have KASAN's notes about the
bug below the first oops line from the kernel.
This also means that tools that work with kernel oops
reports can just trigger on the "general protection fault"
line with the die counter and so on, and just include the
text from on there, and the KASAN message will automatically
be included.
But if you think that the code looks too ugly, I'd be
happy to change that back and drop this patch from the
series.

v3:
new patch

arch/x86/include/asm/kdebug.h | 3 +++
arch/x86/kernel/dumpstack.c | 13 ++++++++++++-
2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 75f1e35e7c15..a0050fabce42 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -33,6 +33,9 @@ enum show_regs_mode {
};

extern void die(const char *, struct pt_regs *,long);
+extern void __die_header(const char *str, struct pt_regs *regs, long err);
+extern int __must_check __die_body(const char *str, struct pt_regs *regs,
+ long err);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, enum show_regs_mode);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index e07424e19274..6436f3f5f803 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -365,7 +365,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
}
NOKPROBE_SYMBOL(oops_end);

-int __die(const char *str, struct pt_regs *regs, long err)
+void __die_header(const char *str, struct pt_regs *regs, long err)
{
const char *pr = "";

@@ -384,7 +384,11 @@ int __die(const char *str, struct pt_regs *regs, long err)
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+}
+NOKPROBE_SYMBOL(__die_header);

+int __die_body(const char *str, struct pt_regs *regs, long err)
+{
show_regs(regs);
print_modules();

@@ -394,6 +398,13 @@ int __die(const char *str, struct pt_regs *regs, long err)

return 0;
}
+NOKPROBE_SYMBOL(__die_body);
+
+int __die(const char *str, struct pt_regs *regs, long err)
+{
+ __die_header(str, regs, err);
+ return __die_body(str, regs, err);
+}
NOKPROBE_SYMBOL(__die);

/*
--
2.24.0.432.g9d3f5f5b63-goog

Jann Horn

unread,
Nov 20, 2019, 5:36:51 AM11/20/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
Make #GP exceptions caused by out-of-bounds KASAN shadow accesses easier
to understand by computing the address of the original access and
printing that. More details are in the comments in the patch.

This turns an error like this:

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault probably for non-canonical address
0xe017577ddf75b7dd: 0000 [#1] PREEMPT SMP KASAN PTI

into this:

general protection fault probably for non-canonical address
0xe017577ddf75b7dd: 0000 [#1] PREEMPT SMP KASAN PTI
KASAN: maybe wild-memory-access in range
[0x00badbeefbadbee8-0x00badbeefbadbeef]

The hook is placed in architecture-independent code, but is currently
only wired up to the X86 exception handler because I'm not sufficiently
familiar with the address space layout and exception handling mechanisms
on other architectures.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
v2:
- move to mm/kasan/report.c (Dmitry)
- change hook name to be more generic
- use TASK_SIZE instead of TASK_SIZE_MAX for compiling on non-x86
- don't open-code KASAN_SHADOW_MASK (Dmitry)
- add "KASAN: " prefix, but not "BUG: " (Andrey, Dmitry)
- use same naming scheme as get_wild_bug_type (Andrey)
- this version was "Reviewed-by: Dmitry Vyukov <dvy...@google.com>"
v3:
- adjusted example output in commit message based on
changes in preceding patch
- ensure that KASAN output happens after bust_spinlocks(1)
- moved hook in arch/x86/kernel/traps.c such that output
appears after the first line of KASAN-independent error report

arch/x86/kernel/traps.c | 11 ++++++++++
arch/x86/mm/kasan_init_64.c | 21 -------------------
include/linux/kasan.h | 6 ++++++
mm/kasan/report.c | 40 +++++++++++++++++++++++++++++++++++++
4 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 19afedcd6f4e..b5baf1114d44 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,6 +37,7 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>
+#include <linux/kasan.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
@@ -568,6 +569,8 @@ do_general_protection(struct pt_regs *regs, long error_code)
tsk = current;
if (!user_mode(regs)) {
unsigned long non_canonical_addr = 0;
+ unsigned long flags;
+ int sig;

if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
return;
@@ -598,6 +601,14 @@ do_general_protection(struct pt_regs *regs, long error_code)
GPFSTR " probably for non-canonical address 0x%lx",
non_canonical_addr);

+ flags = oops_begin();
+ sig = SIGSEGV;
+ __die_header(desc, regs, error_code);
+ if (non_canonical_addr)
+ kasan_non_canonical_hook(non_canonical_addr);
+ if (__die_body(desc, regs, error_code))
+ sig = 0;
+ oops_end(flags, regs, sig);
die(desc, regs, error_code);
return;
}

Ingo Molnar

unread,
Nov 20, 2019, 6:19:03 AM11/20/19
to Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
I had to look twice to realize that the 'insn_bytes' isn't an integer
that shows the number of bytes in the instruction, but the instruction
buffer itself.

Could we please do s/insn_bytes/insn_buf or such?

> +
> + /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
> + if (addr_ref >= ~__VIRTUAL_MASK)
> + return 0;
> +
> + /* Bail out if the entire operand is in the canonical user half. */
> + if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
> + return 0;

BTW., it would be nice to split this logic in two: return the faulting
address to do_general_protection(), and print it out both for
non-canonical and canonical addresses as well -and use the canonical
check to *additionally* print out a short note when the operand is
non-canonical?

> +#define GPFSTR "general protection fault"
> dotraplinkage void

Please separate macro and function definitions by an additional newline.

> do_general_protection(struct pt_regs *regs, long error_code)
> {
> - const char *desc = "general protection fault";
> struct task_struct *tsk;
> + char desc[90] = GPFSTR;


How was this maximum string length of '90' derived? In what way will that
have to change if someone changes the message?

Thanks,

Ingo

Borislav Petkov

unread,
Nov 20, 2019, 6:24:16 AM11/20/19
to Ingo Molnar, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
On Wed, Nov 20, 2019 at 12:18:59PM +0100, Ingo Molnar wrote:
> How was this maximum string length of '90' derived? In what way will
> that have to change if someone changes the message?

That was me counting the string length in a dirty patch in a previous
thread. We probably should say why we decided for a certain length and
maybe have a define for it.

Also, I could use your opinion on this here:

https://lkml.kernel.org/r/2019111816...@zn.tnic

and the following mail.

I think that marking the splat with its number would *immensely* help us
with the question: was this the first splat or wasn't? A question we've
been asking since I got involved in kernel development. :)

Ingo Molnar

unread,
Nov 20, 2019, 6:40:35 AM11/20/19
to Borislav Petkov, Dmitry Vyukov, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Linus Torvalds
> <--- and ends here.
>
> to denote that first splat. And the '*' thing is just an example - it
> can be any char - whatever's easier to grep for.

Well, this would break various pieces of tooling I'm sure.

Maybe it would be nicer to tooling to embedd the splat-counter in the
timestamp in a way:

> [ 2.542218-#1] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
> [ 2.543343-#1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
> [ 2.544138-#1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
> [ 2.545120-#1] RIP: 0010:kernel_init+0x58/0x107
> [ 2.546055-#1] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89

That way we'd not only know that it's the first splat, but we'd know it
from all the *other* splats as well where they are in the splat-rank ;-)

(Also Cc:-ed Linus.)

Thanks,

Ingo

Ingo Molnar

unread,
Nov 20, 2019, 6:41:42 AM11/20/19
to Borislav Petkov, Dmitry Vyukov, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson

* Borislav Petkov <b...@alien8.de> wrote:

Yeah - but I think it would be even better to make it part of the
timestamp - most tools will already discard the [] bit, so why not merge
the two:

> [ 2.543343-#1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
> [ 2.544138-#1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014

Thanks,

Ingo

Borislav Petkov

unread,
Nov 20, 2019, 6:52:44 AM11/20/19
to Ingo Molnar, Dmitry Vyukov, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, kasan-dev, LKML, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Linus Torvalds
On Wed, Nov 20, 2019 at 12:40:31PM +0100, Ingo Molnar wrote:
> Well, this would break various pieces of tooling I'm sure.

Well, if at all, this will break them one last time. Ironically, the
intent here is to have a markup which doesn't break them anymore, once
that markup is agreed upon by all parties.

Because each time we touch those printk formats, tools people complain
about us breaking their tools. So we should get the best of both worlds
by marking those splats in a way that tools can grep for and we won't
touch the markers anymore, once established.

Also, "[]" was only an example. It can be anything we want, as in "<>"
or "!" or whatever is a short prefix that prepends those lines.

> Maybe it would be nicer to tooling to embedd the splat-counter in the
> timestamp in a way:

Or that. Whatever we agree, as long as it is a unique marker for splats.
And it should say which splat it is, as that is also very useful
information to have it in each line.

> > [ 2.542218-#1] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
> > [ 2.543343-#1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
> > [ 2.544138-#1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
> > [ 2.545120-#1] RIP: 0010:kernel_init+0x58/0x107
> > [ 2.546055-#1] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89
>
> That way we'd not only know that it's the first splat, but we'd know it
> from all the *other* splats as well where they are in the splat-rank ;-)

That's exactly why I'd want the number in there.

Jann Horn

unread,
Nov 20, 2019, 7:15:15 AM11/20/19
to Ingo Molnar, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
On Wed, Nov 20, 2019 at 12:19 PM Ingo Molnar <mi...@kernel.org> wrote:
> * Jann Horn <ja...@google.com> wrote:
>
> > A frequent cause of #GP exceptions are memory accesses to non-canonical
> > addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
> > the kernel doesn't currently print the fault address for #GP.
> > Luckily, we already have the necessary infrastructure for decoding X86
> > instructions and computing the memory address that is being accessed;
> > hook it up to the #GP handler so that we can figure out whether the #GP
> > looks like it was caused by a non-canonical address, and if so, print
> > that address.
[...]
> > +/*
> > + * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
> > + * address, return that address.
> > + */
> > +static unsigned long get_kernel_gp_address(struct pt_regs *regs)
> > +{
> > +#ifdef CONFIG_X86_64
> > + u8 insn_bytes[MAX_INSN_SIZE];
> > + struct insn insn;
> > + unsigned long addr_ref;
> > +
> > + if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
> > + return 0;
> > +
> > + kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
> > + insn_get_modrm(&insn);
> > + insn_get_sib(&insn);
> > + addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
>
> I had to look twice to realize that the 'insn_bytes' isn't an integer
> that shows the number of bytes in the instruction, but the instruction
> buffer itself.
>
> Could we please do s/insn_bytes/insn_buf or such?

Will change it.

> > +
> > + /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
> > + if (addr_ref >= ~__VIRTUAL_MASK)
> > + return 0;
> > +
> > + /* Bail out if the entire operand is in the canonical user half. */
> > + if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
> > + return 0;
>
> BTW., it would be nice to split this logic in two: return the faulting
> address to do_general_protection(), and print it out both for
> non-canonical and canonical addresses as well -and use the canonical
> check to *additionally* print out a short note when the operand is
> non-canonical?

You mean something like this?

========================
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9b23c4bda243..16a6bdaccb51 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -516,32 +516,36 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
* On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
* address, return that address.
*/
-static unsigned long get_kernel_gp_address(struct pt_regs *regs)
+static bool get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr,
+ bool *non_canonical)
{
#ifdef CONFIG_X86_64
u8 insn_buf[MAX_INSN_SIZE];
struct insn insn;
- unsigned long addr_ref;

if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
- return 0;
+ return false;

kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
insn_get_modrm(&insn);
insn_get_sib(&insn);
- addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
+ *addr = (unsigned long)insn_get_addr_ref(&insn, regs);

- /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
- if (addr_ref >= ~__VIRTUAL_MASK)
- return 0;
+ if (*addr == (unsigned long)-1L)
+ return false;

- /* Bail out if the entire operand is in the canonical user half. */
- if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
- return 0;
+ /*
+ * Check that:
+ * - the address is not in the kernel half or -1 (which means the
+ * decoder failed to decode it)
+ * - the last byte of the address is not in the user canonical half
+ */
+ *non_canonical = *addr < ~__VIRTUAL_MASK &&
+ *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK;

- return addr_ref;
+ return true;
#else
- return 0;
+ return false;
#endif
}

@@ -569,8 +573,10 @@ do_general_protection(struct pt_regs *regs, long error_code)

tsk = current;
if (!user_mode(regs)) {
- unsigned long non_canonical_addr = 0;
+ bool addr_resolved = false;
+ unsigned long gp_addr;
unsigned long flags;
+ bool non_canonical;
int sig;

if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
@@ -595,18 +601,19 @@ do_general_protection(struct pt_regs *regs, long error_code)
if (error_code)
snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
else
- non_canonical_addr = get_kernel_gp_address(regs);
+ addr_resolved = get_kernel_gp_address(regs, &gp_addr,
+ &non_canonical);

- if (non_canonical_addr)
+ if (addr_resolved)
snprintf(desc, sizeof(desc),
- GPFSTR " probably for non-canonical address 0x%lx",
- non_canonical_addr);
+ GPFSTR " probably for %saddress 0x%lx",
+ non_canonical ? "non-canonical " : "", gp_addr);

flags = oops_begin();
sig = SIGSEGV;
__die_header(desc, regs, error_code);
- if (non_canonical_addr)
- kasan_non_canonical_hook(non_canonical_addr);
+ if (addr_resolved && non_canonical)
+ kasan_non_canonical_hook(gp_addr);
if (__die_body(desc, regs, error_code))
sig = 0;
oops_end(flags, regs, sig);
========================

I guess that could potentially be useful if a #GP is triggered by
something like an SSE alignment error? I'll add it in unless someone
else complains.

> > +#define GPFSTR "general protection fault"
> > dotraplinkage void
>
> Please separate macro and function definitions by an additional newline.

Will change it.

Jann Horn

unread,
Nov 20, 2019, 7:25:57 AM11/20/19
to Borislav Petkov, Ingo Molnar, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
On Wed, Nov 20, 2019 at 12:24 PM Borislav Petkov <b...@alien8.de> wrote:
> On Wed, Nov 20, 2019 at 12:18:59PM +0100, Ingo Molnar wrote:
> > How was this maximum string length of '90' derived? In what way will
> > that have to change if someone changes the message?
>
> That was me counting the string length in a dirty patch in a previous
> thread. We probably should say why we decided for a certain length and
> maybe have a define for it.

Do you think something like this would be better?

char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;

Ingo Molnar

unread,
Nov 20, 2019, 7:31:02 AM11/20/19
to Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen

* Jann Horn <ja...@google.com> wrote:

> You mean something like this?
>
> ========================
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 9b23c4bda243..16a6bdaccb51 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -516,32 +516,36 @@ dotraplinkage void do_bounds(struct pt_regs
> *regs, long error_code)
> * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
> * address, return that address.
> */
> -static unsigned long get_kernel_gp_address(struct pt_regs *regs)
> +static bool get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr,
> + bool *non_canonical)

Yeah, that's pretty much the perfect end result!

> I guess that could potentially be useful if a #GP is triggered by
> something like an SSE alignment error? I'll add it in unless someone
> else complains.

Yeah - also it's correct information about the context of the fault, so
it probably cannot *hurt*, and will allow us to debug/validate everything
in this area faster.

> > > +#define GPFSTR "general protection fault"
> > > dotraplinkage void
> >
> > Please separate macro and function definitions by an additional newline.
>
> Will change it.

Thanks!

Ingo

Borislav Petkov

unread,
Nov 20, 2019, 7:39:34 AM11/20/19
to Ingo Molnar, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
On Wed, Nov 20, 2019 at 01:30:58PM +0100, Ingo Molnar wrote:
>
> * Jann Horn <ja...@google.com> wrote:
>
> > You mean something like this?
> >
> > ========================
> > diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> > index 9b23c4bda243..16a6bdaccb51 100644
> > --- a/arch/x86/kernel/traps.c
> > +++ b/arch/x86/kernel/traps.c
> > @@ -516,32 +516,36 @@ dotraplinkage void do_bounds(struct pt_regs
> > *regs, long error_code)
> > * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
> > * address, return that address.
> > */
> > -static unsigned long get_kernel_gp_address(struct pt_regs *regs)
> > +static bool get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr,
> > + bool *non_canonical)
>
> Yeah, that's pretty much the perfect end result!

Why do we need the bool thing? Can't we rely on the assumption that an
address of 0 is the error case and use that to determine whether the
resolving succeeded or not?

Borislav Petkov

unread,
Nov 20, 2019, 7:41:09 AM11/20/19
to Jann Horn, Ingo Molnar, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
Yap, and the 50 is a sufficiently large number so that all possible
string combinations in this case can fit in the resulting string array.

Thx.

Jann Horn

unread,
Nov 20, 2019, 7:42:59 AM11/20/19
to Borislav Petkov, Ingo Molnar, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
True, that'd work, too. I didn't really want to special-case an
integer here - especially one that has the same value as NULL - but I
guess it's fine.

Ingo Molnar

unread,
Nov 20, 2019, 8:16:32 AM11/20/19
to Jann Horn, Borislav Petkov, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
I'd much prefer this for, because it's a big honking warning for people
to not just assume things but double check the limits.

I.e. this mild obfuscation of the array size *helps* code quality in the
long run :-)

Thanks,

Ingo

Jann Horn

unread,
Nov 20, 2019, 8:23:36 AM11/20/19
to Ingo Molnar, Borislav Petkov, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
On Wed, Nov 20, 2019 at 2:16 PM Ingo Molnar <mi...@kernel.org> wrote:
> * Jann Horn <ja...@google.com> wrote:
>
> > On Wed, Nov 20, 2019 at 12:24 PM Borislav Petkov <b...@alien8.de> wrote:
> > > On Wed, Nov 20, 2019 at 12:18:59PM +0100, Ingo Molnar wrote:
> > > > How was this maximum string length of '90' derived? In what way will
> > > > that have to change if someone changes the message?
> > >
> > > That was me counting the string length in a dirty patch in a previous
> > > thread. We probably should say why we decided for a certain length and
> > > maybe have a define for it.
> >
> > Do you think something like this would be better?
> >
> > char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
>
> I'd much prefer this for, because it's a big honking warning for people
> to not just assume things but double check the limits.

Sorry, I can't parse the start of this sentence. I _think_ you're
saying you want me to make the change to "char desc[sizeof(GPFSTR) +
50 + 2*sizeof(unsigned long) + 1]"?

Ingo Molnar

unread,
Nov 20, 2019, 8:28:34 AM11/20/19
to Borislav Petkov, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen

* Borislav Petkov <b...@alien8.de> wrote:

I'd rather we not trust the decoder and the execution environment so much
that it never produces a 0 linear address in a #GP:

in get_addr_ref_32() we could get zero:

linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base;

in get_addr_ref_16() we could get zero too:

linear_addr = (unsigned long)(eff_addr & 0xffff) + seg_base;

Or in particularly exotic crashes we could get zero in get_addr_ref_64()
as well:

linear_addr = (unsigned long)eff_addr + seg_base;

although it's unlikely I suspect.

But the 32-bit case should be plausible enough?

It's also the simplest, most straightforward printout of the decoder
state: we either see an error, or an (address,canonical) pair of values.

Thanks,

Ingo

Borislav Petkov

unread,
Nov 20, 2019, 8:39:24 AM11/20/19
to Ingo Molnar, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen
On Wed, Nov 20, 2019 at 02:28:30PM +0100, Ingo Molnar wrote:
> I'd rather we not trust the decoder and the execution environment so much
> that it never produces a 0 linear address in a #GP:

I was just scratching my head whether I could trigger a #GP with address
of 0. But yeah, I agree, let's be really cautious here. I wouldn't want
to debug a #GP with a wrong address reported.

Thx.

Andi Kleen

unread,
Nov 20, 2019, 8:56:10 AM11/20/19
to Jann Horn, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
> Is there a specific concern you have about the instruction decoder? As
> far as I can tell, all the paths of insn_get_addr_ref() only work if
> the instruction has a mod R/M byte according to the instruction
> tables, and then figures out the address based on that. While that
> means that there's a wide variety of cases in which we won't be able
> to figure out the address, I'm not aware of anything specific that is
> likely to lead to false positives.

First there will be a lot of cases you'll just print 0, even
though 0 is canonical if there is no operand.

Then it might be that the address is canonical, but triggers
#GP anyways (e.g. unaligned SSE)

Or it might be the wrong address if there is an operand,
there are many complex instructions that reference something
in memory, and usually do canonical checking there.

And some other odd cases. For example when the instruction length
exceeds 15 bytes. I know there is fuzzing for the instruction
decoder, but it might be worth double checking it handles
all of that correctly. I'm not sure how good the fuzzer's coverage
is.

At a minimum you should probably check if the address is
actually non canonical. Maybe that's simple enough and weeds out
most cases.

-Andi

Ingo Molnar

unread,
Nov 20, 2019, 9:05:36 AM11/20/19
to Jann Horn, Borislav Petkov, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson, Andi Kleen

* Jann Horn <ja...@google.com> wrote:

> On Wed, Nov 20, 2019 at 2:16 PM Ingo Molnar <mi...@kernel.org> wrote:
> > * Jann Horn <ja...@google.com> wrote:
> >
> > > On Wed, Nov 20, 2019 at 12:24 PM Borislav Petkov <b...@alien8.de> wrote:
> > > > On Wed, Nov 20, 2019 at 12:18:59PM +0100, Ingo Molnar wrote:
> > > > > How was this maximum string length of '90' derived? In what way will
> > > > > that have to change if someone changes the message?
> > > >
> > > > That was me counting the string length in a dirty patch in a previous
> > > > thread. We probably should say why we decided for a certain length and
> > > > maybe have a define for it.
> > >
> > > Do you think something like this would be better?
> > >
> > > char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
> >
> > I'd much prefer this for, because it's a big honking warning for people
> > to not just assume things but double check the limits.
>
> Sorry, I can't parse the start of this sentence. I _think_ you're
> saying you want me to make the change to "char desc[sizeof(GPFSTR) +
> 50 + 2*sizeof(unsigned long) + 1]"?

Yeah, correct. There was an extra 'for' in my first sentence:

> > I'd much prefer this, because it's a big honking warning for people
> > to not just assume things but double check the limits.

Thanks,

Ingo

Jann Horn

unread,
Nov 20, 2019, 9:24:46 AM11/20/19
to Andi Kleen, Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
On Wed, Nov 20, 2019 at 2:56 PM Andi Kleen <a...@linux.intel.com> wrote:
> > Is there a specific concern you have about the instruction decoder? As
> > far as I can tell, all the paths of insn_get_addr_ref() only work if
> > the instruction has a mod R/M byte according to the instruction
> > tables, and then figures out the address based on that. While that
> > means that there's a wide variety of cases in which we won't be able
> > to figure out the address, I'm not aware of anything specific that is
> > likely to lead to false positives.
>
> First there will be a lot of cases you'll just print 0, even
> though 0 is canonical if there is no operand.

Why would I print zeroes if there is no operand? The decoder logic
returns a -1 if it can't find a mod r/m byte, which causes the #GP
handler to not print any address at all. Or are you talking about some
weird instruction that takes an operand that is actually ignored, or
something weird like that?

> Then it might be that the address is canonical, but triggers
> #GP anyways (e.g. unaligned SSE)

Which is an argument for printing the address even if it is canonical,
as Ingo suggested, I guess.

> Or it might be the wrong address if there is an operand,
> there are many complex instructions that reference something
> in memory, and usually do canonical checking there.

In which case you'd probably usually see a canonical address in the
instruction's argument, which causes the error message to not appear
(in patch v2/v3) / to be different (in my current draft for patch v4).
And as Ingo said over in the other thread, even if the argument is not
directly the faulting address at all, it might still help with
figuring out what's going on.

> And some other odd cases. For example when the instruction length
> exceeds 15 bytes.

But this is the #GP handler. Don't overlong instructions give you #UD instead?

> I know there is fuzzing for the instruction
> decoder, but it might be worth double checking it handles
> all of that correctly. I'm not sure how good the fuzzer's coverage
> is.
>
> At a minimum you should probably check if the address is
> actually non canonical. Maybe that's simple enough and weeds out
> most cases.

The patch you're commenting on does that already; quoting the patch:

+ /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
+ if (addr_ref >= ~__VIRTUAL_MASK)
+ return;
+
+ /* Bail out if the entire operand is in the canonical user half. */
+ if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
+ return;

But at Ingo's request, I'm planning to change that in the v4 patch;
see <https://lore.kernel.org/lkml/20191120111...@gmail.com/>
and <https://lore.kernel.org/lkml/CAG48ez0Frp4-+xHZ=UhbHh0hC_h-1VtJfwHw=kDo6Na...@mail.gmail.com/>.

Sean Christopherson

unread,
Nov 20, 2019, 11:21:46 AM11/20/19
to Borislav Petkov, Ingo Molnar, Jann Horn, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasan-dev, kernel list, Andrey Konovalov, Andy Lutomirski, Andi Kleen
On Wed, Nov 20, 2019 at 02:39:13PM +0100, Borislav Petkov wrote:
> On Wed, Nov 20, 2019 at 02:28:30PM +0100, Ingo Molnar wrote:
> > I'd rather we not trust the decoder and the execution environment so much
> > that it never produces a 0 linear address in a #GP:
>
> I was just scratching my head whether I could trigger a #GP with address
> of 0. But yeah, I agree, let's be really cautious here. I wouldn't want
> to debug a #GP with a wrong address reported.

It's definitely possible, there are a handful of non-SIMD instructions that
generate #GP(0) at CPL=0 in 64-bit mode *and* have a memory operand. Some
of them might even be legitimately encountered in the wild.

- CMPXCHG16B if it's not supported by the CPU.
- VMXON if CR4 is misconfigured or VMX isn't enabled in FEATURE_CONTROL.
- MONITOR if ECX has an invalid hint (although MONITOR hardcodes the
address in DS:RAX and so doesn't have a ModR/M byte).

Undoubtedly there are other instructions with similar sources of #GP.

Jann Horn

unread,
Nov 20, 2019, 12:02:20 PM11/20/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
To support evaluating 64-bit kernel mode instructions:

Replace existing checks for user_64bit_mode() with a new helper that
checks whether code is being executed in either 64-bit kernel mode or
64-bit user mode.

Select the GS base depending on whether the instruction is being
evaluated in kernel mode.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
v2:
no changes
v3:
no changes
v4:
no changes

arch/x86/include/asm/ptrace.h | 13 +++++++++++++
arch/x86/lib/insn-eval.c | 26 +++++++++++++++-----------
2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5057a8ed100b..ac45b06941a5 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -159,6 +159,19 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#endif
}

+/*
+ * Determine whether the register set came from any context that is running in
+ * 64-bit mode.
+ */
+static inline bool any_64bit_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64

Jann Horn

unread,
Nov 20, 2019, 12:02:23 PM11/20/19
to Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin, x...@kernel.org, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, kasa...@googlegroups.com, ja...@google.com, linux-...@vger.kernel.org, Andrey Konovalov, Andy Lutomirski, Sean Christopherson
A frequent cause of #GP exceptions are memory accesses to non-canonical
addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
the kernel doesn't currently print the fault address for #GP.
Luckily, we already have the necessary infrastructure for decoding X86
instructions and computing the memory address that is being accessed;
hook it up to the #GP handler so that we can figure out whether the #GP
looks like it was caused by a non-canonical address, and if so, print
that address.

While it is already possible to compute the faulting address manually by
disassembling the opcode dump and evaluating the instruction against the
register dump, this should make it slightly easier to identify crashes
at a glance.

Signed-off-by: Jann Horn <ja...@google.com>
---

Notes:
v2:
- print different message for segment-related GP (Borislav)
- rewrite check for non-canonical address (Sean)
- make it clear we don't know for sure why the GP happened (Andy)
v3:
- change message format to one line (Borislav)
v4:
- rename insn_bytes to insn_buf (Ingo)
- add space after GPFSTR (Ingo)
- make sizeof(desc) clearer (Ingo, Borislav)
- also print the address (with a different message) if it's canonical (Ingo)

I have already sent a patch to syzkaller that relaxes their parsing of GPF
messages (https://github.com/google/syzkaller/commit/432c7650) such that
changes like the one in this patch don't break it.
That patch has already made its way into syzbot's syzkaller instances
according to <https://syzkaller.appspot.com/upstream>.

arch/x86/kernel/traps.c | 64 +++++++++++++++++++++++++++++++++++++++--
1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c90312146da0..b90635f29b9f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -56,6 +56,8 @@
#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -509,11 +511,50 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
}

+/*
+ * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
+ * address, return that address.
+ */
+static bool get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr,
+ bool *non_canonical)
+{
+#ifdef CONFIG_X86_64
+ u8 insn_buf[MAX_INSN_SIZE];
+ struct insn insn;
+
+ if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+ return false;
+
+ kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
+ insn_get_modrm(&insn);
+ insn_get_sib(&insn);
+ *addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+
+ if (*addr == (unsigned long)-1L)
+ return false;
+
+ /*
+ * Check that:
+ * - the address is not in the kernel half or -1 (which means the
+ * decoder failed to decode it)
+ * - the last byte of the address is not in the user canonical half
+ */
+ *non_canonical = *addr < ~__VIRTUAL_MASK &&
+ *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK;
+
+ return true;
+#else
+ return false;
+#endif
+}
+
+#define GPFSTR "general protection fault"
+
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
- const char *desc = "general protection fault";
struct task_struct *tsk;
+ char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;

RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
@@ -531,6 +572,10 @@ do_general_protection(struct pt_regs *regs, long error_code)

tsk = current;
if (!user_mode(regs)) {
+ bool addr_resolved = false;
+ unsigned long gp_addr;
+ bool non_canonical;
+
if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
return;

@@ -547,8 +592,21 @@ do_general_protection(struct pt_regs *regs, long error_code)
return;

if (notify_die(DIE_GPF, desc, regs, error_code,
- X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- die(desc, regs, error_code);
+ X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
+ return;
+
+ if (error_code)
+ snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
+ else
+ addr_resolved = get_kernel_gp_address(regs, &gp_addr,
+ &non_canonical);
+
+ if (addr_resolved)
+ snprintf(desc, sizeof(desc),
+ GPFSTR " probably for %saddress 0x%lx",
+ non_canonical ? "non-canonical " : "", gp_addr);

Jann Horn

unread,
Nov 20, 2019, 12:02:26 PM11/20/19