
[PATCH] fs/proc/kcore.c: Omit kernel text area for hardened usercopy feature


Jiri Olsa

Sep 2, 2016, 8:25:56 AM
to lkml, Kees Cook, Ingo Molnar, Adrian Hunter, Andi Kleen, KAMEZAWA Hiroyuki, Linus Torvalds
One of the bullets for the hardened usercopy feature is:
- object must not overlap with kernel text

which is what we expose via /proc/kcore. We can hit
this check and crash the system very easily just by
reading the text area in the kcore file:

usercopy: kernel memory exposure attempt detected from ffffffff8179a01f (<kernel text>) (4065 bytes)
kernel BUG at mm/usercopy.c:75!

Omit the kernel text area from kcore when the
hardened usercopy feature is enabled.

Fixes: f5509cc18daa ("mm: Hardened usercopy")
Reported-by: Steve Best <sb...@redhat.com>
Cc: Kees Cook <kees...@chromium.org>
Signed-off-by: Jiri Olsa <jo...@kernel.org>
---
fs/proc/kcore.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a939f5ed7f89..e322d4e0be4d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -629,8 +629,12 @@ static int __init proc_kcore_init(void)
pr_err("couldn't create /proc/kcore\n");
return 0; /* Always returns 0. */
}
+
+#ifndef CONFIG_HARDENED_USERCOPY
/* Store text area if it's special */
proc_kcore_text_init();
+#endif
+
/* Store vmalloc area */
kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
--
2.7.4
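
For reference, a minimal userspace sketch of the reproducer described above. This is hypothetical and illustrative only: the real text-segment file offset must be taken from the PT_LOAD program headers of /proc/kcore (e.g. with readelf -l), so KCORE_TEXT_OFFSET below is a stand-in, not a fixed value.

/* Hypothetical reproducer sketch: read a page from a kcore offset
 * that maps kernel text.  With CONFIG_HARDENED_USERCOPY=y this
 * read trips the usercopy BUG quoted above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-in value; obtain the real text-segment offset from the
 * PT_LOAD headers of /proc/kcore (e.g. readelf -l /proc/kcore). */
#define KCORE_TEXT_OFFSET 0x7fff81000000UL

int main(void)
{
	char buf[4096];
	int fd = open("/proc/kcore", O_RDONLY);	/* needs CAP_SYS_RAWIO */

	if (fd < 0) {
		perror("open /proc/kcore");
		return 1;
	}
	if (lseek(fd, KCORE_TEXT_OFFSET, SEEK_SET) < 0 ||
	    read(fd, buf, sizeof(buf)) < 0)
		perror("lseek/read");
	close(fd);
	return 0;
}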

Andi Kleen

Sep 2, 2016, 11:17:23 AM
to Jiri Olsa, lkml, Kees Cook, Ingo Molnar, Adrian Hunter, Andi Kleen, KAMEZAWA Hiroyuki, Linus Torvalds
On Fri, Sep 02, 2016 at 02:25:45PM +0200, Jiri Olsa wrote:
> One of the bullets for the hardened usercopy feature is:
> - object must not overlap with kernel text
>
> which is what we expose via /proc/kcore. We can hit
> this check and crash the system very easily just by
> reading the text area in the kcore file:
>
> usercopy: kernel memory exposure attempt detected from ffffffff8179a01f (<kernel text>) (4065 bytes)
> kernel BUG at mm/usercopy.c:75!
>
> Omit the kernel text area from kcore when the
> hardened usercopy feature is enabled.

That will completely break PT decoding, which relies on looking
at the kernel text in /proc/kcore.

Need a different fix here, perhaps some special copy function
that is not hardened.

-Andi

Jiri Olsa

Sep 2, 2016, 12:15:41 PM
to Andi Kleen, Jiri Olsa, lkml, Kees Cook, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki, Linus Torvalds
ok, I'll try to come up with something

thanks,
jirka

Jiri Olsa

Sep 5, 2016, 4:47:36 AM
to Andi Kleen, Jiri Olsa, lkml, Kees Cook, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki, Linus Torvalds
On Fri, Sep 02, 2016 at 08:17:13AM -0700, Andi Kleen wrote:
how about something like this

jirka


---
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c3f291195294..43f5404f0e61 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -726,7 +726,8 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
 }
 
 static inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long n)
+copy_to_user_check(void __user *to, const void *from,
+		   unsigned long n, bool check)
 {
 	int sz = __compiletime_object_size(from);
 
@@ -735,7 +736,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
 	might_fault();
 
 	if (likely(sz < 0 || sz >= n)) {
-		check_object_size(from, n, true);
+		if (check)
+			check_object_size(from, n, true);
 		n = _copy_to_user(to, from, n);
 	} else if (!__builtin_constant_p(n))
 		copy_user_overflow(sz, n);
@@ -745,6 +747,19 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
 	return n;
 }
 
+static inline unsigned long __must_check
+copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	return copy_to_user_check(to, from, n, true);
+}
+
+static inline unsigned long __must_check
+copy_to_user_nocheck(void __user *to, const void *from, unsigned long n)
+{
+	return copy_to_user_check(to, from, n, false);
+}
+
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 673059a109fe..e80e4a146b7d 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -50,7 +50,7 @@ __must_check unsigned long
 copy_in_user(void __user *to, const void __user *from, unsigned len);
 
 static __always_inline __must_check
-int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
+int __copy_from_user_nofaultcheck(void *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
 
@@ -112,7 +112,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
 	might_fault();
 	kasan_check_write(dst, size);
-	return __copy_from_user_nocheck(dst, src, size);
+	return __copy_from_user_nofaultcheck(dst, src, size);
 }
 
 static __always_inline __must_check
@@ -248,7 +248,7 @@ static __must_check __always_inline int
 __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
 {
 	kasan_check_write(dst, size);
-	return __copy_from_user_nocheck(dst, src, size);
+	return __copy_from_user_nofaultcheck(dst, src, size);
 }
 
 static __must_check __always_inline int

static __must_check __always_inline int
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a939f5ed7f89..c7a22a8a157e 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -516,7 +516,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		if (kern_addr_valid(start)) {
 			unsigned long n;
 
-			n = copy_to_user(buffer, (char *)start, tsz);
+			n = copy_to_user_nocheck(buffer, (char *)start, tsz);
 			/*
 			 * We cannot distinguish between fault on source
 			 * and fault on destination. When this happens

Andi Kleen

Sep 5, 2016, 12:27:52 PM
to Jiri Olsa, Andi Kleen, Jiri Olsa, lkml, Kees Cook, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki, Linus Torvalds
On Mon, Sep 05, 2016 at 10:47:22AM +0200, Jiri Olsa wrote:
> On Fri, Sep 02, 2016 at 08:17:13AM -0700, Andi Kleen wrote:
> > On Fri, Sep 02, 2016 at 02:25:45PM +0200, Jiri Olsa wrote:
> > > One of the bullets for the hardened usercopy feature is:
> > > - object must not overlap with kernel text
> > >
> > > which is what we expose via /proc/kcore. We can hit
> > > this check and crash the system very easily just by
> > > reading the text area in the kcore file:
> > >
> > > usercopy: kernel memory exposure attempt detected from ffffffff8179a01f (<kernel text>) (4065 bytes)
> > > kernel BUG at mm/usercopy.c:75!
> > >
> > > Omit the kernel text area from kcore when the
> > > hardened usercopy feature is enabled.
> >
> > That will completely break PT decoding, which relies on looking
> > at the kernel text in /proc/kcore.
> >
> > Need a different fix here, perhaps some special copy function
> > that is not hardened.
>
> how about something like this

Looks good to me, but you would need the *_nocheck variant for non-x86
architectures too, of course.

-Andi
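
For the non-x86 case, a generic fallback could be as simple as forwarding to the ordinary copy_to_user(). A hypothetical sketch for architectures without hardened usercopy checks, not part of the posted patch:

/* Hypothetical generic fallback: architectures without a hardened
 * copy_to_user() have no check to skip, so the nocheck variant can
 * simply forward the call unchanged. */
static inline unsigned long __must_check
copy_to_user_nocheck(void __user *to, const void *from, unsigned long n)
{
	return copy_to_user(to, from, n);
}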

Kees Cook

Sep 6, 2016, 1:56:54 PM
to Jiri Olsa, Andi Kleen, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki, Linus Torvalds
On Mon, Sep 5, 2016 at 4:47 AM, Jiri Olsa <jo...@redhat.com> wrote:
> On Fri, Sep 02, 2016 at 08:17:13AM -0700, Andi Kleen wrote:
>> On Fri, Sep 02, 2016 at 02:25:45PM +0200, Jiri Olsa wrote:
>> > One of the bullets for the hardened usercopy feature is:
>> > - object must not overlap with kernel text
>> >
>> > which is what we expose via /proc/kcore. We can hit
>> > this check and crash the system very easily just by
>> > reading the text area in the kcore file:
>> >
>> > usercopy: kernel memory exposure attempt detected from ffffffff8179a01f (<kernel text>) (4065 bytes)
>> > kernel BUG at mm/usercopy.c:75!

It looks like it was this one in kcore.c:

		} else {
			if (kern_addr_valid(start)) {
				unsigned long n;

This patch is x86-specific (but ARCH_PROC_KCORE_TEXT is on multiple
architectures), which I don't think we want. Instead, let's get the
usercopy helper code centralized (Al Viro is looking at this already),
and then we can design arch-agnostic methods to handle this.

In the meantime, how about continuing to use a bounce buffer like
already done in the is_vmalloc_or_module_addr() case immediately above?

-Kees

--
Kees Cook
Nexus Security
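
For reference, the bounce-buffer pattern Kees refers to: the is_vmalloc_or_module_addr() branch of read_kcore() as it stood at the time (the same code appears as the removed lines in the final patch further down):

		} else if (is_vmalloc_or_module_addr((void *)start)) {
			char * elf_buf;

			/* bounce buffer: copy into kernel memory first,
			 * then copy_to_user() from the kernel buffer */
			elf_buf = kzalloc(tsz, GFP_KERNEL);
			if (!elf_buf)
				return -ENOMEM;
			vread(elf_buf, (char *)start, tsz);
			/* we have to zero-fill user buffer even if no read */
			if (copy_to_user(buffer, elf_buf, tsz)) {
				kfree(elf_buf);
				return -EFAULT;
			}
			kfree(elf_buf);
		}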

Linus Torvalds

Sep 6, 2016, 2:34:36 PM
to Kees Cook, Jiri Olsa, Andi Kleen, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
On Tue, Sep 6, 2016 at 10:56 AM, Kees Cook <kees...@chromium.org> wrote:
>
> In the meantime, how about continuing to use a bounce buffer like
> already done in the vmalloc_or_module_addr() case immediately above?

Yes please. Let's not make up even more user access functions with
magical properties just for some special-case code in /proc/kcore.

Linus

Andi Kleen

Sep 6, 2016, 3:41:24 PM
to Linus Torvalds, Kees Cook, Jiri Olsa, Andi Kleen, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
I suspect it's more than just /proc/kcore; there could also be
legitimate cases to read kernel text from /dev/mem or /dev/kmem.

I suppose we could add bounce buffers everywhere.

-Andi

Linus Torvalds

Sep 6, 2016, 3:48:26 PM
to Andi Kleen, Kees Cook, Jiri Olsa, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
On Tue, Sep 6, 2016 at 12:41 PM, Andi Kleen <an...@firstfloor.org> wrote:
>
> I suspect it's more than just /proc/kcore, there could be also
> legitimate cases to read kernel text from /dev/mem or /dev/kmem

Yes, that's probably true. Although I suspect that we should just say
that user-copy hardening is incompatible with /dev/kmem and
!STRICT_DEVMEM.

At least Fedora seems to have

CONFIG_DEVMEM=y
# CONFIG_DEVKMEM is not set
CONFIG_STRICT_DEVMEM=y

which should mean that you already cannot access normal RAM using
/dev/[k]mem - i.e. it's purely for legacy X server kinds of
situations.

So we could just make HARDENED_USERCOPY force those settings. It's
not like you should ever have anything else in any situation where you
care about security *anyway*, so...

Linus

Jiri Olsa

Sep 7, 2016, 3:33:13 AM
to Kees Cook, Andi Kleen, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki, Linus Torvalds
On Tue, Sep 06, 2016 at 01:56:40PM -0400, Kees Cook wrote:

SNIP

> > static __must_check __always_inline int
> > diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> > index a939f5ed7f89..c7a22a8a157e 100644
> > --- a/fs/proc/kcore.c
> > +++ b/fs/proc/kcore.c
> > @@ -516,7 +516,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> >  		if (kern_addr_valid(start)) {
> >  			unsigned long n;
> >
> > -			n = copy_to_user(buffer, (char *)start, tsz);
> > +			n = copy_to_user_nocheck(buffer, (char *)start, tsz);
> >  			/*
> >  			 * We cannot distinguish between fault on source
> >  			 * and fault on destination. When this happens
>
> This patch is x86-specific (but ARCH_PROC_KCORE_TEXT is on multiple
> architectures), which I don't think we want. Instead, let's get the
> usercopy helper code centralized (Al Viro is looking at this already),
> and then we can design arch-agnostic methods to handle this.
>
> In the meantime, how about continuing to use a bounce buffer like
> already done in the is_vmalloc_or_module_addr() case immediately above?

ok, sounds good.. so something like below? untested

thanks,
jirka


---
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a939f5ed7f89..de07c273f725 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -515,8 +515,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	} else {
 		if (kern_addr_valid(start)) {
 			unsigned long n;
+			char *buf;
 
-			n = copy_to_user(buffer, (char *)start, tsz);
+			buf = kzalloc(tsz, GFP_KERNEL);
+			if (!buf)
+				return -ENOMEM;
+
+			/*
+			 * Using bounce buffer to bypass the hardened
+			 * user copy kernel text checks.
+			 */
+			memcpy(buf, (char *) start, tsz);
+
+			n = copy_to_user(buffer, buf, tsz);
+			kfree(buf);

Andi Kleen

Sep 7, 2016, 12:39:02 PM
to Jiri Olsa, Kees Cook, Andi Kleen, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki, Linus Torvalds
> ---
> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> index a939f5ed7f89..de07c273f725 100644
> --- a/fs/proc/kcore.c
> +++ b/fs/proc/kcore.c
> @@ -515,8 +515,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
>  	} else {
>  		if (kern_addr_valid(start)) {
>  			unsigned long n;
> +			char *buf;
>
> -			n = copy_to_user(buffer, (char *)start, tsz);
> +			buf = kzalloc(tsz, GFP_KERNEL);

You have to add some limit and a loop, otherwise a user can eat all kernel memory,
and copies > KMALLOC_MAX wouldn't work anyway. Probably best to just use a single page.

Also you don't need kzalloc; kmalloc is sufficient.

-Andi

Linus Torvalds

Sep 7, 2016, 12:58:22 PM
to Andi Kleen, Jiri Olsa, Kees Cook, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
On Wed, Sep 7, 2016 at 9:38 AM, Andi Kleen <an...@firstfloor.org> wrote:
>>
>> -			n = copy_to_user(buffer, (char *)start, tsz);
>> +			buf = kzalloc(tsz, GFP_KERNEL);
>
> You have to add some limit and a loop, otherwise a user can eat all kernel memory,
> or copies > KMALLOC_MAX wouldn't work. Probably only get a single page.

'start' and 'tsz' are already chunked to be aligned pages (well, as
aligned as they can be: the beginning and end obviously won't be).
Above the loop:

	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
		tsz = buflen;

and then inside the loop:

	tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);

so it's already limited to one page.

That said, it *might* be worth moving the temporary allocation to the
top, or even moving it into open_kcore(). It used to be a special case
for just the vmalloc region; now it's always done.

So instead of having two different copies of the same special case for
the two different cases, why not try to unify them and just have one
common (page-sized) buffer allocation?

Linus

Kees Cook

Sep 7, 2016, 1:17:12 PM
to Linus Torvalds, Andi Kleen, Jiri Olsa, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
!DEVKMEM is easy to represent, but STRICT_DEVMEM=y gets a little ugly,
since the logic desired is actually "STRICT_DEVMEM=y if STRICT_DEVMEM
available" and STRICT_DEVMEM looks like this:

config STRICT_DEVMEM
	bool "Filter access to /dev/mem"
	depends on MMU
	depends on ARCH_HAS_DEVMEM_IS_ALLOWED

But I don't want to limit hardened usercopy to MMU only, so...

	depends on !DEVKMEM
	depends on STRICT_DEVMEM=y || !ARCH_HAS_DEVMEM_IS_ALLOWED || !MMU

This looks a bit ugly to me, but I'm happy to add it if people think
it's worth it.

Linus Torvalds

Sep 7, 2016, 1:24:33 PM
to Kees Cook, Andi Kleen, Jiri Olsa, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
On Wed, Sep 7, 2016 at 10:17 AM, Kees Cook <kees...@chromium.org> wrote:
>
> !DEVKMEM is easy to represent, but STRICT_DEVMEM=y gets a little ugly,

I think you can just do

config STRICT_DEVMEM
	bool "Filter access to /dev/mem" if !HARDENED_USERCOPY
	depends on MMU
	depends on ARCH_HAS_DEVMEM_IS_ALLOWED
	default y

i.e. you put the "if !HARDENED_USERCOPY" on the *question*, so that if
HARDENED_USERCOPY is set you'll never actually ask it.

Or you just make it go the other way, and make HARDENED_USERCOPY
depend on STRICT_DEVMEM.

Linus
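
The other direction would look roughly like this. A hypothetical Kconfig sketch combining Linus's suggestion with Kees's dependency list above; the option name and prompt match the real HARDENED_USERCOPY option, but this exact change was never posted:

# Hypothetical sketch: invert the dependency so HARDENED_USERCOPY
# carries the /dev/mem restrictions itself.
config HARDENED_USERCOPY
	bool "Harden memory copies between kernel and userspace"
	depends on !DEVKMEM
	depends on STRICT_DEVMEM || !ARCH_HAS_DEVMEM_IS_ALLOWED || !MMU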

Jiri Olsa

Sep 7, 2016, 3:26:11 PM
to Linus Torvalds, Andi Kleen, Kees Cook, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
ok, sounds good.. will repost soon

jirka

Jiri Olsa

Sep 7, 2016, 5:25:03 PM
to Linus Torvalds, Andi Kleen, Kees Cook, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
I'll give this more testing, but it looks ok so far.
Maybe it should also be split into two parts:
- adding the common buffer
- ktext using the bounce buffer

jirka


diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a939f5ed7f89..7405b3894183 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
+	char *buf = file->private_data;
 	ssize_t acc = 0;
 	size_t size, tsz;
 	size_t elf_buflen;
@@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
 		} else if (is_vmalloc_or_module_addr((void *)start)) {
-			char * elf_buf;
-
-			elf_buf = kzalloc(tsz, GFP_KERNEL);
-			if (!elf_buf)
-				return -ENOMEM;
-			vread(elf_buf, (char *)start, tsz);
+			vread(buf, (char *)start, tsz);
 			/* we have to zero-fill user buffer even if no read */
-			if (copy_to_user(buffer, elf_buf, tsz)) {
-				kfree(elf_buf);
+			if (copy_to_user(buffer, buf, tsz))
 				return -EFAULT;
-			}
-			kfree(elf_buf);
 		} else {
 			if (kern_addr_valid(start)) {
 				unsigned long n;
 
-				n = copy_to_user(buffer, (char *)start, tsz);
+				/*
+				 * Using bounce buffer to bypass the hardened
+				 * user copy kernel text checks.
+				 */
+				memcpy(buf, (char *) start, tsz);
+				n = copy_to_user(buffer, buf, tsz);
 				/*
 				 * We cannot distinguish between fault on source
 				 * and fault on destination. When this happens
@@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp)
 {
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
+
+	filp->private_data = (void *) __get_free_pages(GFP_KERNEL, 0);
+	if (!filp->private_data)
+		return -ENOMEM;
+
 	if (kcore_need_update)
 		kcore_update_ram();
 	if (i_size_read(inode) != proc_root_kcore->size) {
@@ -556,13 +559,20 @@ static int open_kcore(struct inode *inode, struct file *filp)
 		i_size_write(inode, proc_root_kcore->size);
 		inode_unlock(inode);
 	}
+
 	return 0;
 }
 
+static int release_kcore(struct inode *inode, struct file *file)
+{
+	free_pages((unsigned long) file->private_data, 0);
+	return 0;
+}
 
 static const struct file_operations proc_kcore_operations = {
 	.read = read_kcore,
 	.open = open_kcore,
+	.release = release_kcore,
 	.llseek = default_llseek,
 };

Linus Torvalds

Sep 7, 2016, 6:52:45 PM
to Jiri Olsa, Andi Kleen, Kees Cook, Jiri Olsa, lkml, Ingo Molnar, Adrian Hunter, KAMEZAWA Hiroyuki
On Wed, Sep 7, 2016 at 2:24 PM, Jiri Olsa <jo...@redhat.com> wrote:
>
> I'll give this more testing, but it looks ok so far,

Looks fine to me.

I'd perhaps suggest using a simpler interface than
"__get_free_pages(GFP_KERNEL, 0);".

If nothing else, just drop the "order" argument and use
"__get_free_page()", but maybe even just use "kmalloc/kfree".

Linus
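
Applied to the patch above, that simplification might look like this. A sketch of Linus's kmalloc/kfree suggestion, hypothetical and untested; the per-open bounce buffer stays one PAGE_SIZE:

	/* in open_kcore(): kmalloc instead of __get_free_pages() */
	filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!filp->private_data)
		return -ENOMEM;

static int release_kcore(struct inode *inode, struct file *file)
{
	kfree(file->private_data);	/* pairs with the kmalloc() in open_kcore() */
	return 0;
}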