Src dump of the RIP address
/vmlinux: file format elf64-x86-64
Disassembly of section .text:
ffffffff810ac394 <next_pidmap>:
}
return -1;
}
int next_pidmap(struct pid_namespace *pid_ns, int last)
{
ffffffff810ac394: 55 push %rbp
ffffffff810ac395: 48 89 e5 mov %rsp,%rbp
ffffffff810ac398: 41 55 push %r13
ffffffff810ac39a: 41 54 push %r12
ffffffff810ac39c: 53 push %rbx
ffffffff810ac39d: 48 83 ec 08 sub $0x8,%rsp
ffffffff810ac3a1: e8 1a 88 e7 00 callq ffffffff81f24bc0 <mcount>
int offset;
struct pidmap *map, *end;
offset = (last + 1) & BITS_PER_PAGE_MASK;
ffffffff810ac3a6: ff c6 inc %esi
}
return -1;
}
int next_pidmap(struct pid_namespace *pid_ns, int last)
{
ffffffff810ac3a8: 49 89 fc mov %rdi,%r12
int offset;
struct pidmap *map, *end;
offset = (last + 1) & BITS_PER_PAGE_MASK;
ffffffff810ac3ab: 89 f2 mov %esi,%edx
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
end = &pid_ns->pidmap[PIDMAP_ENTRIES];
ffffffff810ac3ad: 4c 8d af 08 08 00 00 lea 0x808(%rdi),%r13
{
int offset;
struct pidmap *map, *end;
offset = (last + 1) & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
ffffffff810ac3b4: 48 63 f6 movslq %esi,%rsi
int next_pidmap(struct pid_namespace *pid_ns, int last)
{
int offset;
struct pidmap *map, *end;
offset = (last + 1) & BITS_PER_PAGE_MASK;
ffffffff810ac3b7: 81 e2 ff 7f 00 00 and $0x7fff,%edx
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
ffffffff810ac3bd: 48 c1 ee 0f shr $0xf,%rsi
ffffffff810ac3c1: 48 c1 e6 04 shl $0x4,%rsi
ffffffff810ac3c5: 48 8d 5c 37 08 lea 0x8(%rdi,%rsi,1),%rbx
end = &pid_ns->pidmap[PIDMAP_ENTRIES];
for (; map < end; map++, offset = 0) {
ffffffff810ac3ca: eb 36 jmp
ffffffff810ac402 <next_pidmap+0x6e>
if (unlikely(!map->page))
ffffffff810ac3cc: 48 8b 7b 08 mov 0x8(%rbx),%rdi
ffffffff810ac3d0: 48 85 ff test %rdi,%rdi
ffffffff810ac3d3: 74 27 je
ffffffff810ac3fc <next_pidmap+0x68>
continue;
offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
ffffffff810ac3d5: 48 63 d2 movslq %edx,%rdx
ffffffff810ac3d8: be 00 80 00 00 mov $0x8000,%esi
ffffffff810ac3dd: e8 66 ab 3e 00 callq
ffffffff81496f48 <find_next_bit>
if (offset < BITS_PER_PAGE)
ffffffff810ac3e2: 3d ff 7f 00 00 cmp $0x7fff,%eax
ffffffff810ac3e7: 77 13 ja
ffffffff810ac3fc <next_pidmap+0x68>
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
static inline int mk_pid(struct pid_namespace *pid_ns,
struct pidmap *map, int off)
{
return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
ffffffff810ac3e9: 49 83 c4 08 add $0x8,%r12
ffffffff810ac3ed: 4c 29 e3 sub %r12,%rbx
ffffffff810ac3f0: 48 c1 fb 04 sar $0x4,%rbx
ffffffff810ac3f4: c1 e3 0f shl $0xf,%ebx
ffffffff810ac3f7: 8d 04 03 lea (%rbx,%rax,1),%eax
for (; map < end; map++, offset = 0) {
if (unlikely(!map->page))
continue;
offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
if (offset < BITS_PER_PAGE)
return mk_pid(pid_ns, map, offset);
ffffffff810ac3fa: eb 0e jmp
ffffffff810ac40a <next_pidmap+0x76>
struct pidmap *map, *end;
offset = (last + 1) & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
end = &pid_ns->pidmap[PIDMAP_ENTRIES];
for (; map < end; map++, offset = 0) {
ffffffff810ac3fc: 48 83 c3 10 add $0x10,%rbx
ffffffff810ac400: 31 d2 xor %edx,%edx
ffffffff810ac402: 4c 39 eb cmp %r13,%rbx
ffffffff810ac405: 72 c5 jb
ffffffff810ac3cc <next_pidmap+0x38>
ffffffff810ac407: 83 c8 ff or $0xffffffffffffffff,%eax
offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
if (offset < BITS_PER_PAGE)
return mk_pid(pid_ns, map, offset);
}
return -1;
}
ffffffff810ac40a: 41 5b pop %r11
ffffffff810ac40c: 5b pop %rbx
ffffffff810ac40d: 41 5c pop %r12
ffffffff810ac40f: 41 5d pop %r13
ffffffff810ac411: c9 leaveq
--
Robert Święcki
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
I'm surprised by the discrepancy between this oops, and kdb memory dump:
From oops/kdb (from address 0xffffffff810ac3a1, the mcount call site near the start of next_pidmap)
Code: 0f 1f 44 00 00 ff c6 49 89 fc 89 f2 4c 8d af 08 08 00 00 48....
From objdump vmlinux (the same address)
Code: e8 1a 88 e7 00 ff c6 49 89 fc 89 f2 4c 8d af 08 08 00 00 48...
Which decodes to
kdb:
0: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
5: ff c6 inc %esi
7: 49 89 fc mov %rdi,%r12
a: 89 f2 mov %esi,%edx
c: 4c 8d af 08 08 00 00 lea 0x808(%rdi),%r13
vmlinux:
ffffffff810ac3a1: e8 1a 88 e7 00 callq ffffffff81f24bc0 <mcount>
ffffffff810ac3a6: ff c6 inc %esi
ffffffff810ac3a8: 49 89 fc mov %rdi,%r12
ffffffff810ac3ab: 89 f2 mov %esi,%edx
ffffffff810ac3ad: 4c 8d af 08 08 00 00 lea 0x808(%rdi),%r13
Might this difference (nopl 0x0(%rax,%rax,1) vs callq
ffffffff81f24bc0 <mcount> at 0xffffffff810ac3a1) be some kind of
kernel instrumentation (ftrace, perf or so), or a symptom of a bug
(overwritten memory)?
And a quick kgdb session:
(gdb) target remote /dev/ttyS0
Remote debugging using /dev/ttyS0
next_pidmap (pid_ns=0xffffffff82827000, last=<value optimized out>) at
kernel/pid.c:229
229 if (unlikely(!map->page))
(gdb) p map
$1 = (struct pidmap *) 0x1fffff82745c68
(gdb) p map->page
Cannot access memory at address 0x1fffff82745c70
(gdb) bt
#0 next_pidmap (pid_ns=0xffffffff82827000, last=<value optimized
out>) at kernel/pid.c:229
#1 0xffffffff810ac446 in find_ge_pid (nr=-1889315394,
ns=0xffffffff82827000) at kernel/pid.c:527
#2 0xffffffff811aedbb in next_tgid (ns=0xffffffff82827000, iter=...)
at fs/proc/base.c:3087
#3 0xffffffff811aef6f in proc_pid_readdir (filp=0xffff880020155840,
dirent=<value optimized out>, filldir=0xffffffff811992c0
<compat_fillonedir>) at fs/proc/base.c:3146
#4 0xffffffff811ab262 in proc_root_readdir (filp=0xffff880020155840,
dirent=0xffff8800c82d1f48, filldir=0xffffffff811992c0
<compat_fillonedir>) at fs/proc/root.c:159
#5 0xffffffff8116bd57 in vfs_readdir (file=0xffff880020155840,
filler=0xffffffff811992c0 <compat_fillonedir>, buf=0xffff8800c82d1f48)
at fs/readdir.c:40
#6 0xffffffff81197da9 in compat_sys_old_readdir (fd=<value optimized
out>, dirent=0xf777a000, count=<value optimized out>) at
fs/compat.c:901
#7 <signal handler called>
#8 0x00000000080658d2 in ?? ()
Cannot access memory at address 0x29d58137
(gdb) up
#1 0xffffffff810ac446 in find_ge_pid (nr=-1889315394,
ns=0xffffffff82827000) at kernel/pid.c:527
527 nr = next_pidmap(ns, nr);
(gdb) p nr
$5 = -1889315394
(gdb) up
#2 0xffffffff811aedbb in next_tgid (ns=0xffffffff82827000, iter=...)
at fs/proc/base.c:3087
3087 pid = find_ge_pid(iter.tgid, ns);
(gdb) p iter
$8 = {tgid = 2405651902, task = 0x0}
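Might it be an unsigned (2405651902) vs. signed (-1889315394) problem? The
map is computed as
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
so a negative `last` produces an index far outside pid_ns->pidmap[], which
would explain the bogus map pointer (0x1fffff82745c68) seen above.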
and a repro - should be quite obvious for FS folks, I guess
filp->f_pos needs better checking in proc's readdir (or in llseek).
Works with at least 2.6.32 - 2.6.39-rc3
=====================================================
// Found by Tavis Ormandy's (tav...@cmpxchg8b.com):
// http://code.google.com/p/iknowthis/
// Analyzed by Robert Swiecki <rob...@swiecki.net>
#define _GNU_SOURCE 1
#define _LARGEFILE64_SOURCE
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
/*
 * Reproducer: open /proc as a directory, move its file position to a very
 * large offset (4000000000) and then request directory entries through the
 * raw getdents syscall.
 *
 * Each step is checked so that a failure is reported with perror() instead
 * of being silently ignored; the descriptor is closed before returning.
 *
 * Returns 0 when every call succeeds, -1 when any of them fails.
 */
int main(void)
{
	/* Layout used only to size (and align) the getdents buffer. */
	struct linux_dirent {
		long d_ino;
		off_t d_off;
		unsigned short d_reclen;
		char d_name[];
	};

	/*
	 * A struct with a flexible array member may not be an array element
	 * in ISO C, so reserve room for 100 entries as raw bytes instead.
	 * The union keeps the buffer aligned for struct linux_dirent and has
	 * the same total size as an array of 100 such structs.
	 */
	union {
		struct linux_dirent align;
		char bytes[100 * sizeof(struct linux_dirent)];
	} buf;
	int rc = 0;

	int fd = open("/proc", O_DIRECTORY | O_RDONLY);
	if (fd == -1) {
		perror("open");
		return -1;
	}

	/* The out-of-range position is the whole point of the test. */
	if (lseek64(fd, 4000000000ULL, SEEK_SET) == (off64_t)-1) {
		perror("lseek64");
		rc = -1;
	} else if (syscall(__NR_getdents, fd, &buf, sizeof(buf)) == -1) {
		perror("getdents");
		rc = -1;
	}

	close(fd);
	return rc;
}
=====================================================
On Mon, Apr 18, 2011 at 02:57:55PM +0200, Robert Święcki wrote:
> and a repro - should be quite obvious for FS folks, I guess
> filp->f_pos needs better checking in proc's readdir (or in llseek).
Yup:
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
..
for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
..
}
ns = filp->f_dentry->d_sb->s_fs_info;
iter.task = NULL;
iter.tgid = filp->f_pos - TGID_OFFSET;
..
There's no test to validate f_pos. If it's out of bounds, the "for"
just doesn't run.
-Kees
--
Kees Cook
Ubuntu Security Team