I tracked this down to is_hugepage_only_range() (include/asm-ppc64/page.h),
which calls touches_hugepage_low_range(), which in turn checks
current->mm->context.htlb_segs. The problem is that exit_mm()
clears tsk->mm before calling mmput(), so when mmput() goes on to call
exit_aio() and the unmap path reaches this check, current->mm is no
longer set, hence the panic. It looks like is_hugepage_only_range() is
only used on ia64 and ppc64. A possible fix is to change
is_hugepage_only_range() to take an 'mm' parameter in addition to 'addr'
and 'len', so that the ppc64 code can use 'mm' instead of current->mm.
It looks like this has been broken for quite a while.
Here's the stack trace:
cpu 0x2: Vector: 300 (Data Access) at [c0000001d1be7590]
pc: c000000000092960: .unmap_region+0x17c/0x4a4
lr: c000000000092bb0: .unmap_region+0x3cc/0x4a4
sp: c0000001d1be7810
msr: 8000000000009032
dar: 298
dsisr: 40000000
current = 0xc000000001dd77b0
paca = 0xc000000000595c00
pid = 11336, comm = aiodio_readoff
[c0000001d1be78e0] c000000000093d08 .do_munmap+0x240/0x408
[c0000001d1be79b0] c0000000000d11b4 .aio_free_ring+0x10c/0x1d8
[c0000001d1be7a50] c0000000000d162c .__put_ioctx+0x84/0x120
[c0000001d1be7af0] c0000000000d3640 .exit_aio+0xf4/0x100
[c0000001d1be7b80] c00000000004dfd4 .mmput+0x80/0x15c
[c0000001d1be7c20] c000000000053648 .exit_mm+0x1b4/0x264
[c0000001d1be7cc0] c0000000000555ac .do_exit+0x10c/0xdb0
[c0000001d1be7d90] c0000000000562a8 .do_group_exit+0x58/0xd8
[c0000001d1be7e30] c00000000000d500 syscall_exit+0x0/0x18
Here's a program that produces the panic:
(compile using cc -o aiodio_read aiodio_read.c -laio).
--------------------------
#define _XOPEN_SOURCE 600
#define _GNU_SOURCE
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <libaio.h>
/* Page size; set from getpagesize() in main(). */
int pagesize;
/* I/O buffer; allocated in main() with posix_memalign(). */
char *iobuf;
/* AIO context; zeroed and passed to io_queue_init() in main(). */
io_context_t myctx;
/* Passed as the first argument of io_queue_init() in main(). */
int aio_maxio = 4;
/*
 * Do one AIO read on the global context `myctx`.
 *
 * Queues a single pread of `size` bytes at `offset` from `fd` into `iobuf`,
 * waits for one completion event and prints what it reports.
 *
 * Returns the io_submit() result if exactly one request was not queued,
 * otherwise the io_getevents() result.
 */
int do_aio_direct_read(int fd, char *iobuf, int offset, int size)
{
	struct iocb myiocb;
	struct iocb *iocbp = &myiocb;
	struct io_event e;
	int ret;

	io_prep_pread(&myiocb, fd, iobuf, size, offset);

	ret = io_submit(myctx, 1, &iocbp);
	if (ret != 1) {
		/*
		 * libaio hands back a negative errno value instead of
		 * setting errno, so perror() would print a stale message.
		 */
		if (ret < 0)
			fprintf(stderr, "io_submit: %s\n", strerror(-ret));
		else
			fprintf(stderr, "io_submit: queued %d requests, expected 1\n",
				ret);
		return ret;
	}

	ret = io_getevents(myctx, 1, 1, &e, NULL);
	/* Only look at the event if one was really returned; on error `e` is unset. */
	if (ret > 0) {
		struct iocb *iocb = e.obj;

		printf("AIO read of %lu at offset %lld returned %ld\n",
		       (unsigned long)iocb->u.c.nbytes,
		       (long long)iocb->u.c.offset,
		       (long)e.res);
	}
	return ret;
}
/*
 * Reproducer: create "test.aio.file", write one page to it with O_DIRECT,
 * set up an AIO context, do one AIO read of that page and return.
 *
 * Returns the result of do_aio_direct_read(); exits with status 1 if any
 * setup step fails.
 */
int main(int argc, char *argv[])
{
	char *filename;
	void *mem = NULL;
	ssize_t written;
	int fd;
	int err;

	(void)argc;
	(void)argv;

	filename = "test.aio.file";
	fd = open(filename, O_RDWR|O_DIRECT|O_CREAT|O_TRUNC, 0666);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	pagesize = getpagesize();
	/* Pass a real void ** rather than casting &iobuf. */
	err = posix_memalign(&mem, pagesize, pagesize);
	if (err) {
		fprintf(stderr, "Error allocating %d aligned bytes.\n",
			pagesize);
		exit(1);
	}
	iobuf = mem;
	/* posix_memalign() does not initialize the memory; write known data. */
	memset(iobuf, 0, pagesize);

	written = write(fd, iobuf, pagesize);
	if (written != pagesize) {
		/* Capture errno before fprintf() has a chance to change it. */
		int saved_errno = errno;

		fprintf(stderr, "Error ret = %d writing %d bytes.\n",
			(int)written, pagesize);
		if (written < 0)
			fprintf(stderr, "%s\n", strerror(saved_errno));
		exit(1);
	}

	memset(&myctx, 0, sizeof(myctx));
	err = io_queue_init(aio_maxio, &myctx);
	if (err < 0) {
		/* libaio returns a negative errno value here. */
		fprintf(stderr, "io_queue_init: %s\n", strerror(-err));
		exit(1);
	}

	err = do_aio_direct_read(fd, iobuf, 0, pagesize);
	close(fd);

	/*
	 * The AIO context is deliberately NOT released before returning:
	 * the reported panic is in the kernel's exit-time teardown of the
	 * context (exit_mm -> mmput -> exit_aio in the stack trace).
	 */
	printf("This will panic on ppc64\n");
	return err;
}
--------------------------
Daniel
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Looks good to me.
Acked-by: David Gibson <d...@au1.ibm.com>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
Daniel McNeil wrote on Tuesday, March 22, 2005 11:25 AM
> Here's a patch against 2.6.11 that fixes the problem.
> It changes is_hugepage_only_range() to take mm as an argument
> and then changes the places that call it to pass 'mm'.
> It includes a change for ia64 which has not been compiled.
Just a sanity check, tested the patch on ia64. Nothing blows up
and everything is working.
Here's a patch against 2.6.11 that fixes the problem.
It changes is_hugepage_only_range() to take mm as an argument
and then changes the places that call it to pass 'mm'.
It includes a change for ia64 which has not been compiled.
It applies against the latest bk with some offset.
Signed-off-by: Daniel McNeil <dan...@osdl.org>
diff -urp linux-2.6.11.orig/arch/ppc64/mm/hugetlbpage.c linux-2.6.11/arch/ppc64/mm/hugetlbpage.c
--- linux-2.6.11.orig/arch/ppc64/mm/hugetlbpage.c 2005-03-22 09:43:09.000000000 -0800
+++ linux-2.6.11/arch/ppc64/mm/hugetlbpage.c 2005-03-22 09:45:46.000000000 -0800
@@ -512,7 +512,7 @@ unsigned long arch_get_unmapped_area(str
vma = find_vma(mm, addr);
if (((TASK_SIZE - len) >= addr)
&& (!vma || (addr+len) <= vma->vm_start)
- && !is_hugepage_only_range(addr,len))
+ && !is_hugepage_only_range(mm, addr,len))
return addr;
}
start_addr = addr = mm->free_area_cache;
@@ -522,7 +522,7 @@ full_search:
while (TASK_SIZE - len >= addr) {
BUG_ON(vma && (addr >= vma->vm_end));
- if (touches_hugepage_low_range(addr, len)) {
+ if (touches_hugepage_low_range(mm, addr, len)) {
addr = ALIGN(addr+1, 1<<SID_SHIFT);
vma = find_vma(mm, addr);
continue;
@@ -583,7 +583,7 @@ arch_get_unmapped_area_topdown(struct fi
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start)
- && !is_hugepage_only_range(addr,len))
+ && !is_hugepage_only_range(mm, addr,len))
return addr;
}
@@ -596,7 +596,7 @@ try_again:
addr = (mm->free_area_cache - len) & PAGE_MASK;
do {
hugepage_recheck:
- if (touches_hugepage_low_range(addr, len)) {
+ if (touches_hugepage_low_range(mm, addr, len)) {
addr = (addr & ((~0) << SID_SHIFT)) - len;
goto hugepage_recheck;
} else if (touches_hugepage_high_range(addr, len)) {
diff -urp linux-2.6.11.orig/include/asm-ia64/page.h linux-2.6.11/include/asm-ia64/page.h
--- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/page.h 2005-03-21 16:58:54.000000000 -0800
@@ -137,7 +137,7 @@ typedef union ia64_va {
# define htlbpage_to_page(x) (((unsigned long) REGION_NUMBER(x) << 61) \
| (REGION_OFFSET(x) >> (HPAGE_SHIFT-PAGE_SHIFT)))
# define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
-# define is_hugepage_only_range(addr, len) \
+# define is_hugepage_only_range(mm, addr, len) \
(REGION_NUMBER(addr) == REGION_HPAGE && \
REGION_NUMBER((addr)+(len)) == REGION_HPAGE)
extern unsigned int hpage_shift;
diff -urp linux-2.6.11.orig/include/asm-ppc64/page.h linux-2.6.11/include/asm-ppc64/page.h
--- linux-2.6.11.orig/include/asm-ppc64/page.h 2005-03-01 23:37:30.000000000 -0800
+++ linux-2.6.11/include/asm-ppc64/page.h 2005-03-21 16:59:46.000000000 -0800
@@ -48,8 +48,8 @@
#define ARCH_HAS_HUGEPAGE_ONLY_RANGE
#define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
-#define touches_hugepage_low_range(addr, len) \
- (LOW_ESID_MASK((addr), (len)) & current->mm->context.htlb_segs)
+#define touches_hugepage_low_range(mm, addr, len) \
+ (LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs)
#define touches_hugepage_high_range(addr, len) \
(((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END))
@@ -61,9 +61,9 @@
#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \
&& ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr)))
-#define is_hugepage_only_range(addr, len) \
+#define is_hugepage_only_range(mm, addr, len) \
(touches_hugepage_high_range((addr), (len)) || \
- touches_hugepage_low_range((addr), (len)))
+ touches_hugepage_low_range((mm), (addr), (len)))
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#define in_hugepage_area(context, addr) \
diff -urp linux-2.6.11.orig/include/linux/hugetlb.h linux-2.6.11/include/linux/hugetlb.h
--- linux-2.6.11.orig/include/linux/hugetlb.h 2005-03-21 16:50:21.000000000 -0800
+++ linux-2.6.11/include/linux/hugetlb.h 2005-03-22 09:41:24.000000000 -0800
@@ -36,7 +36,7 @@ extern const unsigned long hugetlb_zero,
extern int sysctl_hugetlb_shm_group;
#ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
-#define is_hugepage_only_range(addr, len) 0
+#define is_hugepage_only_range(mm, addr, len) 0
#define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
#endif
@@ -71,7 +71,7 @@ static inline unsigned long hugetlb_tota
#define is_aligned_hugepage_range(addr, len) 0
#define prepare_hugepage_range(addr, len) (-EINVAL)
#define pmd_huge(x) 0
-#define is_hugepage_only_range(addr, len) 0
+#define is_hugepage_only_range(mm, addr, len) 0
#define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
#define alloc_huge_page() ({ NULL; })
#define free_huge_page(p) ({ (void)(p); BUG(); })
diff -urp linux-2.6.11.orig/mm/mmap.c linux-2.6.11/mm/mmap.c
--- linux-2.6.11.orig/mm/mmap.c 2005-03-21 17:00:35.000000000 -0800
+++ linux-2.6.11/mm/mmap.c 2005-03-21 17:01:20.000000000 -0800
@@ -1334,7 +1334,7 @@ get_unmapped_area(struct file *file, uns
* reserved hugepage range. For some archs like IA-64,
* there is a separate region for hugepages.
*/
- ret = is_hugepage_only_range(addr, len);
+ ret = is_hugepage_only_range(current->mm, addr, len);
}
if (ret)
return -EINVAL;
@@ -1707,7 +1707,7 @@ static void unmap_region(struct mm_struc
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- if (is_hugepage_only_range(start, end - start))
+ if (is_hugepage_only_range(mm, start, end - start))
hugetlb_free_pgtables(tlb, prev, start, end);
else
free_pgtables(tlb, prev, start, end);