Fix for vunmap, which happens to be one of them, is included.
Note that some false-positives might come up when call_rcu() is called on
uninitialized struct rcu_head. The solution is, surprisingly enough, to
initialize them.
Thanks,
Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Update: init at structure allocation time rather than just before
calling call_rcu(). Ensures we do not void the debug race test.
Signed-off-by: Mathieu Desnoyers <mathieu....@polymtl.ca>
CC: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>
CC: mi...@elte.hu
CC: ak...@linux-foundation.org
---
kernel/fork.c | 1 +
kernel/pid.c | 1 +
kernel/rcupdate.c | 1 +
3 files changed, 3 insertions(+)
Index: linux-2.6-lttng/kernel/rcupdate.c
===================================================================
--- linux-2.6-lttng.orig/kernel/rcupdate.c 2009-10-06 10:34:25.000000000 -0400
+++ linux-2.6-lttng/kernel/rcupdate.c 2009-10-06 11:17:16.000000000 -0400
@@ -90,6 +90,7 @@ void synchronize_rcu(void)
if (rcu_blocking_is_gp())
return;
+ INIT_RCU_HEAD(&rcu.head);
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu(&rcu.head, wakeme_after_rcu);
Index: linux-2.6-lttng/kernel/pid.c
===================================================================
--- linux-2.6-lttng.orig/kernel/pid.c 2009-10-06 10:34:25.000000000 -0400
+++ linux-2.6-lttng/kernel/pid.c 2009-10-06 11:12:46.000000000 -0400
@@ -264,6 +264,7 @@ struct pid *alloc_pid(struct pid_namespa
get_pid_ns(ns);
pid->level = ns->level;
+ INIT_RCU_HEAD(&pid->rcu);
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
Index: linux-2.6-lttng/kernel/fork.c
===================================================================
--- linux-2.6-lttng.orig/kernel/fork.c 2009-10-06 11:11:10.000000000 -0400
+++ linux-2.6-lttng/kernel/fork.c 2009-10-06 11:11:54.000000000 -0400
@@ -1012,6 +1012,7 @@ static struct task_struct *copy_process(
p->rcu_read_lock_nesting = 0;
p->rcu_flipctr_idx = 0;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
+ INIT_RCU_HEAD(&p->rcu);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
I assumed that call_rcu in vunmap() is only used to postpone kfree()
from the hot path, but I ask for comfirmation/infirmation about that. I
fear delayed reclamation might be there for a reason. Are there rcu read
sides depending on this behavior ?
If yes, then this patch is wrong, and we could change it for
synchronize_rcu();
kfree(....);
Mathieu
> Impact:
> (-) slight slowdown on vunmap
> (+) stop messing up rcu callback lists ;)
>
> Caught it with DEBUG_RCU_HEAD, running LTTng armall/disarmall in loops.
> Immediate values (with breakpoint-ipi scheme) are still using vunmap.
>
> ------------[ cut here ]------------
> WARNING: at kernel/rcutree.c:1199 __call_rcu+0x181/0x190()
> Hardware name: X7DAL
> Modules linked in: loop ltt_statedump ltt_userspace_event ipc_tr]
> Pid: 4527, comm: ltt-armall Not tainted 2.6.30.9-trace #30
> Call Trace:
> [<ffffffff8027b181>] ? __call_rcu+0x181/0x190
> [<ffffffff8027b181>] ? __call_rcu+0x181/0x190
> [<ffffffff8023a6a9>] ? warn_slowpath_common+0x79/0xd0
> [<ffffffff802b4890>] ? rcu_free_va+0x0/0x10
> [<ffffffff8027b181>] ? __call_rcu+0x181/0x190
> [<ffffffff802b5298>] ? __purge_vmap_area_lazy+0x1a8/0x1e0
> [<ffffffff802b59d4>] ? free_unmap_vmap_area_noflush+0x74/0x80
> [<ffffffff802b5a0e>] ? remove_vm_area+0x2e/0x80
> [<ffffffff802b5b35>] ? __vunmap+0x45/0xf0
> [<ffffffffa002e826>] ? ltt_statedump_start+0x7b6/0x820 [ltt_sta]
> [<ffffffff8068978a>] ? arch_imv_update+0x16a/0x2f0
> [<ffffffff8027dd13>] ? imv_update_range+0x53/0xa0
> [<ffffffff8026235b>] ? _module_imv_update+0x4b/0x60
> [<ffffffff80262385>] ? module_imv_update+0x15/0x30
> [<ffffffff80280049>] ? marker_probe_register+0x149/0xb90
> [<ffffffff8040c8e0>] ? ltt_vtrace+0x0/0x8c0
> [<ffffffff80409330>] ? ltt_marker_connect+0xd0/0x150
> [<ffffffff8040f8dc>] ? marker_enable_write+0xec/0x120
> [<ffffffff802c6cb8>] ? __dentry_open+0x268/0x350
> [<ffffffff80280b2c>] ? marker_probe_cb+0x9c/0x170
> [<ffffffff80280b2c>] ? marker_probe_cb+0x9c/0x170
> [<ffffffff802c973b>] ? vfs_write+0xcb/0x170
> [<ffffffff802c9984>] ? sys_write+0x64/0x130
> [<ffffffff8020beeb>] ? system_call_fastpath+0x16/0x1b
> ---[ end trace ef92443e716fa199 ]---
> ------------[ cut here ]------------
>
> Signed-off-by: Mathieu Desnoyers <mathieu....@polymtl.ca>
> CC: c...@linux-foundation.org
> CC: mi...@elte.hu
> CC: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>
> CC: linu...@kvack.org
> CC: ak...@linux-foundation.org
> ---
> mm/vmalloc.c | 18 ++----------------
> 1 file changed, 2 insertions(+), 16 deletions(-)
>
> Index: linux-2.6-lttng/mm/vmalloc.c
> ===================================================================
> --- linux-2.6-lttng.orig/mm/vmalloc.c 2009-10-06 09:37:04.000000000 -0400
> +++ linux-2.6-lttng/mm/vmalloc.c 2009-10-06 09:37:56.000000000 -0400
> @@ -417,13 +417,6 @@ overflow:
> return va;
> }
>
> -static void rcu_free_va(struct rcu_head *head)
> -{
> - struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
> -
> - kfree(va);
> -}
> -
> static void __free_vmap_area(struct vmap_area *va)
> {
> BUG_ON(RB_EMPTY_NODE(&va->rb_node));
> @@ -431,7 +424,7 @@ static void __free_vmap_area(struct vmap
> RB_CLEAR_NODE(&va->rb_node);
> list_del_rcu(&va->list);
>
> - call_rcu(&va->rcu_head, rcu_free_va);
> + kfree(va);
> }
>
> /*
> @@ -757,13 +750,6 @@ static struct vmap_block *new_vmap_block
> return vb;
> }
>
> -static void rcu_free_vb(struct rcu_head *head)
> -{
> - struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
> -
> - kfree(vb);
> -}
> -
> static void free_vmap_block(struct vmap_block *vb)
> {
> struct vmap_block *tmp;
> @@ -778,7 +764,7 @@ static void free_vmap_block(struct vmap_
> BUG_ON(tmp != vb);
>
> free_unmap_vmap_area_noflush(vb->va);
> - call_rcu(&vb->rcu_head, rcu_free_vb);
> + kfree(vb);
> }
>
> static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
I see :)
> --- linux-2.6-lttng.orig/include/linux/rcupdate.h 2009-10-06 10:35:15.000000000 -0400
> +++ linux-2.6-lttng/include/linux/rcupdate.h 2009-10-06 10:35:18.000000000 -0400
> @@ -45,6 +45,8 @@
> * struct rcu_head - callback structure for use with RCU
> * @next: next update requests in a list
> * @func: actual update function to call after the grace period.
> + *
> + * Debug mode assumes func pointer value is word-aligned.
Ouch, I guess you never tried CONFIG_CC_OPTIMIZE_FOR_SIZE=y ?
random extract from "nm -v vmlinux"
c0415e58 T in4_pton
c0415f66 T in6_pton
c04161f0 T in_aton
c0416240 T net_ratelimit
c0416250 t linkwatch_add_event
c0416281 t linkwatch_schedule_work
c0416301 T linkwatch_fire_event
c0416368 t __linkwatch_run_queue
c04164d2 t linkwatch_event
c04164f4 T linkwatch_run_queue
c0416500 T sk_chk_filter
c0416703 t sk_filter_rcu_release
c041671d T sk_detach_filter
c041676a T sk_attach_filter
c0416862 T sk_run_filter
c0416f5d T sk_filter
c0416fc4 t __flow_cache_shrink
gcc -v
Using built-in specs.
Target: i686-pc-linux-gnu
Configured with: ../gcc-4.4.1/configure --enable-languages=c,c++ --prefix=/usr
Thread model: posix
gcc version 4.4.1 (GCC)
I don't know if there is an easy fix for __pad_to_align_refcnt ?
> > --- linux-2.6-lttng.orig/include/linux/rcupdate.h 2009-10-06 10:35:15.000000000 -0400
> > +++ linux-2.6-lttng/include/linux/rcupdate.h 2009-10-06 10:35:18.000000000 -0400
> > @@ -45,6 +45,8 @@
> > * struct rcu_head - callback structure for use with RCU
> > * @next: next update requests in a list
> > * @func: actual update function to call after the grace period.
> > + *
> > + * Debug mode assumes func pointer value is word-aligned.
>
> Ouch, I guess you never tried CONFIG_CC_OPTIMIZE_FOR_SIZE=y ?
>
oops. Well, simple fix would be : depends on !CC_OPTIMIZE_FOR_SIZE
It's a debug option after all... but I'd prefer growing the structure
with a supplementary field if possible.
Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
This check was added to make sure some tbench regression was not added if
dst->__refcnt was moved around, I am sure you dont care about tbench being 10% slower,
do you ?
diff --git a/include/net/dst.h b/include/net/dst.h
index 5a900dd..b8fba74 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -154,7 +154,9 @@ static inline void dst_hold(struct dst_entry * dst)
* If your kernel compilation stops here, please check
* __pad_to_align_refcnt declaration in struct dst_entry
*/
+#if !defined(DEBUG_RCU_HEAD)
BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
+#endif
atomic_inc(&dst->__refcnt);
Indeed, I absolutely don't care when I'm doing debugging :)
Will merge, and add a "struct rcu_head *debug" field in struct rcu_head.
Thanks !
Mathieu
>
> diff --git a/include/net/dst.h b/include/net/dst.h
> index 5a900dd..b8fba74 100644
> --- a/include/net/dst.h
> +++ b/include/net/dst.h
> @@ -154,7 +154,9 @@ static inline void dst_hold(struct dst_entry * dst)
> * If your kernel compilation stops here, please check
> * __pad_to_align_refcnt declaration in struct dst_entry
> */
> +#if !defined(DEBUG_RCU_HEAD)
> BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
> +#endif
> atomic_inc(&dst->__refcnt);
> }
>
>
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
> Simplest fix: directly kfree the data structure rather than doing it lazily.
The delay is necessary as far as I can tell for performance reasons. But
Nick was the last one fiddling around with the subsystem as far as I
remember. CCing him. May be he has a minute to think about a fix that
preserved performance.
Thanks,
More info on the problem:
I added a check in alloc_vmap_area() at:
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
WARN_ON(va->rcu_head.debug == LIST_POISON1);
(memory allocation debugging is off in my config)
I used this along with my new call rcu debug patch to check if memory is
reallocated before the callback (doing kfree) is executed. It triggers
this:
WARNING: at mm/vmalloc.c:343 alloc_vmap_area+0x305/0x340()
Hardware name: X7DAL
Modules linked in: loop ltt_statedump ltt_userspace_event ipc_tr]
Pid: 5584, comm: ltt-disarmall Not tainted 2.6.30.9-trace #41
Call Trace:
[<ffffffff802b8665>] ? alloc_vmap_area+0x305/0x340
[<ffffffff802b8665>] ? alloc_vmap_area+0x305/0x340
[<ffffffff8023cf29>] ? warn_slowpath_common+0x79/0xd0
[<ffffffff802b8665>] ? alloc_vmap_area+0x305/0x340
[<ffffffff80237670>] ? default_wake_function+0x0/0x10
[<ffffffff802b8769>] ? __get_vm_area_node+0xc9/0x1d0
[<ffffffff805ae965>] ? sys_getsockopt+0x85/0x160
[<ffffffff8040f160>] ? ltt_vtrace+0x0/0x8d0
[<ffffffff802b88dd>] ? get_vm_area_caller+0x2d/0x40
[<ffffffff8068bc3e>] ? arch_imv_update+0x10e/0x2f0
[<ffffffff802b93aa>] ? vmap+0x4a/0x80
[<ffffffff8068bc3e>] ? arch_imv_update+0x10e/0x2f0
[<ffffffff80283e6d>] ? get_tracepoint+0x25d/0x290
[<ffffffff80280b93>] ? imv_update_range+0x53/0xa0
[<ffffffff80284821>] ? tracepoint_update_probes+0x21/0x30
[<ffffffff802848b3>] ? tracepoint_probe_update_all+0x83/0x100
[<ffffffff80282ac1>] ? marker_update_probes+0x21/0x40
[<ffffffff8040f160>] ? ltt_vtrace+0x0/0x8d0
[<ffffffff80282d5d>] ? marker_probe_unregister+0xad/0x130
[<ffffffff8040b9dc>] ? ltt_marker_disconnect+0xdc/0x120
[<ffffffff804121d2>] ? marker_enable_write+0x112/0x120
[<ffffffff80688dd0>] ? _spin_unlock+0x10/0x30
[<ffffffff802e6b36>] ? mnt_drop_write+0x76/0x180
[<ffffffff802d96dd>] ? do_filp_open+0x2cd/0xa00
[<ffffffff804377c1>] ? tty_write+0x221/0x270
[<ffffffff802cc6cb>] ? vfs_write+0xcb/0x170
[<ffffffff802cc914>] ? sys_write+0x64/0x130
[<ffffffff8020beab>] ? system_call_fastpath+0x16/0x1b
---[ end trace 7ef506680d7a9e26 ]---
Could it be that:
static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
rb_erase(&va->rb_node, &vmap_area_root);
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
call_rcu(&va->rcu_head, rcu_free_va);
}
(especially clearing from the rb tree and va list)
allows reallocation of the node by kmalloc_node before its reclamation
(only done later by rcu_free_va) ?
Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Very cool, Mathieu!!!
I do like adding the debug-only field to struct rcu_head. Lots of
additional places need initialization, or am I missing a trick here?
Thanx, Paul
In my case, only 2 sites in the process management and 1 site in the now
deprecated marker.c complained about not being initialized.
I can guess we will find others though. Please see the v2 of my patch,
which adds the supplementary "debug" field rather than depending on
word-aligned pointers. I volountarily complain about the .debug pointer
being non-NULL when we find an uninitialized site. I could be more
specific and complain "differently" if the pointer happens to be random
vs if it's LIST_POISON1.
Thanks,
Mathieu
> Thanx, Paul
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68