On Tue, Mar 07, 2017 at 05:51:32PM +0100, Oleg Nesterov wrote:
> I am looking at perf_event_init_task() too and I can't understand the
> error handling...
>
> perf_event_init_context() can return success even if inherit_task_group() in
> the first list_for_each_entry(pinned_groups) fails, "ret" will be overwritten
> by the 2nd list_for_each_entry(flexible_groups) loop. "inherited_all" should
> be cleared, still this looks confusing at least.
Yes, this looks buggy. But I cannot explain how that would result in the
observed use-after-free. If we 'loose' an error this way,
perf_event_init_context() will not fail, and hence we'll not hit that
perf_event_free_task() path.
The task would continue living with an incorrectly copied context, and
counter behaviour would be affected.
> inherit_event() returns NULL under is_orphaned_event() check, not ERR_PTR().
> Is it correct?
Yes. This is all a tad tricky, but it seems to be correct.
By returning NULL, not an error, we affect the silent discard of
orphaned events. This is correct, because otherwise
perf_event_release_kernel() would have come by and explicitly discarded
those events for us anyway.
My current patch is below; won't do much good I suspect..
---
kernel/events/core.c | 55 +++++++++++++++++++++++++++++++++++-----------------
1 file changed, 37 insertions(+), 18 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a17ed56c8ce1..3cd907e00377 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4256,7 +4256,7 @@ int perf_event_release_kernel(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
/*
- * Mark this even as STATE_DEAD, there is no external reference to it
+ * Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
*
* Anybody acquiring event->child_mutex after the below loop _must_
@@ -10417,21 +10417,9 @@ void perf_event_free_task(struct task_struct *task)
continue;
mutex_lock(&ctx->mutex);
-again:
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
- group_entry)
- perf_free_event(event, ctx);
-
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
- group_entry)
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
perf_free_event(event, ctx);
-
- if (!list_empty(&ctx->pinned_groups) ||
- !list_empty(&ctx->flexible_groups))
- goto again;
-
mutex_unlock(&ctx->mutex);
-
put_ctx(ctx);
}
}
@@ -10469,7 +10457,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
}
/*
- * inherit a event from parent task to child task:
+ * Inherit a event from parent task to child task.
+ *
+ * Returns:
+ * - valid pointer on success
+ * - NULL for orphaned events
+ * - IS_ERR() on error
*/
static struct perf_event *
inherit_event(struct perf_event *parent_event,
@@ -10563,6 +10556,16 @@ inherit_event(struct perf_event *parent_event,
return child_event;
}
+/*
+ * Inherits an event group.
+ *
+ * This will quietly suppress orphaned events; !inherit_event() is not an error.
+ * This matches with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int inherit_group(struct perf_event *parent_event,
struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10577,6 +10580,11 @@ static int inherit_group(struct perf_event *parent_event,
child, NULL, child_ctx);
if (IS_ERR(leader))
return PTR_ERR(leader);
+ /*
+ * @leader can be NULL here because of is_orphaned_event(). In this
+ * case inherit_event() will create individual events, similar to what
+ * perf_group_detach() would do anyway.
+ */
list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
@@ -10586,6 +10594,17 @@ static int inherit_group(struct perf_event *parent_event,
return 0;
}
+/*
+ * Creates the child task context and tries to inherit the event-group.
+ *
+ * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
+ * inherited_all set when we 'fail' to inherit an orphaned event; this is
+ * consistent with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10608,7 +10627,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
-
child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -10670,7 +10688,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
/*
@@ -10686,7 +10704,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
@@ -10714,6 +10732,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
}
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+out_unlock:
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);