[PATCH 10/15] perf: simplify __perf_event_sync

Peter Zijlstra

unread,

Nov 20, 2009, 4:30:02 PM11/20/09

to

perf-time-3.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:30:03 PM11/20/09

to

perf-event-4.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:02 PM11/20/09

to

perf-time-1.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:02 PM11/20/09

to

perf-sw-counting-state.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:02 PM11/20/09

to

perf-overflow.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:02 PM11/20/09

to

perf-time-6.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:02 PM11/20/09

to

perf-time-2.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:03 PM11/20/09

to

stephane_perf_events-fix_default_watermark_calculation.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:04 PM11/20/09

to

perf-event-3.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:03 PM11/20/09

to

perf-opt-single.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:03 PM11/20/09

to

perf-time-4.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:03 PM11/20/09

to

perf-time-7.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:02 PM11/20/09

to

perf-time-5.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:04 PM11/20/09

to

perf-time-0.patch

Peter Zijlstra

unread,

Nov 20, 2009, 4:40:04 PM11/20/09

to

perf-event-5.patch

Frederic Weisbecker

unread,

Nov 20, 2009, 5:10:02 PM11/20/09

to

On Fri, Nov 20, 2009 at 10:19:43PM +0100, Peter Zijlstra wrote:
> in-kernel perf users might wish to have custom actions on the sample
> interrupt.
>
> Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>

I haven't yet answered to your previous message about using such
callback but yes, that's exactly the type of callback we want for
events that need to dispatch events in several different ways.

I'll try to use it for the hw-breakpoints.

Thanks.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:01 AM11/21/09

to

Commit-ID: d6ff86cfb50a72df820e7e839836d55d245306fb
Gitweb: http://git.kernel.org/tip/d6ff86cfb50a72df820e7e839836d55d245306fb
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:46 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:36 +0100

perf: Optimize perf_event_task_ctx()

Remove a rcu_read_{,un}lock() pair and a few conditionals.

We can remove the rcu_read_lock() by increasing the scope of one
in the calling function.

We can do away with the system_state check if the machine still
boots after this patch (seems to be the case).

We can do away with the list_empty() check because the bare
list_for_each_entry_rcu() reduces to that now that we've removed
everything else.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---
kernel/perf_event.c | 7 +------
1 files changed, 1 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index cda17ac..2afb305 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3267,15 +3267,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
{
struct perf_event *event;

- if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
- return;
-
- rcu_read_lock();
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (perf_event_task_match(event))
perf_event_task_output(event, task_event);
}
- rcu_read_unlock();
}

static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3283,11 +3278,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx = task_event->task_ctx;

+ rcu_read_lock();
cpuctx = &get_cpu_var(perf_cpu_context);
perf_event_task_ctx(&cpuctx->ctx, task_event);
put_cpu_var(perf_cpu_context);

- rcu_read_lock();
if (!ctx)
ctx = rcu_dereference(task_event->task->perf_event_ctxp);
if (ctx)

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:01 AM11/21/09

to

Commit-ID: 0cff784ae41cc125368ae77f1c01328ae2fdc6b3
Gitweb: http://git.kernel.org/tip/0cff784ae41cc125368ae77f1c01328ae2fdc6b3
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:44 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:35 +0100

perf: Optimize some swcounter attr.sample_period==1 paths

Avoid the rather expensive perf_swevent_set_period() if we know
we have to sample every single event anyway.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 17 +++++++++++------
1 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 1dfb6cc..8e55b44 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3759,16 +3759,16 @@ again:
return nr;
}

-static void perf_swevent_overflow(struct perf_event *event,
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
int nmi, struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
int throttle = 0;
- u64 overflow;

data->period = event->hw.last_period;
- overflow = perf_swevent_set_period(event);
+ if (!overflow)
+ overflow = perf_swevent_set_period(event);

if (hwc->interrupts == MAX_INTERRUPTS)
return;
@@ -3801,14 +3801,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,

atomic64_add(nr, &event->count);

+ if (!regs)
+ return;
+
if (!hwc->sample_period)
return;

- if (!regs)
+ if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+ return perf_swevent_overflow(event, 1, nmi, data, regs);
+
+ if (atomic64_add_negative(nr, &hwc->period_left))
return;

- if (!atomic64_add_negative(nr, &hwc->period_left))
- perf_swevent_overflow(event, nmi, data, regs);
+ perf_swevent_overflow(event, 0, nmi, data, regs);
}

static int perf_swevent_is_counting(struct perf_event *event)

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:02 AM11/21/09

to

Commit-ID: 2b8988c9f7defe319cffe0cd362a7cd356c86f62
Gitweb: http://git.kernel.org/tip/2b8988c9f7defe319cffe0cd362a7cd356c86f62
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:54 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:39 +0100

perf: Fix time locking

Most sites updating ctx->time and event times do so under
ctx->lock, make sure they all do.

This was made possible by removing the __perf_event_read() call
from __perf_event_sync_stat(), which already had this lock
taken.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 028619d..fdfae88 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1526,8 +1526,11 @@ static void __perf_event_read(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;

+ spin_lock(&ctx->lock);
update_context_time(ctx);
update_event_times(event);
+ spin_unlock(&ctx->lock);
+
event->pmu->read(event);
}

@@ -1541,7 +1544,13 @@ static u64 perf_event_read(struct perf_event *event)
smp_call_function_single(event->oncpu,
__perf_event_read, event, 1);
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ struct perf_event_context *ctx = event->ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ update_context_time(ctx);
update_event_times(event);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}

return atomic64_read(&event->count);

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:03 AM11/21/09

to

Commit-ID: f6d9dd237da400effb265f3554c64413f8a3e7b4
Gitweb: http://git.kernel.org/tip/f6d9dd237da400effb265f3554c64413f8a3e7b4
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:48 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:37 +0100

perf: Optimize perf_event_mmap_ctx()

Remove a rcu_read_{,un}lock() pair and a few conditionals.

We can remove the rcu_read_lock() by increasing the scope of one
in the calling function.

We can do away with the system_state check if the machine still
boots after this patch (seems to be the case).

We can do away with the list_empty() check because the bare
list_for_each_entry_rcu() reduces to that now that we've removed
everything else.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>

Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 7 +------
1 files changed, 1 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 4deefaa..68fbf4f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3493,15 +3493,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,

{
struct perf_event *event;

- if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
- return;
-
- rcu_read_lock();
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {

if (perf_event_mmap_match(event, mmap_event))
perf_event_mmap_output(event, mmap_event);
}
- rcu_read_unlock();
}

static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3557,11 +3552,11 @@ got_name:

mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

+ rcu_read_lock();
cpuctx = &get_cpu_var(perf_cpu_context);

perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
put_cpu_var(perf_cpu_context);

- rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
* events ends up in.

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:01 AM11/21/09

to

Commit-ID: abf4868b8548cae18d4fe8bbfb4e207443be01be
Gitweb: http://git.kernel.org/tip/abf4868b8548cae18d4fe8bbfb4e207443be01be
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:49 +0100

Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:37 +0100

perf: Fix PERF_FORMAT_GROUP scale info

As Corey reported, the total_enabled and total_running times
could occasionally be 0, even though there were events counted.

It turns out this is because we record the times before reading
the counter while the latter updates the times.

This patch corrects that.

While looking at this code I found that there is a lot of
locking iffyness around, the following patches correct most of
that.

Reported-by: Corey Ashford <cjas...@linux.vnet.ibm.com>

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 52 ++++++++++++++++++++------------------------------
1 files changed, 21 insertions(+), 31 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 68fbf4f..9a18ff2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1784,30 +1784,15 @@ u64 perf_event_read_value(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

-static int perf_event_read_entry(struct perf_event *event,
- u64 read_format, char __user *buf)
-{
- int n = 0, count = 0;
- u64 values[2];
-
- values[n++] = perf_event_read_value(event);
- if (read_format & PERF_FORMAT_ID)
- values[n++] = primary_event_id(event);
-
- count = n * sizeof(u64);
-
- if (copy_to_user(buf, values, count))
- return -EFAULT;
-
- return count;
-}
-
static int perf_event_read_group(struct perf_event *event,
u64 read_format, char __user *buf)
{
struct perf_event *leader = event->group_leader, *sub;
- int n = 0, size = 0, err = -EFAULT;
- u64 values[3];
+ int n = 0, size = 0, ret = 0;
+ u64 values[5];
+ u64 count;
+
+ count = perf_event_read_value(leader);

values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
@@ -1818,28 +1803,33 @@ static int perf_event_read_group(struct perf_event *event,
values[n++] = leader->total_time_running +
atomic64_read(&leader->child_total_time_running);
}
+ values[n++] = count;
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(leader);

size = n * sizeof(u64);

if (copy_to_user(buf, values, size))
return -EFAULT;

- err = perf_event_read_entry(leader, read_format, buf + size);
- if (err < 0)
- return err;
-
- size += err;
+ ret += size;

list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- err = perf_event_read_entry(sub, read_format,
- buf + size);
- if (err < 0)
- return err;
+ n = 0;

- size += err;
+ values[n++] = perf_event_read_value(sub);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf + size, values, size))
+ return -EFAULT;
+
+ ret += size;
}

- return size;
+ return ret;
}

static int perf_event_read_one(struct perf_event *event,

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:02 AM11/21/09

to

Commit-ID: 58e5ad1de3d6ad931c84f0cc8ef0655c922f30ad
Gitweb: http://git.kernel.org/tip/58e5ad1de3d6ad931c84f0cc8ef0655c922f30ad
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:53 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:39 +0100

perf: Simplify __perf_event_read

cpuctx is always active, task context is always active for
current

the previous condition verifies that if its a task context its
for current, hence we can assume ctx->is_active.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 5 ++---
1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index af150bb..028619d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1526,10 +1526,9 @@ static void __perf_event_read(void *info)

if (ctx->task && cpuctx->task_ctx != ctx)
return;

- if (ctx->is_active)
- update_context_time(ctx);
- event->pmu->read(event);
+ update_context_time(ctx);
update_event_times(event);
+ event->pmu->read(event);

}

static u64 perf_event_read(struct perf_event *event)

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:03 AM11/21/09

to

Commit-ID: f6f83785222b0ee037f7be90731f62a649292b5e
Gitweb: http://git.kernel.org/tip/f6f83785222b0ee037f7be90731f62a649292b5e
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:51 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:38 +0100

perf: Optimize __perf_event_read()

Both callers actually have IRQs disabled, no need doing so
again.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 3 ---
1 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 65f4dab..e66f6c4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1517,7 +1517,6 @@ static void __perf_event_read(void *info)
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_event *event = info;

struct perf_event_context *ctx = event->ctx;

- unsigned long flags;

/*
* If this is a task context, we need to check whether it is
@@ -1529,12 +1528,10 @@ static void __perf_event_read(void *info)

if (ctx->task && cpuctx->task_ctx != ctx)
return;

- local_irq_save(flags);
if (ctx->is_active)
update_context_time(ctx);
event->pmu->read(event);
update_event_times(event);
- local_irq_restore(flags);

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:03 AM11/21/09

to

Commit-ID: f6595f3a9680c86b6332f881a7ae2cbbcfdc8619
Gitweb: http://git.kernel.org/tip/f6595f3a9680c86b6332f881a7ae2cbbcfdc8619
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:47 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:36 +0100

perf: Optimize perf_event_comm_ctx()

Remove a rcu_read_{,un}lock() pair and a few conditionals.

We can remove the rcu_read_lock() by increasing the scope of one
in the calling function.

We can do away with the system_state check if the machine still
boots after this patch (seems to be the case).

We can do away with the list_empty() check because the bare
list_for_each_entry_rcu() reduces to that now that we've removed
everything else.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>

Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 7 +------
1 files changed, 1 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2afb305..4deefaa 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3374,15 +3374,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,

{
struct perf_event *event;

- if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
- return;
-
- rcu_read_lock();
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {

if (perf_event_comm_match(event))
perf_event_comm_output(event, comm_event);
}
- rcu_read_unlock();
}

static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3401,11 +3396,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)

comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;

+ rcu_read_lock();
cpuctx = &get_cpu_var(perf_cpu_context);

perf_event_comm_ctx(&cpuctx->ctx, comm_event);

put_cpu_var(perf_cpu_context);

- rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
* events ends up in.

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:04 AM11/21/09

to

Commit-ID: 3dbebf15c5d3e265f751eec72c1538a00da4be27
Gitweb: http://git.kernel.org/tip/3dbebf15c5d3e265f751eec72c1538a00da4be27
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:52 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:39 +0100

perf: Simplify __perf_event_sync_stat

Removes constraints from __perf_event_read() by leaving it with
a single callsite; this callsite had ctx->lock held, the other
one does not.

Removes some superfluous code from __perf_event_sync_stat().

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 6 ++----
1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e66f6c4..af150bb 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1061,8 +1061,6 @@ static int context_equiv(struct perf_event_context *ctx1,
&& !ctx1->pin_count && !ctx2->pin_count;
}

-static void __perf_event_read(void *event);
-
static void __perf_event_sync_stat(struct perf_event *event,
struct perf_event *next_event)
{
@@ -1080,8 +1078,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
*/
switch (event->state) {
case PERF_EVENT_STATE_ACTIVE:
- __perf_event_read(event);
- break;
+ event->pmu->read(event);
+ /* fall-through */

case PERF_EVENT_STATE_INACTIVE:
update_event_times(event);

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:02 AM11/21/09

to

Commit-ID: 59ed446f792cc07d37b1536b9c4664d14e25e425
Gitweb: http://git.kernel.org/tip/59ed446f792cc07d37b1536b9c4664d14e25e425
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:55 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:40 +0100

perf: Fix event scaling for inherited counters

Properly account the full hierarchy of counters for both the
count (we already did so) and the scale times (new).

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

include/linux/perf_event.h | 3 +-
kernel/perf_event.c | 48 +++++++++++++++++++++++--------------------
2 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a430ac3..36fe89f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -782,7 +782,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
int cpu,
pid_t pid,
perf_callback_t callback);
-extern u64 perf_event_read_value(struct perf_event *event);
+extern u64 perf_event_read_value(struct perf_event *event,
+ u64 *enabled, u64 *running);

struct perf_sample_data {
u64 type;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fdfae88..80f40da 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1774,14 +1774,25 @@ static int perf_event_read_size(struct perf_event *event)
return size;
}

-u64 perf_event_read_value(struct perf_event *event)
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
u64 total = 0;

+ *enabled = 0;
+ *running = 0;
+
total += perf_event_read(event);
- list_for_each_entry(child, &event->child_list, child_list)
+ *enabled += event->total_time_enabled +
+ atomic64_read(&event->child_total_time_enabled);
+ *running += event->total_time_running +
+ atomic64_read(&event->child_total_time_running);
+
+ list_for_each_entry(child, &event->child_list, child_list) {
total += perf_event_read(child);
+ *enabled += child->total_time_enabled;
+ *running += child->total_time_running;
+ }

return total;
}
@@ -1793,19 +1804,15 @@ static int perf_event_read_group(struct perf_event *event,

struct perf_event *leader = event->group_leader, *sub;

int n = 0, size = 0, ret = 0;

u64 values[5];
- u64 count;
+ u64 count, enabled, running;

- count = perf_event_read_value(leader);
+ count = perf_event_read_value(leader, &enabled, &running);

values[n++] = 1 + leader->nr_siblings;

- if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
- values[n++] = leader->total_time_enabled +
- atomic64_read(&leader->child_total_time_enabled);
- }
- if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
- values[n++] = leader->total_time_running +
- atomic64_read(&leader->child_total_time_running);
- }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
values[n++] = count;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
@@ -1820,7 +1827,7 @@ static int perf_event_read_group(struct perf_event *event,
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
n = 0;

- values[n++] = perf_event_read_value(sub);
+ values[n++] = perf_event_read_value(sub, &enabled, &running);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);

@@ -1838,18 +1845,15 @@ static int perf_event_read_group(struct perf_event *event,

static int perf_event_read_one(struct perf_event *event,

u64 read_format, char __user *buf)

{
+ u64 enabled, running;
u64 values[4];
int n = 0;

- values[n++] = perf_event_read_value(event);

- if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
- values[n++] = event->total_time_enabled +
- atomic64_read(&event->child_total_time_enabled);
- }
- if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
- values[n++] = event->total_time_running +
- atomic64_read(&event->child_total_time_running);
- }
+ values[n++] = perf_event_read_value(event, &enabled, &running);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:03 AM11/21/09

to

Commit-ID: 453f19eea7dbad837425e9b07d84568d14898794
Gitweb: http://git.kernel.org/tip/453f19eea7dbad837425e9b07d84568d14898794
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:43 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:35 +0100

perf: Allow for custom overflow handlers

in-kernel perf users might wish to have custom actions on the
sample interrupt.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>

Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

include/linux/perf_event.h | 6 ++++++
kernel/perf_event.c | 8 +++++++-
2 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b5cdac0..a430ac3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -567,6 +567,8 @@ struct perf_pending_entry {

typedef void (*perf_callback_t)(struct perf_event *, void *);

+struct perf_sample_data;
+
/**
* struct perf_event - performance event kernel representation:
*/
@@ -658,6 +660,10 @@ struct perf_event {
struct pid_namespace *ns;
u64 id;

+ void (*overflow_handler)(struct perf_event *event,
+ int nmi, struct perf_sample_data *data,
+ struct pt_regs *regs);
+
#ifdef CONFIG_EVENT_PROFILE
struct event_filter *filter;
#endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3852e26..1dfb6cc 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3710,7 +3710,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
perf_event_disable(event);
}

- perf_event_output(event, nmi, data, regs);
+ if (event->overflow_handler)
+ event->overflow_handler(event, nmi, data, regs);
+ else
+ perf_event_output(event, nmi, data, regs);
+
return ret;
}

@@ -4836,6 +4840,8 @@ inherit_event(struct perf_event *parent_event,
if (parent_event->attr.freq)
child_event->hw.sample_period = parent_event->hw.sample_period;

+ child_event->overflow_handler = parent_event->overflow_handler;
+
/*
* Link it up in the child's context:
*/

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:04 AM11/21/09

to

Commit-ID: 6f10581aeaa5543a3b7a8c7a87a064375ec357f8
Gitweb: http://git.kernel.org/tip/6f10581aeaa5543a3b7a8c7a87a064375ec357f8
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:56 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:40 +0100

perf: Fix locking for PERF_FORMAT_GROUP

We should hold event->child_mutex when iterating the inherited
counters, we should hold ctx->mutex when iterating siblings.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 20 +++++++++++++-------
1 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 80f40da..3ede098 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1782,6 +1782,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
*enabled = 0;
*running = 0;

+ mutex_lock(&event->child_mutex);
total += perf_event_read(event);

*enabled += event->total_time_enabled +

atomic64_read(&event->child_total_time_enabled);
@@ -1793,6 +1794,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
+ mutex_unlock(&event->child_mutex);

return total;
}
@@ -1802,10 +1804,12 @@ static int perf_event_read_group(struct perf_event *event,

u64 read_format, char __user *buf)
{

struct perf_event *leader = event->group_leader, *sub;

- int n = 0, size = 0, ret = 0;
+ int n = 0, size = 0, ret = -EFAULT;
+ struct perf_event_context *ctx = leader->ctx;
u64 values[5];
u64 count, enabled, running;

+ mutex_lock(&ctx->mutex);

count = perf_event_read_value(leader, &enabled, &running);

values[n++] = 1 + leader->nr_siblings;

@@ -1820,9 +1824,9 @@ static int perf_event_read_group(struct perf_event *event,

size = n * sizeof(u64);

if (copy_to_user(buf, values, size))

- return -EFAULT;
+ goto unlock;

- ret += size;
+ ret = size;

list_for_each_entry(sub, &leader->sibling_list, group_entry) {
n = 0;

@@ -1833,11 +1837,15 @@ static int perf_event_read_group(struct perf_event *event,

size = n * sizeof(u64);

- if (copy_to_user(buf + size, values, size))
- return -EFAULT;

+ if (copy_to_user(buf + size, values, size)) {

+ ret = -EFAULT;
+ goto unlock;
+ }

ret += size;
}
+unlock:
+ mutex_unlock(&ctx->mutex);

return ret;
}
@@ -1884,12 +1892,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
return -ENOSPC;

WARN_ON_ONCE(event->ctx->parent_ctx);
- mutex_lock(&event->child_mutex);
if (read_format & PERF_FORMAT_GROUP)
ret = perf_event_read_group(event, read_format, buf);
else
ret = perf_event_read_one(event, read_format, buf);
- mutex_unlock(&event->child_mutex);

return ret;

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:03 AM11/21/09

to

Commit-ID: 02ffdbc866c8b1c8644601e9aa6155700eed4c91
Gitweb: http://git.kernel.org/tip/02ffdbc866c8b1c8644601e9aa6155700eed4c91
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:50 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:38 +0100

perf: Optimize perf_event_task_sched_out

Remove an update_context_time() call from the
perf_event_task_sched_out() path and into the branch its needed.

The call was both superfluous, because __perf_event_sched_out()
already does it, and wrong, because it was done without holding
ctx->lock.

Place it in perf_event_sync_stat(), which is the only place it
is needed and which does already hold ctx->lock.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9a18ff2..65f4dab 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1120,6 +1120,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
if (!ctx->nr_stat)
return;

+ update_context_time(ctx);
+
event = list_first_entry(&ctx->event_list,
struct perf_event, event_entry);

@@ -1163,8 +1165,6 @@ void perf_event_task_sched_out(struct task_struct *task,
if (likely(!ctx || !cpuctx->task_ctx))
return;

- update_context_time(ctx);
-
rcu_read_lock();
parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_event_ctxp;

tip-bot for Peter Zijlstra

unread,

Nov 21, 2009, 8:50:02 AM11/21/09

to

Commit-ID: 81520183878a8813c71c9372de28bb70913ba549
Gitweb: http://git.kernel.org/tip/81520183878a8813c71c9372de28bb70913ba549
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 20 Nov 2009 22:19:45 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Sat, 21 Nov 2009 14:11:35 +0100

perf: Optimize perf_swevent_ctx_event()

Remove a rcu_read_{,un}lock() pair and a few conditionals.

We can remove the rcu_read_lock() by increasing the scope of one
in the calling function.

We can do away with the system_state check if the machine still
boots after this patch (seems to be the case).

We can do away with the list_empty() check because the bare
list_for_each_entry_rcu() reduces to that now that we've removed
everything else.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>

Cc: Paul Mackerras <pau...@samba.org>
LKML-Reference: <200911202125...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/perf_event.c | 7 +------
1 files changed, 1 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8e55b44..cda17ac 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3886,15 +3886,10 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,

{
struct perf_event *event;

- if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
- return;
-
- rcu_read_lock();
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {

if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_add(event, nr, nmi, data, regs);
}
- rcu_read_unlock();
}

static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
@@ -3926,9 +3921,9 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
(*recursion)++;
barrier();

+ rcu_read_lock();
perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
nr, nmi, data, regs);

- rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
* events ends up in.

Paul Mackerras

unread,

Nov 23, 2009, 1:00:01 AM11/23/09

to

Peter Zijlstra writes:

> We can do away with the system_state check if the machine still boots
> after this patch (seems to be the case).

I have a recollection (possible faulty) that the problem we can get
into if we don't have this check is that if we take a bad page fault
in the kernel (e.g. NULL dereference) early in boot before the perf
cpu context has been initialized, we then get another NULL dereference
because the pointers in ctx->event_list are NULL, and recurse to
death.

So that check was possibly more about debugging than correctness.
Possibly also the x86 do_page_fault() is different enough from the
powerpc one that the problem can't occur on x86.

Paul.

Peter Zijlstra

unread,

Nov 23, 2009, 2:40:02 AM11/23/09

to

On Mon, 2009-11-23 at 16:50 +1100, Paul Mackerras wrote:
> Peter Zijlstra writes:
>
> > We can do away with the system_state check if the machine still boots
> > after this patch (seems to be the case).
>
> I have a recollection (possible faulty) that the problem we can get
> into if we don't have this check is that if we take a bad page fault
> in the kernel (e.g. NULL dereference) early in boot before the perf
> cpu context has been initialized, we then get another NULL dereference
> because the pointers in ctx->event_list are NULL, and recurse to
> death.
>
> So that check was possibly more about debugging than correctness.
> Possibly also the x86 do_page_fault() is different enough from the
> powerpc one that the problem can't occur on x86.

Right, I remembered there was _something_ we added them for, but
couldn't for the live of me remember what.

Hmm, maybe we can initialize all the recursion variables to 1, that
should avoid us ever entering into the swcounter code until we reset
them.

Peter Zijlstra

unread,

Nov 23, 2009, 3:40:03 AM11/23/09

to

On Mon, 2009-11-23 at 08:31 +0100, Peter Zijlstra wrote:
> On Mon, 2009-11-23 at 16:50 +1100, Paul Mackerras wrote:
> > Peter Zijlstra writes:
> >
> > > We can do away with the system_state check if the machine still boots
> > > after this patch (seems to be the case).
> >
> > I have a recollection (possible faulty) that the problem we can get
> > into if we don't have this check is that if we take a bad page fault
> > in the kernel (e.g. NULL dereference) early in boot before the perf
> > cpu context has been initialized, we then get another NULL dereference
> > because the pointers in ctx->event_list are NULL, and recurse to
> > death.
> >
> > So that check was possibly more about debugging than correctness.
> > Possibly also the x86 do_page_fault() is different enough from the
> > powerpc one that the problem can't occur on x86.
>
> Right, I remembered there was _something_ we added them for, but
> couldn't for the live of me remember what.
>
> Hmm, maybe we can initialize all the recursion variables to 1, that
> should avoid us ever entering into the swcounter code until we reset
> them.

I think the below patch fixed that..

---

commit f29ac756a40d0f1bb07d682ea521e7b666ff06d5
Author: Peter Zijlstra <a.p.zi...@chello.nl>
Date: Fri Jun 19 18:27:26 2009 +0200

perf_counter: Optimize perf_swcounter_event()

Similar to tracepoints, use an enable variable to reduce
overhead when unused.

Only look for a counter of a particular event type when we know
there is at least one in the system.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
LKML-Reference: <new-submission>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Arnaldo Carvalho de Melo <ac...@redhat.com>
Signed-off-by: Ingo Molnar <mi...@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 89698d8..e7213e4 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -669,7 +669,16 @@ static inline int is_software_counter(struct perf_counter *counter)
(counter->attr.type != PERF_TYPE_HW_CACHE);
}

-extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
+extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
+
+extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
+
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+{
+ if (atomic_read(&perf_swcounter_enabled[event]))
+ __perf_swcounter_event(event, nr, nmi, regs, addr);
+}

extern void __perf_counter_mmap(struct vm_area_struct *vma);

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1a933a2..7515c76 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3317,8 +3317,8 @@ out:
put_cpu_var(perf_cpu_context);
}

-void
-perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+void __perf_swcounter_event(u32 event, u64 nr, int nmi,
+ struct pt_regs *regs, u64 addr)
{
struct perf_sample_data data = {
.regs = regs,
@@ -3509,9 +3509,19 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
}
#endif

+atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_counter_destroy(struct perf_counter *counter)
+{
+ u64 event = counter->attr.config;
+
+ atomic_dec(&perf_swcounter_enabled[event]);
+}
+
static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
{
const struct pmu *pmu = NULL;
+ u64 event = counter->attr.config;

/*
* Software counters (currently) can't in general distinguish
@@ -3520,7 +3530,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
* to be kernel events, and page faults are never hypervisor
* events.
*/
- switch (counter->attr.config) {
+ switch (event) {
case PERF_COUNT_SW_CPU_CLOCK:
pmu = &perf_ops_cpu_clock;

@@ -3541,6 +3551,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
case PERF_COUNT_SW_CONTEXT_SWITCHES:
case PERF_COUNT_SW_CPU_MIGRATIONS:
+ atomic_inc(&perf_swcounter_enabled[event]);
+ counter->destroy = sw_perf_counter_destroy;
pmu = &perf_ops_generic;
break;

[PATCH 10/15] perf: simplify __perf_event_sync_stat

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Peter Zijlstra

Frederic Weisbecker

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

tip-bot for Peter Zijlstra

Paul Mackerras

Peter Zijlstra

Peter Zijlstra