
[GIT PULL] perf fixes


Ingo Molnar

Mar 26, 2010, 11:20:02 AM
Linus,

Please pull the latest perf-fixes-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git perf-fixes-for-linus

Thanks,

Ingo

------------------>
Arnaldo Carvalho de Melo (2):
perf top: Improve the autosizing of column lengths
perf top: Add missing initialization to zero

Masami Hiramatsu (4):
perf probe: Fix probe_point buffer overrun
perf probe: Fix need_dwarf flag if lazy matching is used
perf probe: Fix offset to allow signed value
perf probe: Use original address instead of CU-based address

Paul Mackerras (1):
powerpc/perf_events: Fix call-graph recording, add perf_arch_fetch_caller_regs


arch/powerpc/include/asm/asm-compat.h | 2 ++
arch/powerpc/kernel/misc.S | 28 ++++++++++++++++++++++++++++
tools/perf/builtin-probe.c | 1 -
tools/perf/builtin-top.c | 13 +++++++++----
tools/perf/util/probe-event.c | 2 +-
tools/perf/util/probe-finder.c | 18 +++++++-----------
tools/perf/util/probe-finder.h | 1 -
tools/perf/util/symbol.c | 18 +++++++++++++-----
tools/perf/util/symbol.h | 3 ++-
9 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h
index c1b475a..a9b91ed 100644
--- a/arch/powerpc/include/asm/asm-compat.h
+++ b/arch/powerpc/include/asm/asm-compat.h
@@ -28,6 +28,7 @@
#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh)
#define PPC_STLCX stringify_in_c(stdcx.)
#define PPC_CNTLZL stringify_in_c(cntlzd)
+#define PPC_LR_STKOFF 16

/* Move to CR, single-entry optimized version. Only available
* on POWER4 and later.
@@ -51,6 +52,7 @@
#define PPC_STLCX stringify_in_c(stwcx.)
#define PPC_CNTLZL stringify_in_c(cntlzw)
#define PPC_MTOCRF stringify_in_c(mtcrf)
+#define PPC_LR_STKOFF 4

#endif

diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 2d29752..b485a87 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -127,3 +127,31 @@ _GLOBAL(__setup_cpu_power7)
_GLOBAL(__restore_cpu_power7)
/* place holder */
blr
+
+#ifdef CONFIG_EVENT_TRACING
+/*
+ * Get a minimal set of registers for our caller's nth caller.
+ * r3 = regs pointer, r5 = n.
+ *
+ * We only get R1 (stack pointer), NIP (next instruction pointer)
+ * and LR (link register). These are all we can get in the
+ * general case without doing complicated stack unwinding, but
+ * fortunately they are enough to do a stack backtrace, which
+ * is all we need them for.
+ */
+_GLOBAL(perf_arch_fetch_caller_regs)
+ mr r6,r1
+ cmpwi r5,0
+ mflr r4
+ ble 2f
+ mtctr r5
+1: PPC_LL r6,0(r6)
+ bdnz 1b
+ PPC_LL r4,PPC_LR_STKOFF(r6)
+2: PPC_LL r7,0(r6)
+ PPC_LL r7,PPC_LR_STKOFF(r7)
+ PPC_STL r6,GPR1-STACK_FRAME_OVERHEAD(r3)
+ PPC_STL r4,_NIP-STACK_FRAME_OVERHEAD(r3)
+ PPC_STL r7,_LINK-STACK_FRAME_OVERHEAD(r3)
+ blr
+#endif /* CONFIG_EVENT_TRACING */
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index c30a335..152d6c9 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -47,7 +47,6 @@
#include "util/probe-event.h"

#define MAX_PATH_LEN 256
-#define MAX_PROBES 128

/* Session management structure */
static struct {
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 0b719e3..1f52932 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -455,7 +455,7 @@ static void print_sym_table(void)
struct sym_entry *syme, *n;
struct rb_root tmp = RB_ROOT;
struct rb_node *nd;
- int sym_width = 0, dso_width = 0, max_dso_width;
+ int sym_width = 0, dso_width = 0, dso_short_width = 0;
const int win_width = winsize.ws_col - 1;

samples = userspace_samples = 0;
@@ -545,15 +545,20 @@ static void print_sym_table(void)
if (syme->map->dso->long_name_len > dso_width)
dso_width = syme->map->dso->long_name_len;

+ if (syme->map->dso->short_name_len > dso_short_width)
+ dso_short_width = syme->map->dso->short_name_len;
+
if (syme->name_len > sym_width)
sym_width = syme->name_len;
}

printed = 0;

- max_dso_width = winsize.ws_col - sym_width - 29;
- if (dso_width > max_dso_width)
- dso_width = max_dso_width;
+ if (sym_width + dso_width > winsize.ws_col - 29) {
+ dso_width = dso_short_width;
+ if (sym_width + dso_width > winsize.ws_col - 29)
+ sym_width = winsize.ws_col - dso_width - 29;
+ }
putchar('\n');
if (nr_counters == 1)
printf(" samples pcnt");
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 53181db..7c004b6 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -242,7 +242,7 @@ void parse_perf_probe_event(const char *str, struct probe_point *pp,

/* Parse probe point */
parse_perf_probe_probepoint(argv[0], pp);
- if (pp->file || pp->line)
+ if (pp->file || pp->line || pp->lazy_line)
*need_dwarf = true;

/* Copy arguments and ensure return probe has no C argument */
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 1e6c65e..c171a24 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -333,8 +333,8 @@ static void show_location(Dwarf_Op *op, struct probe_finder *pf)
die("%u exceeds max register number.", regn);

if (deref)
- ret = snprintf(pf->buf, pf->len, " %s=+%ju(%s)",
- pf->var, (uintmax_t)offs, regs);
+ ret = snprintf(pf->buf, pf->len, " %s=%+jd(%s)",
+ pf->var, (intmax_t)offs, regs);
else
ret = snprintf(pf->buf, pf->len, " %s=%s", pf->var, regs);
DIE_IF(ret < 0);
@@ -352,8 +352,7 @@ static void show_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL)
goto error;
/* TODO: handle more than 1 exprs */
- ret = dwarf_getlocation_addr(&attr, (pf->addr - pf->cu_base),
- &expr, &nexpr, 1);
+ ret = dwarf_getlocation_addr(&attr, pf->addr, &expr, &nexpr, 1);
if (ret <= 0 || nexpr == 0)
goto error;

@@ -437,8 +436,7 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)

/* Get the frame base attribute/ops */
dwarf_attr(sp_die, DW_AT_frame_base, &fb_attr);
- ret = dwarf_getlocation_addr(&fb_attr, (pf->addr - pf->cu_base),
- &pf->fb_ops, &nops, 1);
+ ret = dwarf_getlocation_addr(&fb_attr, pf->addr, &pf->fb_ops, &nops, 1);
if (ret <= 0 || nops == 0)
pf->fb_ops = NULL;

@@ -455,6 +453,9 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
/* *pf->fb_ops will be cached in libdw. Don't free it. */
pf->fb_ops = NULL;

+ if (pp->found == MAX_PROBES)
+ die("Too many( > %d) probe point found.\n", MAX_PROBES);
+
pp->probes[pp->found] = strdup(tmp);
pp->found++;
}
@@ -641,7 +642,6 @@ static void find_probe_point_by_func(struct probe_finder *pf)
int find_probe_point(int fd, struct probe_point *pp)
{
struct probe_finder pf = {.pp = pp};
- int ret;
Dwarf_Off off, noff;
size_t cuhl;
Dwarf_Die *diep;
@@ -668,10 +668,6 @@ int find_probe_point(int fd, struct probe_point *pp)
pf.fname = NULL;

if (!pp->file || pf.fname) {
- /* Save CU base address (for frame_base) */
- ret = dwarf_lowpc(&pf.cu_die, &pf.cu_base);
- if (ret != 0)
- pf.cu_base = 0;
if (pp->function)
find_probe_point_by_func(&pf);
else if (pp->lazy_line)
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index d1a6517..21f7354 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -71,7 +71,6 @@ struct probe_finder {

/* For variable searching */
Dwarf_Op *fb_ops; /* Frame base attribute */
- Dwarf_Addr cu_base; /* Current CU base address */
const char *var; /* Current variable name */
char *buf; /* Current output buffer */
int len; /* Length of output buffer */
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 323c0ae..c458c4a 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -163,9 +163,17 @@ void dso__set_long_name(struct dso *self, char *name)
self->long_name_len = strlen(name);
}

+static void dso__set_short_name(struct dso *self, const char *name)
+{
+ if (name == NULL)
+ return;
+ self->short_name = name;
+ self->short_name_len = strlen(name);
+}
+
static void dso__set_basename(struct dso *self)
{
- self->short_name = basename(self->long_name);
+ dso__set_short_name(self, basename(self->long_name));
}

struct dso *dso__new(const char *name)
@@ -176,7 +184,7 @@ struct dso *dso__new(const char *name)
int i;
strcpy(self->name, name);
dso__set_long_name(self, self->name);
- self->short_name = self->name;
+ dso__set_short_name(self, self->name);
for (i = 0; i < MAP__NR_TYPES; ++i)
self->symbols[i] = self->symbol_names[i] = RB_ROOT;
self->slen_calculated = 0;
@@ -897,7 +905,6 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
struct kmap *kmap = self->kernel ? map__kmap(map) : NULL;
struct map *curr_map = map;
struct dso *curr_dso = self;
- size_t dso_name_len = strlen(self->short_name);
Elf_Data *symstrs, *secstrs;
uint32_t nr_syms;
int err = -1;
@@ -987,7 +994,8 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
char dso_name[PATH_MAX];

if (strcmp(section_name,
- curr_dso->short_name + dso_name_len) == 0)
+ (curr_dso->short_name +
+ self->short_name_len)) == 0)
goto new_symbol;

if (strcmp(section_name, ".text") == 0) {
@@ -1782,7 +1790,7 @@ struct dso *dso__new_kernel(const char *name)
struct dso *self = dso__new(name ?: "[kernel.kallsyms]");

if (self != NULL) {
- self->short_name = "[kernel]";
+ dso__set_short_name(self, "[kernel]");
self->kernel = 1;
}

diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 280dadd..f30a374 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -110,9 +110,10 @@ struct dso {
u8 sorted_by_name;
u8 loaded;
u8 build_id[BUILD_ID_SIZE];
- u16 long_name_len;
const char *short_name;
char *long_name;
+ u16 long_name_len;
+ u16 short_name_len;
char name[0];
};

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Frederic Weisbecker

Mar 28, 2010, 1:20:02 AM
Ingo,

Please pull the perf/urgent branch that can be found at:

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing.git
perf/urgent

Thanks,
Frederic
---

Frederic Weisbecker (2):
perf: Correctly align perf event tracing buffer
perf: Use hot regs with software sched switch/migrate events


include/linux/perf_event.h | 21 ++++++++++++++-------
kernel/perf_event.c | 4 +---
kernel/trace/trace_event_perf.c | 11 +++++++++--
3 files changed, 24 insertions(+), 12 deletions(-)

Frederic Weisbecker

Mar 28, 2010, 1:20:02 AM
Scheduler's task migration events don't work because they always
pass NULL regs to perf_sw_event(). The event hence gets filtered
out in perf_swevent_add().

Scheduler's context switch events use task_pt_regs() to get
the context when the event occurred, which is the wrong thing to
do as this won't give us the place in the kernel where we went
to sleep but the place where we left userspace. The result is
even more wrong if we switch from a kernel thread.

Use the hot regs snapshot for both events, as they belong to the
non-interrupt/exception based family of events. Unlike page faults
and the like, which provide the regs matching the exact origin of
the event, we need to save the current context.

This makes the task migration event work and fixes the context
switch callchains and origin ip.
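
The core of the fix, extracted from the perf_sw_event() hunk in the
patch below: when no regs are passed in, take a snapshot of the current
("hot") state right at the event site instead of dropping the event:

	struct pt_regs hot_regs;

	if (!regs) {
		/* capture where we are *now*, skipping this helper frame */
		perf_fetch_caller_regs(&hot_regs, 1);
		regs = &hot_regs;
	}
	__perf_sw_event(event_id, nr, nmi, regs, addr);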

Example: perf record -a -e cs

Before:

10.91% ksoftirqd/0 0 [k] 0000000000000000
|
--- (nil)
perf_callchain
perf_prepare_sample
__perf_event_overflow
perf_swevent_overflow
perf_swevent_add
perf_swevent_ctx_event
do_perf_sw_event
__perf_sw_event
perf_event_task_sched_out
schedule
run_ksoftirqd
kthread
kernel_thread_helper

After:

23.77% hald-addon-stor [kernel.kallsyms] [k] schedule
|
--- schedule
|
|--60.00%-- schedule_timeout
| wait_for_common
| wait_for_completion
| blk_execute_rq
| scsi_execute
| scsi_execute_req
| sr_test_unit_ready
| |
| |--66.67%-- sr_media_change
| | media_changed
| | cdrom_media_changed
| | sr_block_media_changed
| | check_disk_change
| | cdrom_open

Signed-off-by: Frederic Weisbecker <fwei...@gmail.com>
Cc: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Arnaldo Carvalho de Melo <ac...@redhat.com>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Ingo Molnar <mi...@elte.hu>
Cc: David Miller <da...@davemloft.net>
---


include/linux/perf_event.h | 21 ++++++++++++++-------
kernel/perf_event.c | 4 +---

2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9547703..c8e3754 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -842,13 +842,6 @@ extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);

-static inline void
-perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
-{
- if (atomic_read(&perf_swevent_enabled[event_id]))
- __perf_sw_event(event_id, nr, nmi, regs, addr);
-}
-
extern void
perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);

@@ -887,6 +880,20 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip)
return perf_arch_fetch_caller_regs(regs, ip, skip);
}

+static inline void
+perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+{
+ if (atomic_read(&perf_swevent_enabled[event_id])) {
+ struct pt_regs hot_regs;
+
+ if (!regs) {
+ perf_fetch_caller_regs(&hot_regs, 1);
+ regs = &hot_regs;
+ }
+ __perf_sw_event(event_id, nr, nmi, regs, addr);
+ }
+}
+
extern void __perf_event_mmap(struct vm_area_struct *vma);

static inline void perf_event_mmap(struct vm_area_struct *vma)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fb3031c..bc7943c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1164,11 +1164,9 @@ void perf_event_task_sched_out(struct task_struct *task,
struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent;
- struct pt_regs *regs;
int do_switch = 1;

- regs = task_pt_regs(task);
- perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+ perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);

if (likely(!ctx || !cpuctx->task_ctx))
return;
--
1.6.2.3

Ingo Molnar

Mar 28, 2010, 11:40:01 PM

* Frederic Weisbecker <fwei...@gmail.com> wrote:

> Ingo,
>
> Please pull the perf/urgent branch that can be found at:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing.git
> perf/urgent
>
> Thanks,
> Frederic
> ---
>
> Frederic Weisbecker (2):
> perf: Correctly align perf event tracing buffer
> perf: Use hot regs with software sched switch/migrate events
>
>
> include/linux/perf_event.h | 21 ++++++++++++++-------
> kernel/perf_event.c | 4 +---
> kernel/trace/trace_event_perf.c | 11 +++++++++--
> 3 files changed, 24 insertions(+), 12 deletions(-)

Pulled, thanks a lot Frederic!

Ingo

Peter Zijlstra

Mar 29, 2010, 5:20:02 AM
On Sun, 2010-03-28 at 07:11 +0200, Frederic Weisbecker wrote:
> Scheduler's task migration events don't work because they always
> pass NULL regs to perf_sw_event(). The event hence gets filtered
> out in perf_swevent_add().
>
> Scheduler's context switch events use task_pt_regs() to get
> the context when the event occurred, which is the wrong thing to
> do as this won't give us the place in the kernel where we went
> to sleep but the place where we left userspace. The result is
> even more wrong if we switch from a kernel thread.
>
> Use the hot regs snapshot for both events, as they belong to the
> non-interrupt/exception based family of events. Unlike page faults
> and the like, which provide the regs matching the exact origin of
> the event, we need to save the current context.
>
> This makes the task migration event work and fixes the context
> switch callchains and origin ip.


But after this it's no longer possible to profile userspace on context
switches, is it?

Frederic Weisbecker

Mar 29, 2010, 1:50:03 PM
On Mon, Mar 29, 2010 at 10:49:59AM +0200, Peter Zijlstra wrote:
> On Sun, 2010-03-28 at 07:11 +0200, Frederic Weisbecker wrote:
> > Scheduler's task migration events don't work because they always
> > pass NULL regs to perf_sw_event(). The event hence gets filtered
> > out in perf_swevent_add().
> >
> > Scheduler's context switch events use task_pt_regs() to get
> > the context when the event occurred, which is the wrong thing to
> > do as this won't give us the place in the kernel where we went
> > to sleep but the place where we left userspace. The result is
> > even more wrong if we switch from a kernel thread.
> >
> > Use the hot regs snapshot for both events, as they belong to the
> > non-interrupt/exception based family of events. Unlike page faults
> > and the like, which provide the regs matching the exact origin of
> > the event, we need to save the current context.
> >
> > This makes the task migration event work and fixes the context
> > switch callchains and origin ip.
>
>
> But after this it's no longer possible to profile userspace on context
> switches, is it?


Once the callchain in the kernel finishes, we bounce to the userspace
part, using task_pt_regs(). The previous version was incorrect because
it was ignoring the kernel part.

But you make me wonder... We don't take exclude_kernel or exclude_user
into account with these hot regs.

I think we need several new things:

Every arch does its own:

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	if (current->mm)
		perf_callchain_user(regs, entry);

Plus, perf_callchain_user() fetches task_pt_regs()
by itself.

These are checks we should do from the core, according
to exclude_kernel, exclude_user, user_mode() and current->mm.

Archs shouldn't have to bother with these details.
They should just implement perf_callchain_kernel() and perf_callchain_user()
rather than a monolithic function that deals with contexts.

Each time we pass regs to perf_event_overflow() we should call
a perf_filter_callchain(struct pt_regs *default) that checks the
exclude_* attributes and overrides the regs with task_pt_regs() if
needed (and if current->mm is set), so that even the ip source will
be correct.

And a generic perf_callchain() can deal with the perf_callchain_kernel()
and perf_callchain_user() calls, again according to the exclude_*
policies.

I'm going to make a quick fix for perf_fetch_caller_regs() that
passes task_pt_regs if exclude_kernel for perf/urgent,
and I'll do the above cleanups/invasive fixes on perf/core.
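
For illustration, a minimal sketch of that split, with assumed helper
names (this is a proposal being discussed, not the final API):

	static void perf_callchain(struct perf_event *event, struct pt_regs *regs,
				   struct perf_callchain_entry *entry)
	{
		/* Arch code would only provide the two raw walkers. */
		if (!event->attr.exclude_kernel && !user_mode(regs))
			perf_callchain_kernel(regs, entry);

		/* Kernel threads have no userspace part to unwind. */
		if (!event->attr.exclude_user && current->mm)
			perf_callchain_user(task_pt_regs(current), entry);
	}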

Peter Zijlstra

Mar 29, 2010, 2:10:02 PM
On Mon, 2010-03-29 at 19:47 +0200, Frederic Weisbecker wrote:
>
>
> I'm going to make a quick fix for perf_fetch_caller_regs() that
> passes task_pt_regs if exclude_kernel for perf/urgent,
> and I'll do the above cleanups/invasive fixes on perf/core.
>
>
ok, sounds sensible, thanks!

Frederic Weisbecker

Mar 29, 2010, 6:50:02 PM
On Mon, Mar 29, 2010 at 08:05:38PM +0200, Peter Zijlstra wrote:
> On Mon, 2010-03-29 at 19:47 +0200, Frederic Weisbecker wrote:
> >
> >
> > I'm going to make a quick fix for perf_fetch_caller_regs() that
> > passes task_pt_regs if exclude_kernel for perf/urgent,
> > and I'll do the above cleanups/invasive fixes on perf/core.
> >
> >
> ok, sounds sensible, thanks!


Actually I have doubts about what should be the strict sense
of exclude_kernel.

Does that mean we exclude any event that happened in the kernel?
Or does that mean we exclude the part that happened in the kernel?

Depending on the case, we do either.

In perf_swevent_hrtimer(), we simply go back to task_pt_regs()
if exclude_kernel.

But for other software events we don't do such a fix; we actually
filter out the event if it is not user_mode().

So, I'm a bit confused on what to do.
I'm tempted to adopt the meaning from perf_swevent_hrtimer()
for software events too, but I'm not sure...

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b0feb47..3cb5de8 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3986,14 +3986,17 @@ static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data);

static int perf_exclude_event(struct perf_event *event,
- struct pt_regs *regs)
+ struct pt_regs **regs)
{
- if (regs) {
- if (event->attr.exclude_user && user_mode(regs))
+ if (*regs) {
+ if (event->attr.exclude_user && user_mode(*regs))
return 1;

- if (event->attr.exclude_kernel && !user_mode(regs))
- return 1;
+ if (event->attr.exclude_kernel && !user_mode(*regs))
+ if (current->mm)
+ *regs = task_pt_regs(current);
+ else
+ return 1;
}

return 0;
@@ -4017,7 +4020,7 @@ static int perf_swevent_match(struct perf_event *event,
if (event->attr.config != event_id)
return 0;

- if (perf_exclude_event(event, regs))
+ if (perf_exclude_event(event, &regs))
return 0;

if (event->attr.type == PERF_TYPE_TRACEPOINT &&
@@ -4442,7 +4445,7 @@ void perf_bp_event(struct perf_event *bp, void *data)

perf_sample_data_init(&sample, bp->attr.bp_addr);

- if (!perf_exclude_event(bp, regs))
+ if (!perf_exclude_event(bp, &regs))
perf_swevent_add(bp, 1, 1, &sample, regs);
}
#else

Frederic Weisbecker

Mar 29, 2010, 7:00:02 PM
On Tue, Mar 30, 2010 at 12:43:54AM +0200, Frederic Weisbecker wrote:
> On Mon, Mar 29, 2010 at 08:05:38PM +0200, Peter Zijlstra wrote:
> > On Mon, 2010-03-29 at 19:47 +0200, Frederic Weisbecker wrote:
> > >
> > >
> > > I'm going to make a quick fix for perf_fetch_caller_regs() that
> > > passes task_pt_regs if exclude_kernel for perf/urgent,
> > > and I'll do the above cleanups/invasive fixes on perf/core.
> > >
> > >
> > ok, sounds sensible, thanks!
>
>
> Actually I have doubts about what should be the strict sense
> of exclude_kernel.
>
> Does that mean we exclude any event that happened in the kernel?
> Or does that mean we exclude the part that happened in the kernel?
>
> Depending on the case, we do either.
>
> In perf_swevent_hrtimer(), we simply go back to task_pt_regs()
> if exclude_kernel.
>
> But for other software events we don't do such a fix; we actually
> filter out the event if it is not user_mode().
>
> So, I'm a bit confused on what to do.
> I'm tempted to adopt the meaning from perf_swevent_hrtimer()
> for software events too, but I'm not sure...


I think this is the right thing to do: jump back to the user context
instead of filtering the event out (unless we're in a kernel thread).

Otherwise every software event, trace events included, is totally
pointless with exclude_kernel.
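
A minimal sketch of that policy (the helper name perf_adjust_regs() is
made up here, just to make the two cases explicit):

	static struct pt_regs *
	perf_adjust_regs(struct perf_event *event, struct pt_regs *regs)
	{
		if (event->attr.exclude_kernel && !user_mode(regs)) {
			if (!current->mm)
				return NULL;	/* kernel thread: drop the event */
			regs = task_pt_regs(current);	/* bounce to user context */
		}
		return regs;
	}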

Peter Zijlstra

Mar 30, 2010, 3:00:02 PM
On Tue, 2010-03-30 at 00:43 +0200, Frederic Weisbecker wrote:

> Actually I have doubts about what should be the strict sense
> of exclude_kernel.
>
> Does that mean we exclude any event that happened in the kernel?
> Or does that mean we exclude the part that happened in the kernel?
>
> Depending on the case, we do either.
>
> In perf_swevent_hrtimer(), we simply go back to task_pt_regs()
> if exclude_kernel.
>
> But for other software events we don't do such a fix; we actually
> filter out the event if it is not user_mode().
>
> So, I'm a bit confused on what to do.
> I'm tempted to adopt the meaning from perf_swevent_hrtimer()
> for software events too, but I'm not sure...

Yes, that is indeed a good point. The problem is that perf_swevent_hrtimer()
is not quite correct either, since strictly speaking its timeline should
stop on the excluded region, but implementing that would make context
switches horribly expensive.

That said, the option that would be most correct is to simply not count
these events, and in that respect the current behaviour seems best.

Maybe we can make a new perf feature that would, for each kernel event
(hw pmu included), report on the userspace state. Would that be useful?

Frederic Weisbecker

Mar 30, 2010, 3:20:01 PM
On Tue, Mar 30, 2010 at 08:54:52PM +0200, Peter Zijlstra wrote:
> On Tue, 2010-03-30 at 00:43 +0200, Frederic Weisbecker wrote:
>
> > Actually I have doubts about what should be the strict sense
> > of exclude_kernel.
> >
> > Does that mean we exclude any event that happened in the kernel?
> > Or does that mean we exclude the part that happened in the kernel?
> >
> > Depending on the case, we do either.
> >
> > In perf_swevent_hrtimer(), we simply go back to task_pt_regs()
> > if exclude_kernel.
> >
> > But for other software events we don't do such a fix; we actually
> > filter out the event if it is not user_mode().
> >
> > So, I'm a bit confused on what to do.
> > I'm tempted to adopt the meaning from perf_swevent_hrtimer()
> > for software events too, but I'm not sure...
>
> Yes, that is indeed a good point. The problem is that perf_swevent_hrtimer()
> is not quite correct either, since strictly speaking its timeline should
> stop on the excluded region, but implementing that would make context
> switches horribly expensive.

No, we wouldn't need that. We would just need to change the regs
check.

Currently we have this:

	regs = get_irq_regs();
	/*
	 * In case we exclude kernel IPs or are somehow not in interrupt
	 * context, provide the next best thing, the user IP.
	 */
	if ((event->attr.exclude_kernel || !regs) &&
	    !event->attr.exclude_user)
		regs = task_pt_regs(current);


According to the strict meaning of exclude_kernel (only count events
that happened in userspace), we should have this:


	regs = get_irq_regs();

	if (event->attr.exclude_kernel && regs)
		return ret;

	if (!regs && !event->attr.exclude_user && current->mm)
		regs = task_pt_regs(current);

	if (regs)
		overflow();


Note the current code is also buggy because we call task_pt_regs()
whether we are a kernel thread or not.


>
> That said, the option that would be most correct is to simply not count
> these events, and in that respect the current behaviour seems best.


Ok. But in this case I'm not sure what to do with the context switch
software event. The new hot regs thing now captures the kernel context,
whereas before it was only capturing the userspace exit point.

Are you fine with that? The callchain will still go to userspace too.


> Maybe we can make a new perf feature that would, for each kernel event
> (hw pmu included), report on the userspace state. Would that be useful?


I'm not sure it would be useful...

Frederic Weisbecker

Mar 31, 2010, 12:00:02 AM
Scheduler's task migration events don't work because they always
pass NULL regs to perf_sw_event(). The event hence gets filtered
out in perf_swevent_add().

Scheduler's context switch events use task_pt_regs() to get
the context when the event occurred, which is the wrong thing to
do as this won't give us the place in the kernel where we went
to sleep but the place where we left userspace. The result is
even more wrong if we switch from a kernel thread.

Use the hot regs snapshot for both events, as they belong to the
non-interrupt/exception based family of events. Unlike page faults
and the like, which provide the regs matching the exact origin of
the event, we need to save the current context.

This makes the task migration event work and fixes the context
switch callchains and origin ip.

Example: perf record -a -e cs

Before:

After:

v2: Always build perf_arch_fetch_caller_regs() now that software
events need that too. They don't need it from modules, unlike trace
events, so we keep the EXPORT_SYMBOL in trace_event_perf.c
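
For reference, perf_arch_fetch_caller_regs() relies on the linker's
weak-symbol mechanism: the generic code carries a __weak stub, roughly
like this (a sketch, not the exact file contents):

	/* kernel/perf_event.c: empty generic fallback */
	__weak void perf_arch_fetch_caller_regs(struct pt_regs *regs,
						unsigned long ip, int skip)
	{
	}

and a strong arch definition, such as the x86 one whose #ifdef is
dropped in the hunk below, overrides it at link time.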

Signed-off-by: Frederic Weisbecker <fwei...@gmail.com>
Cc: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Arnaldo Carvalho de Melo <ac...@redhat.com>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Ingo Molnar <mi...@elte.hu>
Cc: David Miller <da...@davemloft.net>
---

arch/x86/kernel/cpu/perf_event.c | 2 --


include/linux/perf_event.h | 21 ++++++++++++++-------
kernel/perf_event.c | 4 +---

3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 60398a0..5fb490c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1702,7 +1702,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}

-#ifdef CONFIG_EVENT_TRACING


void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)

{
regs->ip = ip;
@@ -1714,4 +1713,3 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
regs->cs = __KERNEL_CS;
local_save_flags(regs->flags);
}
-#endif

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 574ee58..b0feb47 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c


@@ -1164,11 +1164,9 @@ void perf_event_task_sched_out(struct task_struct *task,
struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent;
- struct pt_regs *regs;
int do_switch = 1;

- regs = task_pt_regs(task);
- perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+ perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);

if (likely(!ctx || !cpuctx->task_ctx))
return;
--
1.6.2.3

--

Frederic Weisbecker

Mar 31, 2010, 12:00:02 AM
From: Jason Wessel <jason....@windriver.com>

It is required to call hw_breakpoint_init() on an attr before using it
in any other calls. This fixes the problem where kgdb will sometimes
fail to initialize on x86_64.
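
The ordering matters because hw_breakpoint_init() (as defined in
include/linux/hw_breakpoint.h at the time) zeroes the whole attr and
presets attr.type, attr.size and attr.pinned, so any field assigned
before the call is wiped out. The fixed sequence is therefore:

	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);	/* memset + type = PERF_TYPE_BREAKPOINT */
	attr.bp_addr = (unsigned long)kgdb_arch_init;
	attr.bp_len = HW_BREAKPOINT_LEN_1;
	attr.bp_type = HW_BREAKPOINT_W;
	attr.disabled = 1;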

Signed-off-by: Jason Wessel <jason....@windriver.com>
Cc: Ingo Molnar <mi...@elte.hu>
Cc: 2.6.33 <sta...@kernel.org>
LKML-Reference: <1269975907-27602-1-git-...@windriver.com>
Signed-off-by: Frederic Weisbecker <fwei...@gmail.com>
---
arch/x86/kernel/kgdb.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index bfba601..b2258ca 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -618,8 +618,8 @@ int kgdb_arch_init(void)
* portion of kgdb because this operation requires mutexs to
* complete.
*/
+ hw_breakpoint_init(&attr);
attr.bp_addr = (unsigned long)kgdb_arch_init;
- attr.type = PERF_TYPE_BREAKPOINT;
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;

Frederic Weisbecker

Mar 31, 2010, 12:00:02 AM
Ingo,

Please pull the perf/urgent branch that can be found at:

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing.git
perf/urgent

It fixes the conflict against Linus's latest tree that raised
build errors (you can find a merge commit inside).

I am not yet totally sure about Peter's opinion concerning
the context switch event that is now captured from the kernel.
Our latest discussion on the strict exclude_kernel meaning
seems to be in favour of this change, but I'd prefer to wait for his
final approval before you pull this.

(I'm not reposting the alignment fix as it is unchanged)

Thanks,
Frederic
---

Frederic Weisbecker (2):
perf: Correctly align perf event tracing buffer
perf: Use hot regs with software sched switch/migrate events

Jason Wessel (1):
x86,kgdb: Always initialize the hw breakpoint attribute


arch/x86/kernel/cpu/perf_event.c | 2 --
arch/x86/kernel/kgdb.c | 2 +-


include/linux/perf_event.h | 21 ++++++++++++++-------
kernel/perf_event.c | 4 +---

kernel/trace/trace_event_perf.c | 11 +++++++++--
5 files changed, 25 insertions(+), 15 deletions(-)

Frederic Weisbecker

Apr 1, 2010, 2:20:02 AM
On Wed, Mar 31, 2010 at 05:58:45AM +0200, Frederic Weisbecker wrote:
> Ingo,
>
> Please pull the perf/urgent branch that can be found at:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing.git
> perf/urgent


I have updated the tree. There was a leftover patch for perf/core
inside (not listed here). Also, since you've merged Linus's tree into
perf/urgent lately, I've zapped my merge commit. Other than that,
the three fixes remain.

Thanks.

Ingo Molnar

Apr 2, 2010, 1:40:01 PM

* Frederic Weisbecker <fwei...@gmail.com> wrote:

> On Wed, Mar 31, 2010 at 05:58:45AM +0200, Frederic Weisbecker wrote:
> > Ingo,
> >
> > Please pull the perf/urgent branch that can be found at:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing.git
> > perf/urgent
>
>
> I have updated the tree. There was a leftover patch for perf/core
> inside (not listed here). Also, since you've merged Linus's tree into
> perf/urgent lately, I've zapped my merge commit. Other than that,
> the three fixes remain.
>
> Thanks.

Pulled, thanks Frederic!

Note, I also queued up this patch from Edwin:

257ef9d: perf, x86: Fix callgraphs of 32-bit processes on 64-bit kernels

with your Acked-by added.

Thanks,

Ingo

Frederic Weisbecker

Apr 3, 2010, 6:40:02 AM
On Fri, Apr 02, 2010 at 07:32:03PM +0200, Ingo Molnar wrote:
>
> * Frederic Weisbecker <fwei...@gmail.com> wrote:
>
> > On Wed, Mar 31, 2010 at 05:58:45AM +0200, Frederic Weisbecker wrote:
> > > Ingo,
> > >
> > > Please pull the perf/urgent branch that can be found at:
> > >
> > > git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing.git
> > > perf/urgent
> >
> >
> > I have updated the tree. There was a leftover patch for perf/core
> > inside (not listed here). Also, since you've merged Linus's tree into
> > perf/urgent lately, I've zapped my merge commit. Other than that,
> > the three fixes remain.
> >
> > Thanks.
>
> Pulled, thanks Frederic!
>
> Note, I also queued up this patch from Edwin:
>
> 257ef9d: perf, x86: Fix callgraphs of 32-bit processes on 64-bit kernels
>
> with your Acked-by added.
>
> Thanks,
>
> Ingo


Ah ok. I was too cautious and eventually queued it for perf/core, but
if you think it's fine, no problem.

Thanks.

Frederic Weisbecker

Apr 3, 2010, 6:50:02 AM
Ingo,

Please pull the perf/urgent branch that can be found at:

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing.git
perf/urgent

Thanks,
Frederic
---

Frederic Weisbecker (2):
perf: Always build the stub perf_arch_fetch_caller_regs version
perf: Always build the powerpc perf_arch_fetch_caller_regs version


arch/powerpc/kernel/misc.S | 2 --
kernel/perf_event.c | 3 +--
2 files changed, 1 insertions(+), 4 deletions(-)

Ingo Molnar

Apr 4, 2010, 6:20:02 AM
Linus,

Please pull the latest perf-fixes-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git perf-fixes-for-linus

Thanks,

Ingo

------------------>
Borislav Petkov (1):
perf, probe-finder: Build fix on Debian

Frederic Weisbecker (4):


perf: Correctly align perf event tracing buffer
perf: Use hot regs with software sched switch/migrate events

perf: Always build the stub perf_arch_fetch_caller_regs version
perf: Always build the powerpc perf_arch_fetch_caller_regs version

Jason Wessel (1):


x86,kgdb: Always initialize the hw breakpoint attribute

Mike Galbraith (1):
perf: Fix 'perf sched record' deadlock

Peter Zijlstra (2):
x86: Move notify_cpu_starting() callback to a later stage
perf, x86: Fix AMD hotplug & constraint initialization

Tom Zanussi (1):
perf/scripts: Tuple was set from long in both branches in python_process_event()

Torok Edwin (1):


perf, x86: Fix callgraphs of 32-bit processes on 64-bit kernels


arch/powerpc/kernel/misc.S | 2 -
arch/x86/kernel/cpu/perf_event.c | 54 +++++++++++---
arch/x86/kernel/cpu/perf_event_amd.c | 80 ++++++++++++--------
arch/x86/kernel/dumpstack.h | 5 +
arch/x86/kernel/kgdb.c | 2 +-
arch/x86/kernel/smpboot.c | 4 +-
include/linux/perf_event.h | 21 ++++--
kernel/perf_event.c | 22 ++++--
kernel/trace/trace_event_perf.c | 11 ++-
tools/perf/Makefile | 10 +-
.../util/scripting-engines/trace-event-python.c | 17 +++-
11 files changed, 153 insertions(+), 75 deletions(-)

diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index b485a87..22e507c 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -128,7 +128,6 @@ _GLOBAL(__restore_cpu_power7)


/* place holder */
blr

-#ifdef CONFIG_EVENT_TRACING
/*


* Get a minimal set of registers for our caller's nth caller.

* r3 = regs pointer, r5 = n.

@@ -154,4 +153,3 @@ _GLOBAL(perf_arch_fetch_caller_regs)
PPC_STL r4,_NIP-STACK_FRAME_OVERHEAD(r3)
PPC_STL r7,_LINK-STACK_FRAME_OVERHEAD(r3)
blr
-#endif /* CONFIG_EVENT_TRACING */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 60398a0..53ea4cf 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -28,6 +28,7 @@
#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
+#include <asm/compat.h>

static u64 perf_event_mask __read_mostly;

@@ -158,7 +159,7 @@ struct x86_pmu {
struct perf_event *event);
struct event_constraint *event_constraints;

- void (*cpu_prepare)(int cpu);
+ int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
void (*cpu_dying)(int cpu);
void (*cpu_dead)(int cpu);
@@ -1333,11 +1334,12 @@ static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
+ int ret = NOTIFY_OK;

switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
if (x86_pmu.cpu_prepare)
- x86_pmu.cpu_prepare(cpu);
+ ret = x86_pmu.cpu_prepare(cpu);
break;

case CPU_STARTING:
@@ -1350,6 +1352,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
x86_pmu.cpu_dying(cpu);
break;

+ case CPU_UP_CANCELED:
case CPU_DEAD:
if (x86_pmu.cpu_dead)
x86_pmu.cpu_dead(cpu);
@@ -1359,7 +1362,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
}

- return NOTIFY_OK;
+ return ret;
}

static void __init pmu_check_apic(void)
@@ -1628,14 +1631,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
return len;
}

-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+#ifdef CONFIG_COMPAT
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
- unsigned long bytes;
+ /* 32-bit process in 64-bit kernel. */
+ struct stack_frame_ia32 frame;
+ const void __user *fp;

- bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
+ if (!test_thread_flag(TIF_IA32))
+ return 0;
+
+ fp = compat_ptr(regs->bp);
+ while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ unsigned long bytes;
+ frame.next_frame = 0;
+ frame.return_address = 0;
+
+ bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (bytes != sizeof(frame))
+ break;
+
+ if (fp < compat_ptr(regs->sp))
+ break;

- return bytes == sizeof(*frame);
+ callchain_store(entry, frame.return_address);
+ fp = compat_ptr(frame.next_frame);
+ }
+ return 1;
+}
+#else
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ return 0;
}
+#endif

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1651,11 +1682,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
callchain_store(entry, PERF_CONTEXT_USER);
callchain_store(entry, regs->ip);

+ if (perf_callchain_user32(regs, entry))
+ return;
+
while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;

- if (!copy_stack_frame(fp, &frame))
+ bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (bytes != sizeof(frame))
break;

if ((unsigned long)fp < regs->sp)
@@ -1702,7 +1738,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)


return entry;
}

-#ifdef CONFIG_EVENT_TRACING
void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
{
regs->ip = ip;

@@ -1714,4 +1749,3 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski


regs->cs = __KERNEL_CS;
local_save_flags(regs->flags);
}
-#endif

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index b87e0b6..db6f7d4 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -137,6 +137,13 @@ static inline int amd_is_nb_event(struct hw_perf_event *hwc)
return (hwc->config & 0xe0) == 0xe0;
}

+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+ struct amd_nb *nb = cpuc->amd_nb;
+
+ return nb && nb->nb_id != -1;
+}
+
static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
@@ -147,7 +154,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
/*
* only care about NB events
*/
- if (!(nb && amd_is_nb_event(hwc)))
+ if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
return;

/*
@@ -214,7 +221,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
/*
* if not NB event or no NB, then no constraints
*/
- if (!(nb && amd_is_nb_event(hwc)))
+ if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
return &unconstrained;

/*
@@ -293,51 +300,55 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
return nb;
}

-static void amd_pmu_cpu_online(int cpu)
+static int amd_pmu_cpu_prepare(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ WARN_ON_ONCE(cpuc->amd_nb);
+
+ if (boot_cpu_data.x86_max_cores < 2)
+ return NOTIFY_OK;
+
+ cpuc->amd_nb = amd_alloc_nb(cpu, -1);
+ if (!cpuc->amd_nb)
+ return NOTIFY_BAD;
+
+ return NOTIFY_OK;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
{
- struct cpu_hw_events *cpu1, *cpu2;
- struct amd_nb *nb = NULL;
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct amd_nb *nb;
int i, nb_id;

if (boot_cpu_data.x86_max_cores < 2)
return;

- /*
- * function may be called too early in the
- * boot process, in which case nb_id is bogus
- */
nb_id = amd_get_nb_id(cpu);
- if (nb_id == BAD_APICID)
- return;
-
- cpu1 = &per_cpu(cpu_hw_events, cpu);
- cpu1->amd_nb = NULL;
+ WARN_ON_ONCE(nb_id == BAD_APICID);

raw_spin_lock(&amd_nb_lock);

for_each_online_cpu(i) {
- cpu2 = &per_cpu(cpu_hw_events, i);
- nb = cpu2->amd_nb;
- if (!nb)
+ nb = per_cpu(cpu_hw_events, i).amd_nb;
+ if (WARN_ON_ONCE(!nb))
continue;
- if (nb->nb_id == nb_id)
- goto found;
- }

- nb = amd_alloc_nb(cpu, nb_id);
- if (!nb) {
- pr_err("perf_events: failed NB allocation for CPU%d\n", cpu);
- raw_spin_unlock(&amd_nb_lock);
- return;
+ if (nb->nb_id == nb_id) {
+ kfree(cpuc->amd_nb);
+ cpuc->amd_nb = nb;
+ break;
+ }
}
-found:
- nb->refcnt++;
- cpu1->amd_nb = nb;
+
+ cpuc->amd_nb->nb_id = nb_id;
+ cpuc->amd_nb->refcnt++;

raw_spin_unlock(&amd_nb_lock);
}

-static void amd_pmu_cpu_offline(int cpu)
+static void amd_pmu_cpu_dead(int cpu)
{
struct cpu_hw_events *cpuhw;

@@ -349,8 +360,10 @@ static void amd_pmu_cpu_offline(int cpu)
raw_spin_lock(&amd_nb_lock);

if (cpuhw->amd_nb) {
- if (--cpuhw->amd_nb->refcnt == 0)
- kfree(cpuhw->amd_nb);
+ struct amd_nb *nb = cpuhw->amd_nb;
+
+ if (nb->nb_id == -1 || --nb->refcnt == 0)
+ kfree(nb);

cpuhw->amd_nb = NULL;
}
@@ -379,8 +392,9 @@ static __initconst struct x86_pmu amd_pmu = {
.get_event_constraints = amd_get_event_constraints,
.put_event_constraints = amd_put_event_constraints,

- .cpu_prepare = amd_pmu_cpu_online,
- .cpu_dead = amd_pmu_cpu_offline,
+ .cpu_prepare = amd_pmu_cpu_prepare,
+ .cpu_starting = amd_pmu_cpu_starting,
+ .cpu_dead = amd_pmu_cpu_dead,
};

static __init int amd_pmu_init(void)
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 29e5f7c..e39e771 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -30,6 +30,11 @@ struct stack_frame {
unsigned long return_address;
};

+struct stack_frame_ia32 {
+ u32 next_frame;
+ u32 return_address;
+};
+
static inline unsigned long rewind_frame_pointer(int n)
{
struct stack_frame *frame;


diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index bfba601..b2258ca 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -618,8 +618,8 @@ int kgdb_arch_init(void)
* portion of kgdb because this operation requires mutexs to
* complete.
*/
+ hw_breakpoint_init(&attr);
attr.bp_addr = (unsigned long)kgdb_arch_init;
- attr.type = PERF_TYPE_BREAKPOINT;
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 06d98ae..6808b93 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -242,8 +242,6 @@ static void __cpuinit smp_callin(void)
end_local_APIC_setup();
map_cpu_to_logical_apicid();

- notify_cpu_starting(cpuid);
-
/*
* Need to setup vector mappings before we enable interrupts.
*/
@@ -264,6 +262,8 @@ static void __cpuinit smp_callin(void)
*/
smp_store_cpu_info(cpuid);

+ notify_cpu_starting(cpuid);
+
/*
* Allow the master to continue.
*/

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 574ee58..681af80 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1164,11 +1164,9 @@ void perf_event_task_sched_out(struct task_struct *task,
struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent;
- struct pt_regs *regs;
int do_switch = 1;

- regs = task_pt_regs(task);
- perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+ perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);

if (likely(!ctx || !cpuctx->task_ctx))
return;

@@ -2786,12 +2784,11 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return NULL;
}

-#ifdef CONFIG_EVENT_TRACING
__weak


void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
{
}

-#endif
+

/*
* Output
@@ -3378,15 +3375,23 @@ static void perf_event_task_output(struct perf_event *event,
struct perf_task_event *task_event)
{
struct perf_output_handle handle;
- int size;
struct task_struct *task = task_event->task;
- int ret;
+ unsigned long flags;
+ int size, ret;
+
+ /*
+ * If this CPU attempts to acquire an rq lock held by a CPU spinning
+ * in perf_output_lock() from interrupt context, it's game over.
+ */
+ local_irq_save(flags);

size = task_event->event_id.header.size;
ret = perf_output_begin(&handle, event, size, 0, 0);

- if (ret)
+ if (ret) {
+ local_irq_restore(flags);
return;
+ }

task_event->event_id.pid = perf_event_pid(event, task);
task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3397,6 +3402,7 @@ static void perf_event_task_output(struct perf_event *event,
perf_output_put(&handle, task_event->event_id);

perf_output_end(&handle);
+ local_irq_restore(flags);
}

static int perf_event_task_match(struct perf_event *event)
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 81f691e..0565bb4 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -17,7 +17,12 @@ EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
static char *perf_trace_buf;
static char *perf_trace_buf_nmi;

-typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ;
+/*
+ * Force it to be aligned to unsigned long to avoid misaligned accesses
+ * surprises
+ */
+typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
+ perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int total_ref_count;
@@ -130,6 +135,8 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
char *trace_buf, *raw_data;
int pc, cpu;

+ BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
+
pc = preempt_count();

/* Protect the per cpu buffer, begin the rcu read side */
@@ -152,7 +159,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
raw_data = per_cpu_ptr(trace_buf, cpu);

/* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

entry = (struct trace_entry *)raw_data;
tracing_generic_entry_update(entry, *irq_flags, pc);
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 8a8f52d..bc0f670 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -200,7 +200,7 @@ endif

CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
EXTLIBS = -lpthread -lrt -lelf -lm
-ALL_CFLAGS = $(CFLAGS)
+ALL_CFLAGS = $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
ALL_LDFLAGS = $(LDFLAGS)
STRIP ?= strip

@@ -492,19 +492,19 @@ ifeq ($(uname_S),Darwin)
PTHREAD_LIBS =
endif

-ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
-ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
endif

- ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+ ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
BASIC_CFLAGS += -DLIBELF_NO_MMAP
endif
else
msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]);
endif

-ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev);
BASIC_CFLAGS += -DNO_DWARF_SUPPORT
else
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 33a414b..6a72f14 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -208,7 +208,7 @@ static void python_process_event(int cpu, void *data,
int size __unused,
unsigned long long nsecs, char *comm)
{
- PyObject *handler, *retval, *context, *t;
+ PyObject *handler, *retval, *context, *t, *obj;
static char handler_name[256];
struct format_field *field;
unsigned long long val;
@@ -256,16 +256,23 @@ static void python_process_event(int cpu, void *data,
offset &= 0xffff;
} else
offset = field->offset;
- PyTuple_SetItem(t, n++,
- PyString_FromString((char *)data + offset));
+ obj = PyString_FromString((char *)data + offset);
} else { /* FIELD_IS_NUMERIC */
val = read_size(data + field->offset, field->size);
if (field->flags & FIELD_IS_SIGNED) {
- PyTuple_SetItem(t, n++, PyInt_FromLong(val));
+ if ((long long)val >= LONG_MIN &&
+ (long long)val <= LONG_MAX)
+ obj = PyInt_FromLong(val);
+ else
+ obj = PyLong_FromLongLong(val);
} else {
- PyTuple_SetItem(t, n++, PyInt_FromLong(val));
+ if (val <= LONG_MAX)
+ obj = PyInt_FromLong(val);
+ else
+ obj = PyLong_FromUnsignedLongLong(val);
}
}
+ PyTuple_SetItem(t, n++, obj);
}

if (_PyTuple_Resize(&t, n) == -1)
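
One detail from the trace_event_perf.c hunk above worth spelling out:
the perf_trace_t typedef change fixes the alignment by construction.
Roughly:

	/* before: a char array, so the per-cpu buffer could end up with
	 * arbitrary (byte) alignment */
	typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t;

	/* after: sized in unsigned longs, so allocations of perf_trace_t
	 * are naturally aligned to sizeof(unsigned long), avoiding
	 * misaligned accesses on strict-alignment architectures */
	typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
		perf_trace_t;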

Ingo Molnar

Apr 6, 2010, 2:00:04 PM
Linus,

Please pull the latest perf-fixes-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git perf-fixes-for-linus

Thanks,

Ingo

------------------>
Arnaldo Carvalho de Melo (1):
perf kmem: Fix breakage introduced by 5a0e3ad slab.h script

Vince Weaver (1):
perf, x86: Enable Nehalem-EX support


arch/x86/kernel/cpu/perf_event_intel.c | 1 +
tools/perf/builtin-kmem.c | 1 -
2 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 84bfde6..9c794ac 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -936,6 +936,7 @@ static __init int intel_pmu_init(void)

case 26: /* 45 nm nehalem, "Bloomfield" */
case 30: /* 45 nm nehalem, "Lynnfield" */
+ case 46: /* 45 nm nehalem-ex, "Beckton" */
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 7d9e3a7..924a951 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -14,7 +14,6 @@
#include "util/debug.h"

#include <linux/rbtree.h>
-#include <linux/slab.h>

struct alloc_stat;
typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
