One of the things that is missing is keeping the count value sane while
using PEBS -- another is dealing with the auto-frequency things; I thought
about a single-shot PEBS assist for that.
After this we can do something like PERF_SAMPLE_REGS, but for that we
need to think about how to expose pt_regs to userspace or something (or
maybe it already is exposed, I haven't checked).
Also, initially I'll go through all the other hw perf implementations
(powerpc, sparc, arm, sh) and make them refuse to create attr.precise
counters -- precise meaning the reported IP is not influenced by OoO
artefacts.
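Roughly, user-space usage would look something like the sketch below
(illustrative only: it assumes headers carrying the new attr.precise bit,
and the syscall wrapper is open-coded since glibc has no stub for it):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        long fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.sample_period = 100000;
        attr.sample_type = PERF_SAMPLE_IP;
        attr.precise = 1;               /* ask for a PEBS-backed counter */

        fd = sys_perf_event_open(&attr, 0, -1, -1, 0);  /* self, any cpu */
        if (fd < 0) {
                perror("perf_event_open");      /* fails if PEBS can't back it */
                return 1;
        }

        /* ... mmap() the ring buffer and read PERF_RECORD_SAMPLEs ... */
        return 0;
}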
Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
---
arch/x86/kernel/cpu/perf_event.c | 354 ++++++++++++++++++++++++++++++++++-----
include/linux/perf_event.h | 4
2 files changed, 314 insertions(+), 44 deletions(-)
Index: linux-2.6/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_event.c
@@ -38,11 +38,28 @@ static u64 perf_event_mask __read_mostly
#define BTS_RECORD_SIZE 24
/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
+#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
+struct pebs_record_core {
+ u64 eflags, eip;
+ u64 eax, ebx, ecx, edx;
+ u64 esi, edi, ebp, esp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+}; /* size: 0x90 bytes */
+
+struct pebs_record_nhm {
+ u64 eflags, eip;
+ u64 eax, ebx, ecx, edx;
+ u64 esi, edi, ebp, esp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+}; /* size: 0xB0 bytes */
+
+static int pebs_record_size;
/*
* Bits in the debugctlmsr controlling branch tracing.
@@ -104,12 +121,24 @@ struct cpu_hw_events {
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+/*
+ * Constraint on the Event code.
+ */
#define INTEL_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ */
#define FIXED_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK)
+/*
+ * Constraint on the Event code + UMask
+ */
+#define PEBS_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
#define EVENT_CONSTRAINT_END \
EVENT_CONSTRAINT(0, 0, 0)
@@ -136,11 +165,12 @@ struct x86_pmu {
int num_events_fixed;
int event_bits;
u64 event_mask;
- int apic;
+ int apic, bts, pebs;
u64 max_period;
u64 intel_ctrl;
- void (*enable_bts)(u64 config);
- void (*disable_bts)(void);
+
+ void (*drain_pebs)(struct cpu_hw_events *cpuc);
+ struct event_constraint *pebs_constraints;
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
@@ -303,6 +333,32 @@ static struct event_constraint intel_gen
EVENT_CONSTRAINT_END
};
+static struct event_constraint intel_core_pebs_events[] = {
+ PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+ PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
+ PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
+ PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_pebs_events[] = {
+ PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
+ PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
+ PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
+ PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
+ EVENT_CONSTRAINT_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -937,11 +993,6 @@ static void release_pmc_hardware(void)
#endif
}
-static inline bool bts_available(void)
-{
- return x86_pmu.enable_bts != NULL;
-}
-
static inline void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -962,11 +1013,11 @@ static inline void fini_debug_store_on_c
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}
-static void release_bts_hardware(void)
+static void release_ds_buffers(void)
{
int cpu;
- if (!bts_available())
+ if (!x86_pmu.bts && !x86_pmu.pebs)
return;
get_online_cpus();
@@ -982,6 +1033,7 @@ static void release_bts_hardware(void)
per_cpu(cpu_hw_events, cpu).ds = NULL;
+ kfree((void *)(unsigned long)ds->pebs_buffer_base);
kfree((void *)(unsigned long)ds->bts_buffer_base);
kfree(ds);
}
@@ -989,43 +1041,65 @@ static void release_bts_hardware(void)
put_online_cpus();
}
-static int reserve_bts_hardware(void)
+static int reserve_ds_buffers(void)
{
int cpu, err = 0;
- if (!bts_available())
- return 0;
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return 0;
get_online_cpus();
for_each_possible_cpu(cpu) {
struct debug_store *ds;
void *buffer;
+ int max, thresh;
err = -ENOMEM;
- buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
ds = kzalloc(sizeof(*ds), GFP_KERNEL);
- if (unlikely(!ds)) {
- kfree(buffer);
+ if (unlikely(!ds))
break;
- }
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
- ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum =
- ds->bts_buffer_base + BTS_BUFFER_SIZE;
- ds->bts_interrupt_threshold =
- ds->bts_absolute_maximum - BTS_OVFL_TH;
+ if (x86_pmu.bts) {
+ buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
+
+ max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+ thresh = max / 16;
+
+ ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ ds->bts_index = ds->bts_buffer_base;
+ ds->bts_absolute_maximum = ds->bts_buffer_base +
+ max * BTS_RECORD_SIZE;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+ thresh * BTS_RECORD_SIZE;
+ }
+
+ if (x86_pmu.pebs) {
+ buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
- per_cpu(cpu_hw_events, cpu).ds = ds;
+ max = PEBS_BUFFER_SIZE / pebs_record_size;
+ thresh = max / 16;
+
+ ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+ ds->pebs_index = ds->pebs_buffer_base;
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+ max * pebs_record_size;
+ ds->pebs_interrupt_threshold = ds->pebs_absolute_maximum -
+ thresh * pebs_record_size;
+ }
+
+ per_cpu(cpu_hw_events, cpu).ds = ds;
err = 0;
}
if (err)
- release_bts_hardware();
+ release_ds_buffers();
else {
for_each_online_cpu(cpu)
init_debug_store_on_cpu(cpu);
@@ -1040,7 +1114,7 @@ static void hw_perf_event_destroy(struct
{
if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
release_pmc_hardware();
- release_bts_hardware();
+ release_ds_buffers();
mutex_unlock(&pmc_reserve_mutex);
}
}
@@ -1119,6 +1193,37 @@ static void intel_pmu_disable_bts(void)
update_debugctlmsr(debugctlmsr);
}
+static void intel_pmu_pebs_enable(struct hw_perf_event *hwc)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx = hwc->idx;
+ u64 left;
+ u64 val;
+
+ left = min(hwc->sample_period, x86_pmu.max_period);
+ left = (u64)(-left) & x86_pmu.event_mask;
+
+ cpuc->ds->pebs_event_reset[idx] = left;
+ hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+ rdmsrl(MSR_IA32_PEBS_ENABLE, val);
+ val |= 1ULL << idx;
+ wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+}
+
+static void intel_pmu_pebs_disable(struct hw_perf_event *hwc)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx = hwc->idx;
+ u64 val;
+
+ rdmsrl(MSR_IA32_PEBS_ENABLE, val);
+ val &= ~(1ULL << idx);
+ wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+
+ hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -1139,7 +1244,7 @@ static int __hw_perf_event_init(struct p
if (!reserve_pmc_hardware())
err = -EBUSY;
else
- err = reserve_bts_hardware();
+ err = reserve_ds_buffers();
}
if (!err)
atomic_inc(&active_events);
@@ -1214,7 +1319,7 @@ static int __hw_perf_event_init(struct p
if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
(hwc->sample_period == 1)) {
/* BTS is not supported by this architecture. */
- if (!bts_available())
+ if (!x86_pmu.bts)
return -EOPNOTSUPP;
/* BTS is currently only allowed for user-mode. */
@@ -1646,6 +1751,9 @@ intel_pmu_disable_event(struct hw_perf_e
}
x86_pmu_disable_event(hwc, idx);
+
+ if (unlikely(hwc->pebs))
+ intel_pmu_pebs_disable(hwc);
}
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1767,6 +1875,9 @@ static void intel_pmu_enable_event(struc
return;
}
+ if (unlikely(hwc->pebs))
+ intel_pmu_pebs_enable(hwc);
+
__x86_pmu_enable_event(hwc, idx);
}
@@ -1920,8 +2031,7 @@ static void intel_pmu_drain_bts_buffer(s
*/
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event,
- header.size * (top - at), 1, 1))
+ if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
return;
for (; at < top; at++) {
@@ -1938,6 +2048,106 @@ static void intel_pmu_drain_bts_buffer(s
event->pending_kill = POLL_IN;
}
+static void intel_pmu_drain_pebs_core(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+ struct pebs_record_core *at, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ if (!event)
+ return;
+
+ if (!ds)
+ return;
+
+ at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+ if (top <= at)
+ return;
+
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ data.period = event->hw.last_period;
+ data.addr = 0;
+ data.raw = NULL;
+ regs.ip = 0;
+
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+ return;
+
+ for (; at < top; at++) {
+ data.ip = at->eip;
+
+ perf_output_sample(&handle, &header, &data, event);
+ }
+
+ perf_output_end(&handle);
+
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+}
+
+static void intel_pmu_drain_pebs_nhm(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ struct pebs_record_nhm *at, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ struct perf_event *event;
+ struct pt_regs regs;
+ int i;
+
+ if (!ds)
+ return;
+
+ at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+ if (top <= at)
+ return;
+
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ for (; at < top; at++) {
+ for (i = 0; i < x86_pmu.num_events; i++) {
+ event = cpuc->events[i];
+
+ if (!event || !event->attr.precise)
+ continue;
+
+ if (!(at->status & (1ULL << i)))
+ continue;
+
+ break;
+ }
+ if (i == x86_pmu.num_events)
+ continue;
+
+ data.period = event->hw.last_period;
+ data.addr = 0;
+ data.raw = NULL;
+ regs.ip = at->eip;
+
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event, header.size, 1, 1))
+ continue;
+
+ perf_output_sample(&handle, &header, &data, event);
+ perf_output_end(&handle);
+
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+ }
+}
+
static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc)
{
struct hw_perf_event *hwc = &event->hw;
@@ -2209,8 +2419,8 @@ perf_event_nmi_handler(struct notifier_b
return NOTIFY_STOP;
}
-static struct event_constraint unconstrained;
-
+static struct event_constraint unconstrained; /* can schedule */
+static struct event_constraint null_constraint; /* can't schedule */
static struct event_constraint bts_constraint =
EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
@@ -2233,20 +2443,28 @@ intel_special_constraints(struct perf_ev
static struct event_constraint *
intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
- struct event_constraint *c;
+ struct event_constraint *constraints = x86_pmu.event_constraints;
+ struct event_constraint *i, *c = &unconstrained;
c = intel_special_constraints(event);
if (c)
return c;
- if (x86_pmu.event_constraints) {
- for_each_event_constraint(c, x86_pmu.event_constraints) {
- if ((event->hw.config & c->cmask) == c->code)
- return c;
+ if (event->attr.precise) {
+ constraints = x86_pmu.pebs_constraints;
+ c = &null_constraint;
+ }
+
+ if (constraints) {
+ for_each_event_constraint(i, constraints) {
+ if ((event->hw.config & i->cmask) == i->code) {
+ c = i;
+ break;
+ }
}
}
- return &unconstrained;
+ return c;
}
static struct event_constraint *
@@ -2442,8 +2660,6 @@ static __initconst struct x86_pmu intel_
* the generic event period:
*/
.max_period = (1ULL << 31) - 1,
- .enable_bts = intel_pmu_enable_bts,
- .disable_bts = intel_pmu_disable_bts,
.get_event_constraints = intel_get_event_constraints
};
@@ -2500,6 +2716,7 @@ static __init int intel_pmu_init(void)
unsigned int unused;
unsigned int ebx;
int version;
+ u64 capabilities;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
/* check for P6 processor family */
@@ -2536,6 +2753,42 @@ static __init int intel_pmu_init(void)
if (version > 1)
x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+ if (!boot_cpu_has(X86_FEATURE_DTES64))
+ goto no_datastore;
+
+ x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
+ x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+ if (x86_pmu.pebs) {
+ int format = 0;
+
+ if (version > 1) {
+ /*
+ * v2+ has a PEBS format field
+ */
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+ format = (capabilities >> 8) & 0xf;
+ }
+
+ switch (format) {
+ case 0:
+ pebs_record_size = sizeof(struct pebs_record_core);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+ x86_pmu.pebs_constraints = intel_core_pebs_events;
+ break;
+
+ case 1:
+ pebs_record_size = sizeof(struct pebs_record_nhm);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
+ break;
+
+ default:
+ x86_pmu.pebs = 0;
+ break;
+ }
+ }
+no_datastore:
+
/*
* Install the hw-cache-events table:
*/
@@ -2695,6 +2948,19 @@ static const struct pmu pmu = {
};
/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+ struct event_constraint *c = x86_pmu.get_event_constraints(event);
+
+ if (!c || !c->weight)
+ return -ENOSPC;
+
+ return 0;
+}
+
+/*
* validate a single event group
*
* validation include:
@@ -2759,6 +3025,8 @@ const struct pmu *hw_perf_event_init(str
if (event->group_leader != event)
err = validate_group(event);
+ else
+ err = validate_event(event);
event->pmu = tmp;
}
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -203,8 +203,9 @@ struct perf_event_attr {
enable_on_exec : 1, /* next exec enables */
task : 1, /* trace fork/exit */
watermark : 1, /* wakeup_watermark */
+ precise : 1,
- __reserved_1 : 49;
+ __reserved_1 : 48;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -483,6 +484,7 @@ struct hw_perf_event {
unsigned long event_base;
int idx;
int last_cpu;
+ int pebs;
};
struct { /* software */
s64 remaining;
> Totally uncompiled and untested, but it looks to be mostly there, so I
> thought I'd post it.
>
> One of the things that is missing is keeping the count value sane while
> using PEBS -- another is dealing with the auto-frequency things; I thought
> about a single-shot PEBS assist for that.
>
> After this we can do something like PERF_SAMPLE_REGS, but for that we need
> to think about how to expose pt_regs to userspace or something (or maybe it
> already is exposed, I haven't checked).
>
> Also, initially I'll go through all the other hw perf implementations
> (powerpc, sparc, arm, sh) and make them refuse to create attr.precise
> counters -- precise meaning the reported IP is not influenced by OoO
> artefacts.
>
> Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
> ---
> arch/x86/kernel/cpu/perf_event.c | 354 ++++++++++++++++++++++++++++++++++-----
> include/linux/perf_event.h | 4
> 2 files changed, 314 insertions(+), 44 deletions(-)
Very nice!
> @@ -203,8 +203,9 @@ struct perf_event_attr {
> enable_on_exec : 1, /* next exec enables */
> task : 1, /* trace fork/exit */
> watermark : 1, /* wakeup_watermark */
> + precise : 1,
I think we want to default to precise events even if not specifically
requested by user-space, in the cases where that's possible on the CPU
without additional limitations.
That way people will default to better (and possibly cheaper) PEBS profiling
on modern Intel CPUs.
Ingo
> > @@ -203,8 +203,9 @@ struct perf_event_attr {
> > enable_on_exec : 1, /* next exec enables */
> > task : 1, /* trace fork/exit */
> > watermark : 1, /* wakeup_watermark */
> > + precise : 1,
>
> I think we want to default to precise events even if not specifically
> requested by user-space, in the cases where that's possible on the CPU
> without additional limitations.
>
> That way people will default to better (and possibly cheaper) PEBS profiling
> on modern Intel CPUs.
Sure, I'll look at that once it starts working :-)
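Presumably something along these lines, once it does (sketch only;
event_supports_pebs() is a made-up helper, and whether to upgrade
silently at all is the open question):

        /*
         * Hypothetical, not in the patch: upgrade to a precise counter
         * when PEBS can back this event anyway.
         */
        if (!event->attr.precise && x86_pmu.pebs &&
            event_supports_pebs(event))
                event->attr.precise = 1;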
Looks like this patch hardcodes the depth and threshold of the buffer.
I believe you need to add some flexibility in there.
You are currently only extracting IP. You need a way to extract the rest
of the recorded state. There are some useful measurements you can do
with it. I believe something like PERF_SAMPLE_REGS would work.
Part of the pt_regs are already exported by signals (sigcontext).
It should be noted that providing PERF_SAMPLE_REGS in non-PEBS
situations is also a requirement. But it needs to be clear this is the
interrupted state and not the at-overflow state.
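To make that concrete, a purely hypothetical layout -- none of these names
exist in the patch, and the interrupted-state vs. at-retirement question
would have to be encoded somewhere, e.g.:

#include <linux/types.h>

/*
 * Hypothetical: a new PERF_SAMPLE_REGS bit in attr.sample_type could
 * append this after the IP in each sample record.
 */
struct perf_sample_regs {
        __u64   source;         /* 0: interrupted pt_regs, 1: PEBS machine state */
        __u64   regs[16];       /* rax..r15 in a fixed, documented order */
};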
I do not believe substituting PEBS whenever you detect it is available AND
event supports it is a good idea. PEBS is not more precise than regular
sampling, in fact, it is statistically of poorer quality. This is due to the way
it works and it cannot be mitigated by randomization (at least with depth > 1).
The only improvement that PEBS provides is that you get an IP and the
machine state at retirement of an instruction that caused the event to
increment. Thus, the IP points to the next dynamic instruction. The instruction
is not the one that caused the P-th occurrence of the event, if you set the
period to P. It is at P+N, where N cannot be predicted and varies depending
on the event and executed code. This introduces some bias in the samples.
Given the behavior of PEBS, it would not be possible to correlate samples
obtained from two events with only one of them supporting PEBS. For instance,
if you sample on INST_RETIRED and UNHALTED_CORE_CYCLES. You
would get a PEBS profile for INST_RETIRED and a regular profile for CYCLES.
Given the skid differences, you would not be able to make fair comparisons.
The user needs to understand what is being measured.
On Tue, Feb 2, 2010 at 7:33 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Tue, 2010-02-02 at 19:26 +0100, Ingo Molnar wrote:
>> * Peter Zijlstra <pet...@infradead.org> wrote:
>
>> > @@ -203,8 +203,9 @@ struct perf_event_attr {
>> > enable_on_exec : 1, /* next exec enables */
>> > task : 1, /* trace fork/exit */
>> > watermark : 1, /* wakeup_watermark */
>> > + precise : 1,
>>
>> I think we want to default to precise events even if not specifically
>> requested by user-space, in the cases where that's possible on the CPU
>> without additional limitations.
>>
>> That way people will default to better (and possibly cheaper) PEBS profiling
>> on modern Intel CPUs.
>
> Sure, I'll look at that once it starts working :-)
>
>
--
Stephane Eranian | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00
Sure you can, just drain the buffers on context switch. (You'll see that
placing x86_pmu.drain_pebs() calls is one of the missing pieces).
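Something like the below, presumably (placement sketch only; the function
name is made up, and wiring it into the sched-out path is exactly the
missing piece):

static void x86_pmu_flush_pebs_on_sched_out(void)
{
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

        if (cpuc->ds && x86_pmu.drain_pebs)
                x86_pmu.drain_pebs(cpuc);
}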
> You are currently only extracting IP. You need a way to extract the rest
> of the recorded state. There are some useful measurements you can do
> with it. I believe something like PERF_SAMPLE_REGS would work.
> Part of the pt_regs are already exported by signals (sigcontext).
Right, hence my suggestion to add that :-)
> It should be noted that providing PERF_SAMPLE_REGS in non-PEBS
> situations is also a requirement. But it needs to be clear this is the
> interrupted state and not the at-overflow state.
Sure.
> I do not believe substituting PEBS whenever you detect it is available AND
> event supports it is a good idea. PEBS is not more precise than regular
> sampling, in fact, it is statistically of poorer quality. This is due to the way
> it works and it cannot be mitigated by randomization (at least with depth > 1).
Right, which is why I already mentioned intending to use depth == 1 for
things like the auto-freq (and possible future randomization).
> The only improvement that PEBS provides is that you get an IP and the
> machine state at retirement of an instruction that caused the event to
> increment. Thus, the IP points to the next dynamic instruction. The instruction
> is not the one that cause the P-th occurence of the event, if you set the
> period to P. It is at P+N, where N cannot be predicted and varies depending
> on the event and executed code. This introduces some bias in the samples.
I'm not sure I follow, it records the next event after overflow, doesn't
that make it P+1?
It doesn't matter how many instructions are between the P-th and P+1th
event, you're counting events.
One thing that is not quite clear to me is the influence of PEBS Trap,
IA32_PERF_CAPABILITIES[6], which says to record after the instruction
(trap-like) when set and before it (fault-like) when cleared, but then
it goes on saying the IP is always the instruction after.
If it means the register state before or after the instruction, then I
don't know why they had to mess up the IP like they do :/
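For reference, reading that bit could sit right next to the existing PEBS
format check (sketch only, not in the patch; the helper name is made up
and the bit semantics are as described above):

static int intel_pebs_is_trap_like(void)
{
        u64 capabilities;

        rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
        return (capabilities >> 6) & 1; /* IA32_PERF_CAPABILITIES[6] */
}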
> Given the behavior of PEBS, it would not be possible to correlate samples
> obtained from two events with only one of them supporting PEBS. For instance,
> if you sample on INST_RETIRED and UNHALTED_CORE_CYCLES. You
> would get a PEBS profile for INST_RETIRED and a regular profile for CYCLES.
> Given the skid differences, you would not be able to make fair comparisons.
OK, good point.
I was talking about cpu-wide mode, where you don't do anything today.
So sure, if you had drain_pebs() in the context switch out, then this will
work.
>
>> I do not believe substituting PEBS whenever you detect it is available AND
>> event supports it is a good idea. PEBS is not more precise than regular
>> sampling, in fact, it is statistically of poorer quality. This is due to the way
>> it works and it cannot be mitigated by randomization (at least with depth > 1).
>
> Right, which is why I already mentioned intending to use depth == 1 for
> things like the auto-freq (and possible future randomization).
>
okay.
>> The only improvement that PEBS provides is that you get an IP and the
>> machine state at retirement of an instruction that caused the event to
>> increment. Thus, the IP points to the next dynamic instruction. The instruction
>> is not the one that caused the P-th occurrence of the event, if you set the
>> period to P. It is at P+N, where N cannot be predicted and varies depending
>> on the event and executed code. This introduces some bias in the samples.
>
> I'm not sure I follow, it records the next event after overflow, doesn't
> that make it P+1?
>
That is not what I wrote. I did not say it records at P+1. I said it records
at P+N, where N varies from sample to sample and cannot be predicted.
N is expressed in the unit of the sampling event.
> It doesn't matter how many instructions are between the P-th and P+1th
> event, you're counting events.
>
I did not talk about instructions but occurrences of the sampling event.
> One thing that is not quite clear to me is the influence of PEBS Trap,
> IA32_PERF_CAPABILITIES[6], which says to record after the instruction
> (trap-like) when set and before it (fault-like) when cleared, but then
> it goes on saying the IP is always the instruction after.
I have never played with Trap vs. Fault; I leave it at the default.
The IP is ALWAYS the address after the sampled instruction because it
is recorded at retirement of that instruction. Same thing with the machine
state. It is the state after the instruction retired. So if it
increments a register,
you get the value after the increment.
>
> If it means the register state before or after the instruction, then I
> don't know why they had to mess up the IP like they do :/
>
>> Given the behavior of PEBS, it would not be possible to correlate samples
>> obtained from two events with only one of them supporting PEBS. For instance,
>> if you sample on INST_RETIRED and UNHALTED_CORE_CYCLES. You
>> would get a PEBS profile for INST_RETIRED and a regular profile for CYCLES.
>> Given the skid differences, you would not be able to make fair comparisons.
>
> OK, good point.
>
>
>
--
Stephane Eranian | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00
OK, so I'm confused.
The manual says it arms the PEBS assist on overflow, and the PEBS thing
will then record the next event. Which to me reads like P+1.
You're saying they're wrong and they record a random event after the
overflow?
Yes I was, ok that stinks.
If only they would reset the counter on overflow instead of on record,
that would solve quite a few issues I imagine.
Then have the IP point at the actual instruction and you've got yourself
a useful tool :-)
PEBS also gets way more interesting on Nehalem because of the
ability to capture where cache misses occur. That's the load latency
feature. You need to support that.
I believe you would need to abstract this in a generic fashion so it
could be used on other architectures, such as AMD with IBS.
On Nehalem, it requires the following:
- only works if you sample on MEM_INST_RETIRED:LATENCY_ABOVE_THRESHOLD.
- the threshold must be programmed into a dedicated MSR. The extra
difficulty is that this MSR is shared between CPUs when HT is on (a
rough sketch follows below).
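As a rough sketch of what supporting that would involve (the MSR address
and event encoding below are from the SDM, not from this patch, and the
counter mask is just guessed to match the other Nehalem PEBS entries):

#define MSR_PEBS_LD_LAT_THRESHOLD       0x000003f6

static struct event_constraint intel_nehalem_pebs_ll_events[] = {
        /* MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD: event 0x0b, umask 0x10 */
        PEBS_EVENT_CONSTRAINT(0x100b, 0xf),
        EVENT_CONSTRAINT_END
};

static void intel_pmu_pebs_set_ll_threshold(u64 cycles)
{
        /*
         * Minimum load latency (in core cycles) before a record is
         * written; the MSR is shared between HT siblings, which is
         * the sharing problem mentioned above.
         */
        wrmsrl(MSR_PEBS_LD_LAT_THRESHOLD, cycles);
}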
Simple things first. But yeah, we'll get to load-latency eventually.
> I believe you would need to abstract this in a generic fashion so it
> could be used on other architectures, such as AMD with IBS.
Right, Robert said he was working on IBS; I've still not made up my mind
on how to represent IBS properly, it's a bit of a weird thing.
> On Nehalem, it requires the following:
>
> - only works if you sample on MEM_INST_RETIRED:LATENCY_ABOVE_THRESHOLD.
Yeah, and then you get to decode the data source thingy, not really a
nice interface. Also, it mostly contains L3 information, not L2/L1.
> - the threshold must be programmed into a dedicated MSR. The extra
> difficulty is that this MSR is shared between CPUs when HT is on.
Lovely :/ One way is to program it to the lowest of the two and simply
discard events afterwards.
So I tried enabling the regular PMC overflow interrupt and reprogramming
the counter from that, but touching the counter seems to destroy the
PEBS assist, so much for that idea.
Well sure, but that's not the point. I was thinking that if we need to
do single-event PEBS anyway, we might as well try to reprogram on the
PMC overflow interrupt instead of on the PEBS overflow and curb some of
that drift.
Also, it makes keeping the event count value a lot easier. But alas.