
[PATCH] perf_events: improve x86 event scheduling (v5)


Stephane Eranian

Jan 18, 2010, 5:00:03 AM
This patch improves event scheduling by maximizing the use
of PMU registers regardless of the order in which events are
created in a group.

The algorithm takes into account the list of counter constraints
for each event. It assigns events to counters from the most
constrained, i.e., works on only one counter, to the least
constrained, i.e., works on any counter.

Intel Fixed counter events and the BTS special event are also
handled via this algorithm which is designed to be fairly generic.
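(For illustration only, not part of the patch: a stand-alone, user-space toy of
the weight-ordered assignment described above. The counter count, the constraint
masks and all names below are made up; the real implementation is
x86_schedule_events() in the diff further down.)

#include <stdint.h>
#include <stdio.h>

#define NUM_COUNTERS 4

/* made-up constraints: bitmask of counters each event may use */
static const uint64_t constraint[] = {
        0xf,    /* event 0: unconstrained, any of the 4 counters */
        0x1,    /* event 1: counter 0 only */
        0x3,    /* event 2: counter 0 or 1 */
};
#define NUM_EVENTS (int)(sizeof(constraint) / sizeof(constraint[0]))

int main(void)
{
        uint64_t used = 0;
        int assign[NUM_EVENTS];
        int w, i, j, left = NUM_EVENTS;

        /* place the most constrained (lowest weight) events first */
        for (w = 1; left && w <= NUM_COUNTERS; w++) {
                for (i = 0; i < NUM_EVENTS; i++) {
                        if (__builtin_popcountll(constraint[i]) != w)
                                continue;
                        for (j = 0; j < NUM_COUNTERS; j++)
                                if ((constraint[i] >> j & 1) && !(used >> j & 1))
                                        break;
                        if (j == NUM_COUNTERS)
                                return 1;       /* no valid assignment, give up */
                        used |= 1ULL << j;
                        assign[i] = j;
                        left--;
                }
        }
        for (i = 0; i < NUM_EVENTS; i++)
                printf("event %d -> counter %d\n", i, assign[i]);
        return 0;
}

It assigns the single-counter event first, then the counter-0/1 event, and only
then the unconstrained one, which is the point of ordering by constraint weight
(__builtin_popcountll() here is a gcc builtin standing in for bitmap_weight()).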

The patch also updates the validation of an event to use the
scheduling algorithm. This will cause early failure in
perf_event_open().

The 2nd version of this patch follows the model used
by PPC, by running the scheduling algorithm and the actual
assignment separately. Actual assignment takes place in
hw_perf_enable() whereas scheduling is implemented in
hw_perf_group_sched_in() and x86_pmu_enable().

The 3rd version does:
- correct handling of pmu->enable() error in hw_perf_group_sched_in()
- simplified event scheduling
- constraint storage in x86_schedule_events() (dynamic constraints)
- new put_event_constraints() callback to release a dynamic constraint

The 4th version does:
- remove leftover debug code in x86_perf_event_set_period()

The 5th version does:
- fix missing bitmask initialization in intel_get_event_constraints()
- use bitmap_copy() instead of memcpy() to copy bitmasks
- call pmu->disable() in x86_event_sched_out()

Signed-off-by: Stephane Eranian <era...@google.com>
--
include/asm/perf_event.h | 17 -
kernel/cpu/perf_event.c | 771 +++++++++++++++++++++++++++++++++--------------
2 files changed, 571 insertions(+), 217 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8d9f854..16beb34 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -19,6 +19,7 @@
#define MSR_ARCH_PERFMON_EVENTSEL1 0x187

#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21)
#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
@@ -26,7 +27,14 @@
/*
* Includes eventsel and unit mask as well:
*/
-#define ARCH_PERFMON_EVENT_MASK 0xffff
+
+
+#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL
+#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL
+#define INTEL_ARCH_EDGE_MASK 0x00040000ULL
+#define INTEL_ARCH_INV_MASK 0x00800000ULL
+#define INTEL_ARCH_CNT_MASK 0xFF000000ULL
+#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)

/*
* filter mask to validate fixed counter events.
@@ -37,7 +45,12 @@
* The other filters are supported by fixed counters.
* The any-thread option is supported starting with v3.
*/
-#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000
+#define INTEL_ARCH_FIXED_MASK \
+ (INTEL_ARCH_CNT_MASK| \
+ INTEL_ARCH_INV_MASK| \
+ INTEL_ARCH_EDGE_MASK|\
+ INTEL_ARCH_UNIT_MASK|\
+ INTEL_ARCH_EVENT_MASK)

#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index d616c06..03a888d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzij...@redhat.com>
* Copyright (C) 2009 Intel Corporation, <markus.t...@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
*
* For licencing details see kernel-base/COPYING
*/
@@ -68,26 +69,37 @@ struct debug_store {
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};

+#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
+
+struct event_constraint {
+ u64 idxmsk[BITS_TO_U64(X86_PMC_IDX_MAX)];
+ int code;
+ int cmask;
+};
+
struct cpu_hw_events {
- struct perf_event *events[X86_PMC_IDX_MAX];
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
int enabled;
struct debug_store *ds;
-};

-struct event_constraint {
- unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int code;
+ int n_events;
+ int n_added;
+ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+ struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
};

-#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
-#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
+#define EVENT_CONSTRAINT(c, n, m) { \
+ .code = (c), \
+ .cmask = (m), \
+ .idxmsk[0] = (n) }

-#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->idxmsk[0]; (e)++)
+#define EVENT_CONSTRAINT_END \
+ { .code = 0, .cmask = 0, .idxmsk[0] = 0 }

+#define for_each_event_constraint(e, c) \
+ for ((e) = (c); (e)->cmask; (e)++)

/*
* struct x86_pmu - generic x86 pmu
@@ -114,8 +126,9 @@ struct x86_pmu {
u64 intel_ctrl;
void (*enable_bts)(u64 config);
void (*disable_bts)(void);
- int (*get_event_idx)(struct cpu_hw_events *cpuc,
- struct hw_perf_event *hwc);
+ void (*get_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event, u64 *idxmsk);
+ void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event);
+ const struct event_constraint *event_constraints;
};

static struct x86_pmu x86_pmu __read_mostly;
@@ -124,7 +137,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};

-static const struct event_constraint *event_constraints;
+static int x86_perf_event_set_period(struct perf_event *event,
+ struct hw_perf_event *hwc, int idx);

/*
* Not sure about some of these
@@ -171,14 +185,14 @@ static u64 p6_pmu_raw_event(u64 hw_event)
return hw_event & P6_EVNTSEL_MASK;
}

-static const struct event_constraint intel_p6_event_constraints[] =
+static struct event_constraint intel_p6_event_constraints[] =
{
- EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
- EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
- EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT(0xc1, 0x1, INTEL_ARCH_EVENT_MASK), /* FLOPS */
+ EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */
+ EVENT_CONSTRAINT(0x11, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */
+ EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */
+ EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */
+ EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */
EVENT_CONSTRAINT_END
};

@@ -196,35 +210,43 @@ static const u64 intel_perfmon_event_map[] =
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
};

-static const struct event_constraint intel_core_event_constraints[] =
-{
- EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
- EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
- EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
- EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+static struct event_constraint intel_core_event_constraints[] =
+{
+ EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */
+ EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */
+ EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */
+ EVENT_CONSTRAINT(0x11, 0x2, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */
+ EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */
+ EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */
+ EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT(0x18, 0x1, INTEL_ARCH_EVENT_MASK), /* IDLE_DURING_DIV */
+ EVENT_CONSTRAINT(0x19, 0x2, INTEL_ARCH_EVENT_MASK), /* DELAYED_BYPASS */
+ EVENT_CONSTRAINT(0xa1, 0x1, INTEL_ARCH_EVENT_MASK), /* RS_UOPS_DISPATCH_CYCLES */
+ EVENT_CONSTRAINT(0xcb, 0x1, INTEL_ARCH_EVENT_MASK), /* MEM_LOAD_RETIRED */
EVENT_CONSTRAINT_END
};

-static const struct event_constraint intel_nehalem_event_constraints[] =
-{
- EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
- EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
- EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
- EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
- EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
- EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
- EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
- EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
- EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
- EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
+static struct event_constraint intel_nehalem_event_constraints[] = {
+ EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */
+ EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */
+ EVENT_CONSTRAINT(0x40, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LD */
+ EVENT_CONSTRAINT(0x41, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_ST */
+ EVENT_CONSTRAINT(0x42, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK */
+ EVENT_CONSTRAINT(0x43, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_ALL_REF */
+ EVENT_CONSTRAINT(0x4e, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_PREFETCH */
+ EVENT_CONSTRAINT(0x4c, 0x3, INTEL_ARCH_EVENT_MASK), /* LOAD_HIT_PRE */
+ EVENT_CONSTRAINT(0x51, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D */
+ EVENT_CONSTRAINT(0x52, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
+ EVENT_CONSTRAINT(0x53, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK_FB_HIT */
+ EVENT_CONSTRAINT(0xc5, 0x3, INTEL_ARCH_EVENT_MASK), /* CACHE_LOCK_CYCLES */
EVENT_CONSTRAINT_END
};

+static struct event_constraint intel_gen_event_constraints[] = {
+ EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */
+ EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -527,11 +549,11 @@ static u64 intel_pmu_raw_event(u64 hw_event)
#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL

#define CORE_EVNTSEL_MASK \
- (CORE_EVNTSEL_EVENT_MASK | \
- CORE_EVNTSEL_UNIT_MASK | \
- CORE_EVNTSEL_EDGE_MASK | \
- CORE_EVNTSEL_INV_MASK | \
- CORE_EVNTSEL_REG_MASK)
+ (INTEL_ARCH_EVTSEL_MASK | \
+ INTEL_ARCH_UNIT_MASK | \
+ INTEL_ARCH_EDGE_MASK | \
+ INTEL_ARCH_INV_MASK | \
+ INTEL_ARCH_CNT_MASK)

return hw_event & CORE_EVNTSEL_MASK;
}
@@ -1120,9 +1142,15 @@ static void amd_pmu_disable_all(void)

void hw_perf_disable(void)
{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
if (!x86_pmu_initialized())
return;
- return x86_pmu.disable_all();
+
+ if (cpuc->enabled)
+ cpuc->n_added = 0;
+
+ x86_pmu.disable_all();
}

static void p6_pmu_enable_all(void)
@@ -1189,10 +1217,229 @@ static void amd_pmu_enable_all(void)
}
}

+static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ int i, j , w, num;
+ int weight, wmax;
+ unsigned long *c;
+ u64 constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct hw_perf_event *hwc;
+
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+
+ for(i=0; i < n; i++) {
+ x86_pmu.get_event_constraints(cpuc,
+ cpuc->event_list[i],
+ constraints[i]);
+ }
+
+ /*
+ * weight = number of possible counters
+ *
+ * 1 = most constrained, only works on one counter
+ * wmax = least constrained, works on any counter
+ *
+ * assign events to counters starting with most
+ * constrained events.
+ */
+ wmax = x86_pmu.num_events;
+
+ /*
+ * when fixed event counters are present,
+ * wmax is incremented by 1 to account
+ * for one more choice
+ */
+ if (x86_pmu.num_events_fixed)
+ wmax++;
+
+ num = n;
+ for(w=1; num && w <= wmax; w++) {
+
+ /* for each event */
+ for(i=0; i < n; i++) {
+ c = (unsigned long *)constraints[i];
+ hwc = &cpuc->event_list[i]->hw;
+
+ weight = bitmap_weight(c, X86_PMC_IDX_MAX);
+ if (weight != w)
+ continue;
+
+ /*
+ * try to reuse previous assignment
+ *
+ * This is possible despite the fact that
+ * events or events order may have changed.
+ *
+ * What matters is the level of constraints
+ * of an event and this is constant for now.
+ *
+ * This is possible also because we always
+ * scan from most to least constrained. Thus,
+ * if a counter can be reused, it means no
+ * more-constrained event needed it. And
+ * next events will either compete for it
+ * (which cannot be solved anyway) or they
+ * have fewer constraints, and they can use
+ * another counter.
+ */
+ j = hwc->idx;
+ if (j != -1 && !test_bit(j, used_mask))
+ goto skip;
+
+ for_each_bit(j, c, X86_PMC_IDX_MAX) {
+ if (!test_bit(j, used_mask))
+ break;
+ }
+
+ if (j == X86_PMC_IDX_MAX)
+ break;
+skip:
+ set_bit(j, used_mask);
+
+ pr_debug("CPU%d config=0x%llx idx=%d assign=%c\n",
+ smp_processor_id(),
+ hwc->config,
+ j,
+ assign ? 'y' : 'n');
+
+ if (assign)
+ assign[i] = j;
+ num--;
+ }
+ }
+ /*
+ * scheduling failed or is just a simulation,
+ * free resources if necessary
+ */
+ if (!assign || num) {
+ for(i=0; i < n; i++) {
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
+ }
+ }
+ return num ? -ENOSPC: 0;
+}
+
+/*
+ * dogrp: true if must collect siblings events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
+{
+ struct perf_event *event;
+ int n, max_count;
+
+ max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+
+ /* current number of events already accepted */
+ n = cpuc->n_events;
+
+ if (!is_software_event(leader)) {
+ if (n >= max_count)
+ return -ENOSPC;
+ cpuc->event_list[n] = leader;
+ n++;
+ }
+ if (!dogrp)
+ return n;
+
+ list_for_each_entry(event, &leader->sibling_list, group_entry) {
+ if (is_software_event(event) ||
+ event->state == PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (n >= max_count)
+ return -ENOSPC;
+
+ cpuc->event_list[n] = event;
+ n++;
+ }
+ return n;
+}
+
+
+static inline void x86_assign_hw_event(struct perf_event *event,
+ struct hw_perf_event *hwc, int idx)
+{
+ hwc->idx = idx;
+
+ if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
+ hwc->config_base = 0;
+ hwc->event_base = 0;
+ } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that event_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->event_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ } else {
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->event_base = x86_pmu.perfctr;
+ }
+}
+
void hw_perf_enable(void)
{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int i;
+
if (!x86_pmu_initialized())
return;
+ if (cpuc->n_added) {
+ /*
+ * apply assignment obtained either from
+ * hw_perf_group_sched_in() or x86_pmu_enable()
+ *
+ * step1: save events moving to new counters
+ * step2: reprogram moved events into new counters
+ */
+ for(i=0; i < cpuc->n_events; i++) {
+
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx == -1 || hwc->idx == cpuc->assign[i])
+ continue;
+
+ x86_pmu.disable(hwc, hwc->idx);
+
+ clear_bit(hwc->idx, cpuc->active_mask);
+ barrier();
+ cpuc->events[hwc->idx] = NULL;
+
+ x86_perf_event_update(event, hwc, hwc->idx);
+
+ hwc->idx = -1;
+ }
+
+ for(i=0; i < cpuc->n_events; i++) {
+
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx == -1) {
+ x86_assign_hw_event(event, hwc, cpuc->assign[i]);
+ x86_perf_event_set_period(event, hwc, hwc->idx);
+ }
+ /*
+ * need to mark as active because x86_pmu_disable()
+ * clears active_mask and events[] yet it preserves
+ * idx
+ */
+ set_bit(hwc->idx, cpuc->active_mask);
+ cpuc->events[hwc->idx] = event;
+
+ x86_pmu.enable(hwc, hwc->idx);
+ perf_event_update_userpage(event);
+ }
+ cpuc->n_added = 0;
+ perf_events_lapic_init();
+ }
x86_pmu.enable_all();
}

@@ -1343,6 +1590,13 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
bits |= 0x2;
if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
bits |= 0x1;
+
+ /*
+ * ANY bit is supported in v3 and up
+ */
+ if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+ bits |= 0x4;
+
bits <<= (idx * 4);
mask = 0xfULL << (idx * 4);

@@ -1391,148 +1645,43 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
x86_pmu_enable_event(hwc, idx);
}

-static int fixed_mode_idx(struct hw_perf_event *hwc)
-{
- unsigned int hw_event;
-
- hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
- if (unlikely((hw_event ==
- x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
- (hwc->sample_period == 1)))
- return X86_PMC_IDX_FIXED_BTS;
-
- if (!x86_pmu.num_events_fixed)
- return -1;
-
- /*
- * fixed counters do not take all possible filters
- */
- if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
- return -1;
-
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
- return X86_PMC_IDX_FIXED_INSTRUCTIONS;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
- return X86_PMC_IDX_FIXED_CPU_CYCLES;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
- return X86_PMC_IDX_FIXED_BUS_CYCLES;
-
- return -1;
-}
-
-/*
- * generic counter allocator: get next free counter
- */
-static int
-gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- int idx;
-
- idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
- return idx == x86_pmu.num_events ? -1 : idx;
-}
-
/*
- * intel-specific counter allocator: check event constraints
- */
-static int
-intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- const struct event_constraint *event_constraint;
- int i, code;
-
- if (!event_constraints)
- goto skip;
-
- code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
-
- for_each_event_constraint(event_constraint, event_constraints) {
- if (code == event_constraint->code) {
- for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
- if (!test_and_set_bit(i, cpuc->used_mask))
- return i;
- }
- return -1;
- }
- }
-skip:
- return gen_get_event_idx(cpuc, hwc);
-}
-
-static int
-x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- int idx;
-
- idx = fixed_mode_idx(hwc);
- if (idx == X86_PMC_IDX_FIXED_BTS) {
- /* BTS is already occupied. */
- if (test_and_set_bit(idx, cpuc->used_mask))
- return -EAGAIN;
-
- hwc->config_base = 0;
- hwc->event_base = 0;
- hwc->idx = idx;
- } else if (idx >= 0) {
- /*
- * Try to get the fixed event, if that is already taken
- * then try to get a generic event:
- */
- if (test_and_set_bit(idx, cpuc->used_mask))
- goto try_generic;
-
- hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- /*
- * We set it so that event_base + idx in wrmsr/rdmsr maps to
- * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
- */
- hwc->event_base =
- MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
- hwc->idx = idx;
- } else {
- idx = hwc->idx;
- /* Try to get the previous generic event again */
- if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
-try_generic:
- idx = x86_pmu.get_event_idx(cpuc, hwc);
- if (idx == -1)
- return -EAGAIN;
-
- set_bit(idx, cpuc->used_mask);
- hwc->idx = idx;
- }
- hwc->config_base = x86_pmu.eventsel;
- hwc->event_base = x86_pmu.perfctr;
- }
-
- return idx;
-}
-
-/*
- * Find a PMC slot for the freshly enabled / scheduled in event:
+ * activate a single event
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scheduled with existing events.
+ *
+ * Called with PMU disabled. If successful and return value 1,
+ * then guaranteed to call perf_enable() and hw_perf_enable()
*/
static int x86_pmu_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
- int idx;
+ struct hw_perf_event *hwc;
+ int assign[X86_PMC_IDX_MAX];
+ int n, n0, ret;

- idx = x86_schedule_event(cpuc, hwc);
- if (idx < 0)
- return idx;
+ hwc = &event->hw;

- perf_events_lapic_init();
+ n0 = cpuc->n_events;
+ n = collect_events(cpuc, event, false);
+ if (n < 0)
+ return n;

- x86_pmu.disable(hwc, idx);
-
- cpuc->events[idx] = event;
- set_bit(idx, cpuc->active_mask);
+ ret = x86_schedule_events(cpuc, n, assign);
+ if (ret)
+ return ret;
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));

- x86_perf_event_set_period(event, hwc, idx);
- x86_pmu.enable(hwc, idx);
+ cpuc->n_events = n;
+ cpuc->n_added = n - n0;

- perf_event_update_userpage(event);
+ if (hwc->idx != -1)
+ x86_perf_event_set_period(event, hwc, hwc->idx);

return 0;
}
@@ -1576,7 +1725,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
}
- pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);

for (idx = 0; idx < x86_pmu.num_events; idx++) {
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1664,7 +1813,7 @@ static void x86_pmu_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
+ int i, idx = hwc->idx;

/*
* Must be done before we disable, otherwise the nmi handler
@@ -1690,8 +1839,19 @@ static void x86_pmu_disable(struct perf_event *event)
intel_pmu_drain_bts_buffer(cpuc);

cpuc->events[idx] = NULL;
- clear_bit(idx, cpuc->used_mask);

+ for(i=0; i < cpuc->n_events; i++) {
+ if (event == cpuc->event_list[i]) {
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
+
+ while (++i < cpuc->n_events)
+ cpuc->event_list[i-1] = cpuc->event_list[i];
+
+ --cpuc->n_events;
+ }
+ }
perf_event_update_userpage(event);
}

@@ -1962,6 +2122,176 @@ perf_event_nmi_handler(struct notifier_block *self,
return NOTIFY_STOP;
}

+static struct event_constraint bts_constraint={
+ .code = 0,
+ .cmask = 0,
+ .idxmsk[0] = 1ULL << X86_PMC_IDX_FIXED_BTS
+};
+
+static int intel_special_constraints(struct perf_event *event,
+ u64 *idxmsk)
+{
+ unsigned int hw_event;
+
+ hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+
+ if (unlikely((hw_event ==
+ x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+ (event->hw.sample_period == 1))) {
+
+ bitmap_copy((unsigned long *)idxmsk,
+ (unsigned long *)bts_constraint.idxmsk,
+ X86_PMC_IDX_MAX);
+ return 1;
+ }
+ return 0;
+}
+
+static void intel_get_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ u64 *idxmsk)
+{
+ const struct event_constraint *c;
+
+ /*
+ * cleanup bitmask
+ */
+ bitmap_zero((unsigned long *)idxmsk, X86_PMC_IDX_MAX);
+
+ if (intel_special_constraints(event, idxmsk))
+ return;
+
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if ((event->hw.config & c->cmask) == c->code) {
+
+ bitmap_copy((unsigned long *)idxmsk,
+ (unsigned long *)c->idxmsk,
+ X86_PMC_IDX_MAX);
+ return;
+ }
+ }
+ }
+ /* no constraints, means supports all generic counters */
+ bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events);
+}
+
+static void amd_get_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ u64 *idxmsk)
+{
+}
+
+static int x86_event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx, int cpu)
+{
+ int ret = 0;
+
+ event->state = PERF_EVENT_STATE_ACTIVE;
+ event->oncpu = cpu;
+ event->tstamp_running += event->ctx->time - event->tstamp_stopped;
+
+ if (is_software_event(event))
+ ret = event->pmu->enable(event);
+
+ if (!ret && !is_software_event(event))
+ cpuctx->active_oncpu++;
+
+ if (!ret && event->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ return ret;
+}
+
+static void x86_event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx, int cpu)
+{
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->oncpu = -1;
+
+ if (is_software_event(event))
+ event->pmu->disable(event);
+
+ event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu--;
+
+ if (event->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+
+/*
+ * Called to enable a whole group of events.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ *
+ * called with PMU disabled. If successful and return value 1,
+ * then guaranteed to call perf_enable() and hw_perf_enable()
+ */
+int hw_perf_group_sched_in(struct perf_event *leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct perf_event *sub;
+ int assign[X86_PMC_IDX_MAX];
+ int n0, n1, ret;
+
+ /* n0 = total number of events */
+ n0 = collect_events(cpuc, leader, true);
+ if (n0 < 0)
+ return n0;
+
+ ret = x86_schedule_events(cpuc, n0, assign);
+ if (ret)
+ return ret;
+
+ ret = x86_event_sched_in(leader, cpuctx, cpu);
+ if (ret)
+ return ret;
+
+ n1 = 1;
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ if (sub->state != PERF_EVENT_STATE_OFF) {
+ ret = x86_event_sched_in(sub, cpuctx, cpu);
+ if (ret)
+ goto undo;
+ ++n1;
+ }
+ }
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n0*sizeof(int));
+
+ cpuc->n_events = n0;
+ cpuc->n_added = n1;
+ ctx->nr_active += n1;
+
+ /*
+ * 1 means successful and events are active
+ * This is not quite true because we defer
+ * actual activation until hw_perf_enable() but
+ * this way we ensure the caller won't try to enable
+ * individual events
+ */
+ return 1;
+undo:
+ x86_event_sched_out(leader, cpuctx, cpu);
+ n0 = 1;
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+ x86_event_sched_out(sub, cpuctx, cpu);
+ if (++n0 == n1)
+ break;
+ }
+ }
+ return ret;
+}
+
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.notifier_call = perf_event_nmi_handler,
.next = NULL,
@@ -1993,7 +2323,8 @@ static __initconst struct x86_pmu p6_pmu = {
*/
.event_bits = 32,
.event_mask = (1ULL << 32) - 1,
- .get_event_idx = intel_get_event_idx,
+ .get_event_constraints = intel_get_event_constraints,
+ .event_constraints = intel_p6_event_constraints
};

static __initconst struct x86_pmu intel_pmu = {
@@ -2017,7 +2348,7 @@ static __initconst struct x86_pmu intel_pmu = {
.max_period = (1ULL << 31) - 1,
.enable_bts = intel_pmu_enable_bts,
.disable_bts = intel_pmu_disable_bts,
- .get_event_idx = intel_get_event_idx,
+ .get_event_constraints = intel_get_event_constraints
};

static __initconst struct x86_pmu amd_pmu = {
@@ -2038,7 +2369,7 @@ static __initconst struct x86_pmu amd_pmu = {
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
- .get_event_idx = gen_get_event_idx,
+ .get_event_constraints = amd_get_event_constraints
};

static __init int p6_pmu_init(void)
@@ -2051,12 +2382,9 @@ static __init int p6_pmu_init(void)
case 7:
case 8:
case 11: /* Pentium III */
- event_constraints = intel_p6_event_constraints;
- break;
case 9:
case 13:
/* Pentium M */
- event_constraints = intel_p6_event_constraints;
break;
default:
pr_cont("unsupported p6 CPU model %d ",
@@ -2121,23 +2449,29 @@ static __init int intel_pmu_init(void)
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));

+ x86_pmu.event_constraints = intel_core_event_constraints;
pr_cont("Core2 events, ");
- event_constraints = intel_core_event_constraints;
break;
- default:
case 26:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));

- event_constraints = intel_nehalem_event_constraints;
+ x86_pmu.event_constraints = intel_nehalem_event_constraints;
pr_cont("Nehalem/Corei7 events, ");
break;
case 28:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));

+ x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("Atom events, ");
break;
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
}
return 0;
}
@@ -2234,36 +2568,43 @@ static const struct pmu pmu = {
.unthrottle = x86_pmu_unthrottle,
};

-static int
-validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct hw_perf_event fake_event = event->hw;
-
- if (event->pmu && event->pmu != &pmu)
- return 0;
-
- return x86_schedule_event(cpuc, &fake_event) >= 0;
-}
-
+/*
+ * validate a single event group
+ *
+ * validation includes:
+ * - check events are compatible with each other
+ * - events do not compete for the same counter
+ * - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
static int validate_group(struct perf_event *event)
{
- struct perf_event *sibling, *leader = event->group_leader;
- struct cpu_hw_events fake_pmu;
+ struct perf_event *leader = event->group_leader;
+ struct cpu_hw_events fake_cpuc;
+ int n;

- memset(&fake_pmu, 0, sizeof(fake_pmu));
+ memset(&fake_cpuc, 0, sizeof(fake_cpuc));

- if (!validate_event(&fake_pmu, leader))
+ /*
+ * the event is not yet connected with its
+ * siblings therefore we must first collect
+ * existing siblings, then add the new event
+ * before we can simulate the scheduling
+ */
+ n = collect_events(&fake_cpuc, leader, true);
+ if (n < 0)
return -ENOSPC;

- list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
- if (!validate_event(&fake_pmu, sibling))
- return -ENOSPC;
- }
-
- if (!validate_event(&fake_pmu, event))
+ fake_cpuc.n_events = n;
+ n = collect_events(&fake_cpuc, event, false);
+ if (n < 0)
return -ENOSPC;

- return 0;
+ fake_cpuc.n_events = n;
+
+ return x86_schedule_events(&fake_cpuc, n, NULL);
}

const struct pmu *hw_perf_event_init(struct perf_event *event)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Frederic Weisbecker

Jan 18, 2010, 8:50:01 AM

Looking at all these numerous places where this whole constraint
and ad hoc scheduling machinery tries to catch up with what the
core can already do (handle non-x86 events, revert on failure,
first handle the leader, then handle the rest, etc.), I wonder
if this hw_group_sched_in() based design is a good idea.

Shouldn't we actually use the core based pmu->enable(),disable()
model called from kernel/perf_event.c:event_sched_in(),
like every other event, where we can fill up the queue of hardware
events to be scheduled, and then call a hw_check_constraints()
when we finish scheduling a group?

I guess this would simplify all this code, and avoid having it run through
the list of events, handle software events, revert partial enables
by itself, etc.

Hm?

Peter Zijlstra

Jan 18, 2010, 9:00:02 AM
On Mon, 2010-01-18 at 14:43 +0100, Frederic Weisbecker wrote:
>
> Shouldn't we actually use the core based pmu->enable(),disable()
> model called from kernel/perf_event.c:event_sched_in(),
> like every other events, where we can fill up the queue of hardware
> events to be scheduled, and then call a hw_check_constraints()
> when we finish a group scheduling?

Well the thing that makes hw_perf_group_sched_in() useful is that you
can add a bunch of events and not have to reschedule for each one, but
instead do a single schedule pass.

That said you do have a point, maybe we can express this particular
thing differently.. maybe a pre and post group call like:

void hw_perf_group_sched_in_begin(struct pmu *pmu)
int hw_perf_group_sched_in_end(struct pmu *pmu)

That way we know we need to track more state for rollback and can give
the pmu implementation leeway to delay scheduling/availability tests.

Paul, would that work for you too?

Then there's still the question of having events of multiple hw pmus in
a single group; I'd be perfectly fine with saying that's not allowed.
What do others think?

Stephane Eranian

Jan 18, 2010, 9:20:01 AM
On Mon, Jan 18, 2010 at 2:54 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Mon, 2010-01-18 at 14:43 +0100, Frederic Weisbecker wrote:
>>
>> Shouldn't we actually use the core based pmu->enable(),disable()
>> model called from kernel/perf_event.c:event_sched_in(),
>> like every other events, where we can fill up the queue of hardware
>> events to be scheduled, and then call a hw_check_constraints()
>> when we finish a group scheduling?
>
> Well the thing that makes hw_perf_group_sched_in() useful is that you
> can add a bunch of events and not have to reschedule for each one, but
> instead do a single schedule pass.
>
That's right.

> That said you do have a point, maybe we can express this particular
> thing differently.. maybe a pre and post group call like:
>
>  void hw_perf_group_sched_in_begin(struct pmu *pmu)
>  int  hw_perf_group_sched_in_end(struct pmu *pmu)
>

The issue with hw_perf_group_sched_in() is that because we do not know
when we are done scheduling, we have to defer actual activation until
hw_perf_enable(). But we still have to mark the events as ACTIVE,
otherwise things go wrong in the generic layer and for non-PMU events.
That leads to partial duplication of event_sched_in()/event_sched_out()
in the PMU specific layer.

As Frederic pointed out, the more natural way would be to simply rely
on event_sched_in()/event_sched_out() and the rollback logic and just
drop hw_perf_group_sched_in() which is there as an optimization and
not for correctness. Scheduling can be done incrementally from the
event_sched_in() function.

> That way we know we need to track more state for rollback and can give
> the pmu implementation leeway to delay scheduling/availablility tests.
>

Rollback would still be handled by the generic code, wouldn't it?

> Paul, would that work for you too?
>
> Then there's still the question of having events of multiple hw pmus in
> a single group, I'd be perfectly fine with saying that's not allowed,
> what to others think?
>

I have seen requests for measuring both core and uncore PMU events
together, for instance. It all depends on how the uncore PMU will be managed.

Peter Zijlstra

Jan 18, 2010, 9:30:02 AM
On Mon, 2010-01-18 at 15:12 +0100, Stephane Eranian wrote:
>
> > Then there's still the question of having events of multiple hw pmus in
> > a single group, I'd be perfectly fine with saying that's not allowed,
> > what to others think?
> >
> I have seen requests for measuring both core and uncore PMU events
> together for instance. It all depends on how uncore PMU will be managed.

OK, hard to correlate those two though, it would basically mean only
running that one task on the socket.

Frederic Weisbecker

Jan 18, 2010, 9:30:02 AM
On Mon, Jan 18, 2010 at 02:54:58PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 14:43 +0100, Frederic Weisbecker wrote:
> >
> > Shouldn't we actually use the core based pmu->enable(),disable()
> > model called from kernel/perf_event.c:event_sched_in(),
> > like every other events, where we can fill up the queue of hardware
> > events to be scheduled, and then call a hw_check_constraints()
> > when we finish a group scheduling?
>
> Well the thing that makes hw_perf_group_sched_in() useful is that you
> can add a bunch of events and not have to reschedule for each one, but
> instead do a single schedule pass.

Well, in appearance things go through one pass.

But actually not: there is a first iteration that collects
the events (walking through the group list, filtering soft events),
and a second iteration that checks the constraints and schedules (but
does not apply) the events.

And thereafter we schedule soft events (and revert the whole if needed).

This is one pass from group_sched_in()'s POV, but at the cost
of reimplementing what the core does wrt soft events and iterations.
And not only is it reinventing the wheel, it also produces more
iterations than we need.

If we were using the common pmu->enable() from group/event_sched_in(),
that would build the collection, with only one iteration through the
group list (instead of one to collect, and one for the software
events).

And the constraints can be validated in a second explicit iteration
through hw_check_constraint(), like it's currently done explicitly
from hw_perf_group_sched_in() that calls x86_schedule_event().

The fact is that with this patch we have a _lot_ of iterations each
time the x86 PMU gets scheduled. This is really a lot for a fast path.
But considering the dynamic cpu events / task events series
we can have, I don't see other alternatives.

But still there are wasteful iterations that can be avoided
with the above statements.


>
> That said you do have a point, maybe we can express this particular
> thing differently.. maybe a pre and post group call like:
>
> void hw_perf_group_sched_in_begin(struct pmu *pmu)
> int hw_perf_group_sched_in_end(struct pmu *pmu)
>
> That way we know we need to track more state for rollback and can give
> the pmu implementation leeway to delay scheduling/availablility tests.


Do you mean this:

	hw_perf_group_sched_in_begin(&x86_pmu);

	for_each_event(event, group) {
		event->enable();	/* do the collection here */
	}

	if (hw_perf_group_sched_in_end(&x86_pmu)) {
		rollback...
	}

That requires knowing in advance whether we have a hardware pmu
in the list though (it could be a flag in the group).
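(To make the mockup above concrete, here is a stand-alone, user-space toy of
that collect-then-check flow; struct toy_pmu, MAX_EVENTS and every function
body are invented for the example, this is not proposed kernel code:)

#include <stdio.h>

#define MAX_EVENTS 4

struct toy_pmu {
        int n_collected;        /* events queued since _begin() */
        int n_committed;        /* events known to be schedulable */
};

static void hw_perf_group_sched_in_begin(struct toy_pmu *pmu)
{
        pmu->n_collected = pmu->n_committed;    /* start a new transaction */
}

/* ->enable() only collects, it can no longer fail */
static void toy_enable(struct toy_pmu *pmu)
{
        pmu->n_collected++;
}

/* schedulability is checked once, for the whole group */
static int hw_perf_group_sched_in_end(struct toy_pmu *pmu)
{
        if (pmu->n_collected > MAX_EVENTS)
                return -1;                      /* caller rolls back */
        pmu->n_committed = pmu->n_collected;
        return 0;
}

int main(void)
{
        struct toy_pmu pmu = { 0, 0 };
        int i;

        hw_perf_group_sched_in_begin(&pmu);
        for (i = 0; i < 3; i++)
                toy_enable(&pmu);               /* collect a 3-event group */
        if (hw_perf_group_sched_in_end(&pmu))
                printf("rollback\n");
        else
                printf("group of %d events committed\n", pmu.n_committed);
        return 0;
}

The point is simply that the schedulability check moves out of every ->enable()
call into the single _end() call, so a failing group can be rolled back in one
place.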


> Then there's still the question of having events of multiple hw pmus in
> a single group, I'd be perfectly fine with saying that's not allowed,
> what to others think?


I guess we need that. It can be insteresting to couple
hardware counters with memory accesses...or whatever.
Perf stat combines cache miss counting with page faults,
cpu clock counters.
We shouldn't limit such possibilities for technical/cleanliness
reasons. We should rather adapt.

Hm?

Peter Zijlstra

Jan 18, 2010, 9:40:01 AM
On Mon, 2010-01-18 at 15:20 +0100, Frederic Weisbecker wrote:
>
>
> Well in appearance, things go through one pass.
>
> But actually not, there is a first iteration that collects
> the events (walking trhough the group list, filtering soft events),
> a second iteration that check the constraints and schedule (but
> not apply) the events.
>
> And thereafter we schedule soft events (and revert the whole if
> needed).
>
> This is a one pass from group_sched_in() POV but at the cost
> of reimplementating what the core does wrt soft events and iterations.
> And not only is it reinventing the wheel, it also produces more
> iterations than we need.
>
> If we were using the common pmu->enable() from group/event_sched_in(),
> that would build the collection, with only one iteration through the
> group list (instead of one to collect, and one for the software
> events).
>
> And the constraints can be validated in a second explicit iteration
> through hw_check_constraint(), like it's currently done explicitly
> from hw_perf_group_sched_in() that calls x86_schedule_event().

Thing is, we cannot do that, because we currently require ->enable() to
report schedulability. Now we could add an argument to ->enable, or add
callbacks like I suggested to convey that state.

> The fact is we have with this patch a _lot_ of iterations each
> time x86 get scheduled. This is really a lot for a fast path.
> But considering the dynamic cpu events / task events series
> we can have, I don't see other alternatives.

Luckily it tries to reuse the previous configuration, so in practice the
schedule phase is really quick, amortized O(1), as long as we don't change
the set.

> Do you mean this:
>
> hw_perf_group_sched_in_begin(&x86_pmu);
>
> for_each_event(event, group) {
> event->enable(); //do the collection here
> }
>
>
> if (hw_perf_group_sched_in_end(&x86_pmu)) {
> rollback...
> }
>
> That requires to know in advance if we have hardware pmu
> in the list though (can be a flag in the group).

Good point, but your proposed hw_check_constraint() call needs to know
the exact same.

Peter Zijlstra

Jan 18, 2010, 9:40:02 AM
On Mon, 2010-01-18 at 15:12 +0100, Stephane Eranian wrote:
> > That said you do have a point, maybe we can express this particular
> > thing differently.. maybe a pre and post group call like:
> >
> > void hw_perf_group_sched_in_begin(struct pmu *pmu)
> > int hw_perf_group_sched_in_end(struct pmu *pmu)
> >
> The issue with hw_perf_group_sched_in() is that because we do not know
> when we are done scheduling, we have to defer actual activation until
> hw_perf_enable(). But we have to still mark the events as ACTIVE,
> otherwise things go wrong in the generic layer and for non-PMU events.
> That leads to partial duplication of event_sched_in()/event_sched_out()
> in the PMU specific layer.
>
> As Frederic pointed out, the more natural way would be to simply rely
> on event_sched_in()/event_sched_out() and the rollback logic and just
> drop hw_perf_group_sched_in() which is there as an optimization and
> not for correctness. Scheduling can be done incrementally from the
> event_sched_in() function.
>
> > That way we know we need to track more state for rollback and can give
> > the pmu implementation leeway to delay scheduling/availablility tests.
> >
> Rollback would still be handled by the generic code, wouldn't it?

I'm not sure I understand your reply. Sure dropping
hw_perf_group_sched_in() is still correct, but it's also less optimal,
since we have to determine schedulability for each incremental event.

Peter Zijlstra

Jan 18, 2010, 9:40:02 AM
On Mon, 2010-01-18 at 15:20 +0100, Frederic Weisbecker wrote:
>
> > Then there's still the question of having events of multiple hw pmus in
> > a single group, I'd be perfectly fine with saying that's not allowed,
> > what to others think?
>
>
I guess we need that. It can be interesting to couple
> hardware counters with memory accesses...or whatever.

That really depends on how easy it is to correlate events from the
various pmus. This case could indeed do that, but the core vs uncore
thing is a lot less clear.

> Perf stat combines cache miss counting with page faults,
> cpu clock counters.

perf stat also doesn't use groups and it still works quite nicely.

> We shouldn't limit such possibilities for technical/cleanliness
> reasons. We should rather adapt.

Maybe. I'm not a very big fan of groups myself, but they are clearly
useful within a pmu, measuring cache misses against total accesses for
example, but their use across pmus is questionable.

But sure, if we can do it without too much pain, that's fine.

Frederic Weisbecker

Jan 18, 2010, 9:50:02 AM
On Mon, Jan 18, 2010 at 03:32:38PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 15:20 +0100, Frederic Weisbecker wrote:
> >
> >
> > Well in appearance, things go through one pass.
> >
> > But actually not, there is a first iteration that collects
> > the events (walking trhough the group list, filtering soft events),
> > a second iteration that check the constraints and schedule (but
> > not apply) the events.
> >
> > And thereafter we schedule soft events (and revert the whole if
> > needed).
> >
> > This is a one pass from group_sched_in() POV but at the cost
> > of reimplementating what the core does wrt soft events and iterations.
> > And not only is it reinventing the wheel, it also produces more
> > iterations than we need.
> >
> > If we were using the common pmu->enable() from group/event_sched_in(),
> > that would build the collection, with only one iteration through the
> > group list (instead of one to collect, and one for the software
> > events).
> >
> > And the constraints can be validated in a second explicit iteration
> > through hw_check_constraint(), like it's currently done explicitly
> > from hw_perf_group_sched_in() that calls x86_schedule_event().
>
> Thing is, we cannot do that, because we currently require ->enable() to
> report schedulability. Now we could add an argument to ->enable, or add
> callbacks like I suggested to convey that state.


Hmm, but the schedulability status can be overridden in this case by
the callbacks you mentioned. The thing is I'm not sure how you
mean to use these. Is it like I did in the previous mockup, by
calling hw_perf_group_sched_in_begin() at the beginning of group
scheduling and hw_perf_group_sched_in_end() at the end?


> > The fact is we have with this patch a _lot_ of iterations each
> > time x86 get scheduled. This is really a lot for a fast path.
> > But considering the dynamic cpu events / task events series
> > we can have, I don't see other alternatives.
>
> Luckily it tries to use a previous configuration, so in practise the
> schedule phase is real quick amortized O(1) as long as we don't change
> the set.

Yeah.



> > Do you mean this:
> >
> > hw_perf_group_sched_in_begin(&x86_pmu);
> >
> > for_each_event(event, group) {
> > event->enable(); //do the collection here
> > }
> >
> >
> > if (hw_perf_group_sched_in_end(&x86_pmu)) {
> > rollback...
> > }
> >
> > That requires to know in advance if we have hardware pmu
> > in the list though (can be a flag in the group).
>
> Good point, but your proposed hw_check_constraint() call needs to know
> the exact same.


True. Whatever model we use anyway, both implement the same idea.

Peter Zijlstra

Jan 18, 2010, 10:00:02 AM
On Mon, 2010-01-18 at 15:45 +0100, Frederic Weisbecker wrote:
>
> > > That requires to know in advance if we have hardware pmu
> > > in the list though (can be a flag in the group).
> >
> > Good point, but your proposed hw_check_constraint() call needs to know
> > the exact same.
>
>
> True. Whatever model we use anyway, both implement the same idea.

Hmm, we seem to already have that problem (which would indicate another
bug in the hw-breakpoint stuff), how do you deal with
hw_perf_{enable,disable}() for the breakpoints?

We could fix this by making these pmu state functions into a callback
list visiting all registered PMUs.

Frederic Weisbecker

Jan 18, 2010, 10:00:01 AM
On Mon, Jan 18, 2010 at 03:37:01PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 15:20 +0100, Frederic Weisbecker wrote:
> >
> > > Then there's still the question of having events of multiple hw pmus in
> > > a single group, I'd be perfectly fine with saying that's not allowed,
> > > what to others think?
> >
> >
> > I guess we need that. It can be insteresting to couple
> > hardware counters with memory accesses...or whatever.
>
> That really depends on how easy it is to correlate events from the
> various pmus. This case could indeed do that, but the core vs uncore
> tihng is a lot less clear.

Not sure what you both mean by this core VS uncore thing :)
Is it about hardware counters that apply to single hardware threads
or are shared among them inside the same core?



> > Perf stat combines cache miss counting with page faults,
> > cpu clock counters.
>
> perf stat also doesn't use groups and it still works quite nicely.


Ah? I thought it does.



> > We shouldn't limit such possibilities for technical/cleanliness
> > reasons. We should rather adapt.
>
> Maybe, I'm not a very big fan of groups myself, but they are clearly
> useful within a pmu, measuring cache misses through total-access for
> example, but the use between pmus is questionable.


Cross-pmu, these seem to only make sense for non-pinned groups,
if you want two non-pinned counters to be paired and not randomly
and separately scheduled.

For other cases, indeed I'm not sure it is useful :)

Peter Zijlstra

Jan 18, 2010, 10:10:01 AM
On Mon, 2010-01-18 at 15:53 +0100, Frederic Weisbecker wrote:

> Not sure what you both mean by this core VS uncore thing :)
> Is it about hardware counters that apply to single hardware threads
> or shared among them inside a same core?

Yeah, Intel has two PMUs, one per logical cpu and one per socket/node. We
currently don't support the node one.

So the thing with these socket-wide counters is that they are not able
to tell where the event originated, so relating that back to a task is
only possible when there's only one task running.

Frederic Weisbecker

Jan 18, 2010, 11:20:03 AM
On Mon, Jan 18, 2010 at 03:56:41PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 15:45 +0100, Frederic Weisbecker wrote:
> >
> > > > That requires to know in advance if we have hardware pmu
> > > > in the list though (can be a flag in the group).
> > >
> > > Good point, but your proposed hw_check_constraint() call needs to know
> > > the exact same.
> >
> >
> > True. Whatever model we use anyway, both implement the same idea.
>
> Hmm, we seem to already have that problem (which would indicate another
> bug in the hw-breakpoint stuff), how do you deal with
> hw_perf_{enable,disable}() for the breakpoints?


We don't have ordering constraints for breakpoints, only constraints
on the number of available registers.

So we check the constraints when a breakpoint is registered. The
enable/disable is then supposed to always succeed on the breakpoint
pmu, except for flexible breakpoints that may or may not make it,
but there is no need to override group scheduling for that.

Peter Zijlstra

Jan 18, 2010, 11:30:02 AM
On Mon, 2010-01-18 at 17:18 +0100, Frederic Weisbecker wrote:
> On Mon, Jan 18, 2010 at 03:56:41PM +0100, Peter Zijlstra wrote:
> > On Mon, 2010-01-18 at 15:45 +0100, Frederic Weisbecker wrote:
> > >
> > > > > That requires to know in advance if we have hardware pmu
> > > > > in the list though (can be a flag in the group).
> > > >
> > > > Good point, but your proposed hw_check_constraint() call needs to know
> > > > the exact same.
> > >
> > >
> > > True. Whatever model we use anyway, both implement the same idea.
> >
> > Hmm, we seem to already have that problem (which would indicate another
> > bug in the hw-breakpoint stuff), how do you deal with
> > hw_perf_{enable,disable}() for the breakpoints?
>
>
> We don't have ordering constraints for breakpoints, only constraints
> on the number of available registers.
>
> So we check the constraints when a breakpoint registers. The
> enable/disable then (is supposed to) always succeed on breakpoint
> pmu, except for flexible breakpoints that can make it or not,
> but no need to overwrite group scheduling for that.

hw_perf_{enable,disable} are unrelated to groups.

Frederic Weisbecker

Jan 18, 2010, 11:30:03 AM
On Mon, Jan 18, 2010 at 03:59:58PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 15:53 +0100, Frederic Weisbecker wrote:
>
> > Not sure what you both mean by this core VS uncore thing :)
> > Is it about hardware counters that apply to single hardware threads
> > or shared among them inside a same core?
>
> Yeah intel has two PMUs, one per logical cpu and one per socket/node. We
> currently don't support the node one yet.
>
> So the thing with these socket wide things is is that they are not able
> to tell where the event originated, so relating that back to a task is
> only possible when there's only one task running.
>


Ah ok.

Frederic Weisbecker

Jan 18, 2010, 12:00:01 PM
On Mon, Jan 18, 2010 at 05:26:13PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 17:18 +0100, Frederic Weisbecker wrote:
> > On Mon, Jan 18, 2010 at 03:56:41PM +0100, Peter Zijlstra wrote:
> > > On Mon, 2010-01-18 at 15:45 +0100, Frederic Weisbecker wrote:
> > > >
> > > > > > That requires to know in advance if we have hardware pmu
> > > > > > in the list though (can be a flag in the group).
> > > > >
> > > > > Good point, but your proposed hw_check_constraint() call needs to know
> > > > > the exact same.
> > > >
> > > >
> > > > True. Whatever model we use anyway, both implement the same idea.
> > >
> > > Hmm, we seem to already have that problem (which would indicate another
> > > bug in the hw-breakpoint stuff), how do you deal with
> > > hw_perf_{enable,disable}() for the breakpoints?
> >
> >
> > We don't have ordering constraints for breakpoints, only constraints
> > on the number of available registers.
> >
> > So we check the constraints when a breakpoint registers. The
> > enable/disable then (is supposed to) always succeed on breakpoint
> > pmu, except for flexible breakpoints that can make it or not,
> > but no need to overwrite group scheduling for that.
>
> hw_perf_{enable,disable} are unrelated to groups.


Right, hw_perf_enable/disable have no effect on breakpoint events.
These were somehow considered as software events until now.

That raises the question: why does perf_disable() only take care
of hardware events? Very few software events can trigger
within perf_disable()/perf_enable() sections, though.

Maybe I should handle breakpoints there.

Peter Zijlstra

Jan 18, 2010, 12:20:05 PM
On Mon, 2010-01-18 at 17:51 +0100, Frederic Weisbecker wrote:

> Right hw_perf_enable/disable have no action on breakpoint events.
> These were somehow considered as software events until now.
>
> That raises the question: why perf_disable() only takes care
> of hardware events? Very few software events can trigger
> between perf_disable() and perf_enable() sections though.
>
> May be I should handle breakpoints there.

OK, so maybe I'm not understanding the breakpoint stuff correctly, why
is it modeled as a software pmu? It has resource constraints like a
hardware pmu.

Frederic Weisbecker

Jan 18, 2010, 12:40:04 PM
On Mon, Jan 18, 2010 at 06:13:26PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 17:51 +0100, Frederic Weisbecker wrote:
>
> > Right hw_perf_enable/disable have no action on breakpoint events.
> > These were somehow considered as software events until now.
> >
> > That raises the question: why perf_disable() only takes care
> > of hardware events? Very few software events can trigger
> > between perf_disable() and perf_enable() sections though.
> >
> > May be I should handle breakpoints there.
>
> OK, so maybe I'm not understanding the breakpoint stuff correctly, why
> is it modeled as a software pmu? It has resource constraints like a
> hardware pmu.


It doesn't use the software pmu, it uses its own. But what kind
of properties can it share with other hardware events?

It has constraints that only need to be checked when we register
the event. It also has constraints at enable time, but nothing
tricky that requires overriding group scheduling.

Moreover it has no internal counters; it essentially behaves
much like a software pmu by just triggering events.

Peter Zijlstra

Jan 18, 2010, 3:10:01 PM
On Mon, 2010-01-18 at 18:29 +0100, Frederic Weisbecker wrote:
> On Mon, Jan 18, 2010 at 06:13:26PM +0100, Peter Zijlstra wrote:
> > On Mon, 2010-01-18 at 17:51 +0100, Frederic Weisbecker wrote:
> >
> > > Right hw_perf_enable/disable have no action on breakpoint events.
> > > These were somehow considered as software events until now.
> > >
> > > That raises the question: why perf_disable() only takes care
> > > of hardware events? Very few software events can trigger
> > > between perf_disable() and perf_enable() sections though.
> > >
> > > May be I should handle breakpoints there.
> >
> > OK, so maybe I'm not understanding the breakpoint stuff correctly, why
> > is it modeled as a software pmu? It has resource constraints like a
> > hardware pmu.
>
>
> It doesn't use the software pmu, it uses its own. But what kind
> of properties can it share with other hardware events?
>
> It has constraints that only need to be checked when we register
> the event. It has also constraint on enable time but nothing
> tricky that requires an overwritten group scheduling.

The only group scheduling bit is hw_perf_group_sched_in(), and I guess
you can get away without hw_perf_disable() because it doesn't generate
NMIs, although I'd have to audit the code to verify that a properly placed
breakpoint won't trip things up, since the core code basically assumes
counters won't trigger within perf_disable/enable sections.

Peter Zijlstra

Jan 19, 2010, 7:30:02 AM
On Mon, 2010-01-18 at 18:29 +0100, Frederic Weisbecker wrote:

> It has constraints that only need to be checked when we register
> the event. It also has constraints at enable time, but nothing
> tricky that requires overriding the group scheduling.

The fact that ->enable() can fail makes it a hardware counter. Software
counters cannot fail enable.

Having multiple groups of failable events (multiple hardware pmus) can
go wrong with the current core in interesting ways, look for example at
__perf_event_sched_in():

It does:

int can_add_hw = 1;

...

list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
/* Ignore events in OFF or ERROR state */
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
/*
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
if (event->cpu != -1 && event->cpu != cpu)
continue;

if (group_can_go_on(event, cpuctx, can_add_hw))
if (group_sched_in(event, cpuctx, ctx, cpu))
can_add_hw = 0;
}

Now, if you look at that logic you'll see that it assumes there's one hw
device, since it only has one can_add_hw state. So if your hw_breakpoint
pmu starts to fail we'll also stop adding counters to the cpu pmu (for
lack of a better name), and vice versa.

This might be fixable by using per-cpu struct pmu variables.
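As a rough illustration of the single can_add_hw problem, here is a small
self-contained toy (the struct names and the two-pmu setup are invented for
this sketch, not kernel code): with one schedulability flag per pmu, a
breakpoint group that fails no longer blocks later groups on the cpu pmu.

#include <stdio.h>
#include <stdbool.h>

enum toy_pmu { PMU_CPU, PMU_BP, PMU_MAX };

struct toy_group {
	enum toy_pmu pmu;	/* which failable pmu the group needs   */
	bool fits;		/* does that pmu still have room for it? */
};

/* one can_add flag per pmu instead of a single can_add_hw */
static void sched_flexible(struct toy_group *groups, int n)
{
	bool can_add[PMU_MAX] = { true, true };
	int i;

	for (i = 0; i < n; i++) {
		struct toy_group *g = &groups[i];

		if (!can_add[g->pmu])		/* only this pmu is full */
			continue;

		if (g->fits)
			printf("group %d scheduled on pmu %d\n", i, g->pmu);
		else
			can_add[g->pmu] = false; /* stop only this pmu */
	}
}

int main(void)
{
	struct toy_group groups[] = {
		{ PMU_BP,  false },	/* breakpoint pmu rejects this group */
		{ PMU_CPU, true  },	/* must still go on                  */
	};

	sched_flexible(groups, 2);
	return 0;
}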

I'm going to try and move all the weak hw_perf_* functions into struct
pmu and create a notifier like callchain for them so we can have proper
per pmu state, and then use that to fix these things up.

However I'm afraid it's far too late to push any of that into .33, which
means .33 will have rather funny behaviour once the breakpoints start
getting used.

Peter Zijlstra

unread,
Jan 19, 2010, 8:30:03 AM1/19/10
to

Hrmph, so I read some of that hw_breakpoint stuff, and now I'm sorta
confused, it looks like ->enable should never fail, but that means you
cannot overcommit breakpoints, which doesn't fit the perf model nicely.

Also, I see you set an ->unthrottle, but then don't implement it, but
comment it as todo, which is strange because that implies its broken. If
there's an ->unthrottle method it will throttle, so if its todo, the
safest thing is to not set it.

/me mutters something and goes look at something else for a while.

Frederic Weisbecker

unread,
Jan 19, 2010, 10:50:03 AM1/19/10
to


Indeed.


>
> This might be fixable by using per-cpu struct pmu variables.


Yeah, or just a mask instead of can_add_hw that can indicate that a
given pmu type can't be scheduled anymore?

We can extend struct perf_event::group_flags:

enum perf_group_flag {
	PERF_GROUP_NO_HARDWARE		= 0x1,
	PERF_GROUP_NO_BREAKPOINT	= 0x2,
	PERF_GROUP_SOFTWARE		= PERF_GROUP_NO_HARDWARE | PERF_GROUP_NO_BREAKPOINT,
};


Looks easy to maintain and to use for a quick bitwise check when
scheduling flexible groups.
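A fragment of how such flags might replace can_add_hw in the flexible-group
loop, assuming the hypothetical group_flags field and the flag values
sketched above (this only illustrates the bitwise check, it is not a tested
patch):

	int full = 0;	/* PERF_GROUP_NO_* bits of pmu classes that refused a group */

	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		/* skip groups that need a pmu class which is already full */
		if (full & ~event->group_flags & PERF_GROUP_SOFTWARE)
			continue;

		if (group_sched_in(event, cpuctx, ctx, cpu))
			/* mark the classes this group does use as full */
			full |= ~event->group_flags & PERF_GROUP_SOFTWARE;
	}

Here a group "uses" a pmu class exactly when its PERF_GROUP_NO_* bit for
that class is clear, so pure software groups (PERF_GROUP_SOFTWARE) are
never skipped.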


> However I'm afraid it's far too late to push any of that into .33, which
> means .33 will have rather funny behaviour once the breakpoints start
> getting used.


No. Because for now it is not supposed to happen that a breakpoint
can't be scheduled.

We have constraints that check whether a pinned breakpoint will
always be able to go in, on registration. Similarly we have
constraints for flexible ones, to ensure it's possible for these
to be scheduled. But these are broken because of the non-strict
ordering between pinned and non-pinned events.

Anyway, the point is that for now we treat flexible breakpoints
as if these were pinned, so a breakpoint is not supposed to
be unschedulable, ever. So .33 is not broken.

But we need to fix what you described for .34, because once we
get a strict ordering between pinned and flexible, I'm going
to reactivate the breakpoint constraints for flexibles.

Frederic Weisbecker

unread,
Jan 19, 2010, 11:00:02 AM1/19/10
to
On Tue, Jan 19, 2010 at 02:24:49PM +0100, Peter Zijlstra wrote:
> Hrmph, so I read some of that hw_breakpoint stuff, and now I'm sorta
> confused, it looks like ->enable should never fail, but that means you
> cannot overcommit breakpoints, which doesn't fit the perf model nicely.

Yeah :)
I described this in my previous mail to you. Breakpoint events, for
now, are not supposed to fail on enable().


But once we have the strict pinned -> flexible ordering,
I'll rework this.


> Also, I see you set an ->unthrottle, but then don't implement it, but
> comment it as todo, which is strange because that implies its broken. If
> there's an ->unthrottle method it will throttle, so if its todo, the
> safest thing is to not set it.


Yeah, that's because I have a too vague idea on what is the purpose
of the unthrottle() callback.

I've read the concerned codes that call this, several times, and I still
can't figure out what happens there, not sure what is meant by throttle
or unthrottle there :-/


> /me mutters something and goes look at something else for a while.


Yeah, that's still a young code that needs improvement :)

Peter Zijlstra

unread,
Jan 19, 2010, 11:40:02 AM1/19/10
to
On Tue, 2010-01-19 at 16:55 +0100, Frederic Weisbecker wrote:
> > Also, I see you set an ->unthrottle, but then don't implement it, but
> > comment it as todo, which is strange because that implies its broken. If
> > there's an ->unthrottle method it will throttle, so if its todo, the
> > safest thing is to not set it.
>
>
> Yeah, that's because I have a too vague idea on what is the purpose
> of the unthrottle() callback.
>
> I've read the concerned codes that call this, several times, and I still
> can't figure out what happens there, not sure what is meant by throttle
> or unthrottle there :-/

OK, so not setting it is relatively safe.

As to what it does, it has to undo everything you do when
perf_event_overflow() returns true, which happens when ->unthrottle is
set and we get more than sysctl_perf_event_sample_rate/HZ events in a
jiffy.

If you look at the x86 implementation, you'll see that we simply disable
the hardware counter when the overflow call returns true, so the
unthrottle() callback simply enables it again.
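To make the pairing concrete, here is a small self-contained toy (nothing
perf-specific; MAX_PER_TICK stands in for sysctl_perf_event_sample_rate/HZ):
the overflow path disables the "counter" once the per-tick budget is
exceeded, and the unthrottle path, called from the tick, simply re-enables
it, which is essentially what the x86 disable/enable pair described above
does.

#include <stdio.h>
#include <stdbool.h>

#define MAX_PER_TICK 3	/* stand-in for sysctl_perf_event_sample_rate/HZ */

struct toy_counter {
	bool enabled;
	int  overflows_this_tick;
};

/* called on every counter overflow; returns true when we must throttle */
static bool toy_overflow(struct toy_counter *c)
{
	if (++c->overflows_this_tick > MAX_PER_TICK) {
		c->enabled = false;		/* throttle: stop the counter */
		return true;
	}
	return false;
}

/* called from the tick: undo the throttle */
static void toy_unthrottle(struct toy_counter *c)
{
	c->overflows_this_tick = 0;
	c->enabled = true;			/* unthrottle: restart it */
}

int main(void)
{
	struct toy_counter c = { .enabled = true };
	int i;

	for (i = 0; i < 6; i++) {
		if (!c.enabled)
			continue;		/* throttled: events dropped */
		if (toy_overflow(&c))
			printf("overflow %d: throttled\n", i);
	}

	toy_unthrottle(&c);
	printf("tick: unthrottled, enabled=%d\n", c.enabled);
	return 0;
}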

Stephane Eranian

unread,
Jan 21, 2010, 5:10:02 AM1/21/10
to
>> > Do you mean this:
>> >
>> > hw_perf_group_sched_in_begin(&x86_pmu);
>> >
>> > for_each_event(event, group) {
>> >         event->enable();        //do the collection here
>> > }
>> >
>> >
>> > if (hw_perf_group_sched_in_end(&x86_pmu)) {
>> >         rollback...
>> > }
>> >
>> > That requires to know in advance if we have hardware pmu
>> > in the list though (can be a flag in the group).
>>

I don't think this model can work without scheduling for each event.

Imagine the situation where you have more events than you have
counters. At each tick you:
- disable all events
- rotate the list
- collect events from the list
- schedule events
- activate

Collection is the accumulation of events until you have as many as you
have counters
given you defer scheduling until the end (see loop above).

But that does not mean you can schedule what you have accumulated. And then what
do you do, i.e., rollback to what?

With incremental, you can skip a group that is conflicting with the
groups already
accumulated. What hw_perf_group_sched_in() gives you is simply a way to do
incremental on a whole event group at once.

Given the perf_event model, I believe you have no other way but to do
incremental
scheduling of events. That is the only way you guarantee you maximize the use of
the PMU. Regardless of that, the scheduling model has a bias towards smaller
and less constrained event groups.
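For illustration, here is a self-contained toy of the incremental,
most-constrained-first scheduling being argued for (counter masks and event
names are invented; the real thing is x86_schedule_events() in the patch):
events are placed in order of increasing constraint weight, and an event
that no longer fits is skipped rather than forcing a rollback of what has
already been placed.

#include <stdio.h>

#define NUM_COUNTERS 4

struct toy_event {
	const char   *name;
	unsigned int  idxmsk;	/* bitmask of counters this event can use */
};

/* number of counters an event may use: its constraint weight */
static int weight(unsigned int mask)
{
	int w = 0;

	while (mask) {
		w += mask & 1;
		mask >>= 1;
	}
	return w;
}

static void toy_schedule(struct toy_event *evts, int n)
{
	unsigned int used = 0;
	int w, i, j;

	/* most constrained (weight 1) first, least constrained last */
	for (w = 1; w <= NUM_COUNTERS; w++) {
		for (i = 0; i < n; i++) {
			if (weight(evts[i].idxmsk) != w)
				continue;

			for (j = 0; j < NUM_COUNTERS; j++) {
				if ((evts[i].idxmsk & (1u << j)) &&
				    !(used & (1u << j)))
					break;
			}

			if (j == NUM_COUNTERS) {
				printf("%s: skipped (conflict)\n",
				       evts[i].name);
				continue;	/* incremental: skip, no rollback */
			}

			used |= 1u << j;
			printf("%s: counter %d\n", evts[i].name, j);
		}
	}
}

int main(void)
{
	struct toy_event evts[] = {
		{ "A (only counter 0)", 0x1 },
		{ "B (only counter 0)", 0x1 },	/* conflicts with A, skipped */
		{ "C (counters 0-1)",   0x3 },
		{ "D (any counter)",    0xf },
	};

	toy_schedule(evts, 4);
	return 0;
}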

Peter Zijlstra

unread,
Jan 21, 2010, 5:20:01 AM1/21/10
to
On Thu, 2010-01-21 at 11:08 +0100, Stephane Eranian wrote:
> >> > Do you mean this:
> >> >
> >> > hw_perf_group_sched_in_begin(&x86_pmu);
> >> >
> >> > for_each_event(event, group) {
> >> > event->enable(); //do the collection here
> >> > }
> >> >
> >> >
> >> > if (hw_perf_group_sched_in_end(&x86_pmu)) {
> >> > rollback...
> >> > }
> >> >
> >> > That requires to know in advance if we have hardware pmu
> >> > in the list though (can be a flag in the group).
> >>
>
> I don't think this model can work without scheduling for each event.
>
> Imagine the situation where you have more events than you have
> counters. At each tick you:

No it won't indeed, but it will work for where we now use
hw_perf_group_sched_in() without having to replicate lots of code.

For the cases you mention I see no other way than to try and schedule
each event individually.

Stephane Eranian

unread,
Jan 21, 2010, 5:30:03 AM1/21/10
to
On Thu, Jan 21, 2010 at 11:11 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Thu, 2010-01-21 at 11:08 +0100, Stephane Eranian wrote:
>> >> > Do you mean this:
>> >> >
>> >> > hw_perf_group_sched_in_begin(&x86_pmu);
>> >> >
>> >> > for_each_event(event, group) {
>> >> >         event->enable();        //do the collection here
>> >> > }
>> >> >
>> >> >
>> >> > if (hw_perf_group_sched_in_end(&x86_pmu)) {
>> >> >         rollback...
>> >> > }
>> >> >
>> >> > That requires to know in advance if we have hardware pmu
>> >> > in the list though (can be a flag in the group).
>> >>
>>
>> I don't think this model can work without scheduling for each event.
>>
>> Imagine the situation where you have more events than you have
>> counters. At each tick you:
>
> No it won't indeed, but it will work for where we now use
> hw_perf_group_sched_in() without having to replicate lots of code.
>
> For the cases you mention I see no other way than to try and schedule
> each event individually.
>
Are you suggesting a speculative approach where you first try simply
accumulate then schedule and if this fails, then restart the whole
loop but this time adding and scheduling each event individually?
For groups, you'd have to fail the group if one of its events fails.

Peter Zijlstra

unread,
Jan 21, 2010, 5:30:04 AM1/21/10
to
On Thu, 2010-01-21 at 11:21 +0100, Stephane Eranian wrote:

> Are you suggesting a speculative approach where you first try simply
> accumulate then schedule and if this fails, then restart the whole
> loop but this time adding and scheduling each event individually?
> For groups, you'd have to fail the group if one of its events fails.

No, I'm only talking about groups. The complaint from frederic was that
current hw_perf_group_sched_in() implementations have to basically
replicate all of the group_sched_in() and event_sched_in() stuff, which
seems wasteful.

So I was thinking of an alternative interface that would give the same
end result but not as much code replication.

I'm now leaning towards adding a parameter to ->enable() to postpone
schedulability and add a hw_perf_validate() like call.

With that I'm also looking at what would be the sanest way to multiplex
all the current weak hw_perf* functions in the light of multiple pmu
implementations.
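Reading this as a sketch, the interface being floated might look roughly
like the fragment below; every name and signature here (the PERF_EF_COLLECT
flag, hw_perf_validate()) is hypothetical, written out only to make the
"collect on enable, validate once at the end" idea concrete. Nothing like
this exists in the tree at this point.

	/* inside group_sched_in(), sketch only */
	perf_disable();

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		/* collect only; do not touch the hardware yet */
		if (event->pmu->enable(event, PERF_EF_COLLECT))
			goto rollback;
	}

	/* one shot: can everything collected so far be scheduled? */
	if (hw_perf_validate())
		goto rollback;

	perf_enable();		/* actual assignment done in hw_perf_enable() */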

Stephane Eranian

unread,
Jan 21, 2010, 5:40:02 AM1/21/10
to
On Thu, Jan 21, 2010 at 11:28 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Thu, 2010-01-21 at 11:21 +0100, Stephane Eranian wrote:
>
>> Are you suggesting a speculative approach where you first try simply
>> accumulate then schedule and if this fails, then restart the whole
>> loop but this time adding and scheduling each event individually?
>> For groups, you'd have to fail the group if one of its events fails.
>
> No, I'm only talking about groups. The complaint from frederic was that
> current hw_perf_group_sched_in() implementations have to basically
> replicate all of the group_sched_in() and event_sched_in() stuff, which
> seems wasteful.
>
I agree about the replication problem. But this comes from the fact that
if you call hw_perf_group_sched_in() and you succeed, you want to only
execute part of what group_sched_in() normally does, namely mark the
event as active and update timing, but skip the event_sched_in() stuff,
including enable(). Actual activation is deferred until perf_enable(),
which avoids having some events actually measuring while you are still
scheduling events.
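Condensed, the two-phase flow in the patch is roughly the following
(abridged from x86_pmu_enable() and hw_perf_enable() as posted in this
thread; no new code, just the sequence):

	/* phase 1: under perf_disable(), pick counters but touch no MSRs */
	n = collect_events(cpuc, event, false);
	ret = x86_schedule_events(cpuc, n, assign);
	if (ret)
		return ret;
	memcpy(cpuc->assign, assign, n * sizeof(int));
	cpuc->n_added = n - n0;

	/* phase 2: hw_perf_enable(), called from perf_enable(), programs
	 * the assigned counters and only then starts them all */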

> So I was thinking of an alternative interface that would give the same
> end result but not as much code replication.
>
> I'm now leaning towards adding a parameter to ->enable() to postpone
> schedulability and add a hw_perf_validate() like call.
>
> With that I'm also looking at what would be the sanest way to multiplex
> all the current weak hw_perf* functions in the light of multiple pmu
> implementations.
>
>

--
Stephane Eranian | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00

Peter Zijlstra

unread,
Jan 21, 2010, 5:40:02 AM1/21/10
to
On Mon, 2010-01-18 at 10:58 +0200, Stephane Eranian wrote:
> void hw_perf_enable(void)
> {


> + cpuc->n_added = 0;
> + perf_events_lapic_init();

Just wondering, why do we need that lapic_init() there?

> + }
> x86_pmu.enable_all();

Peter Zijlstra

unread,
Jan 21, 2010, 5:50:01 AM1/21/10
to
On Thu, 2010-01-21 at 11:43 +0100, Stephane Eranian wrote:

> On Thu, Jan 21, 2010 at 11:36 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> > On Mon, 2010-01-18 at 10:58 +0200, Stephane Eranian wrote:
> >> void hw_perf_enable(void)
> >> {
> >
> >
> >> + cpuc->n_added = 0;
> >> + perf_events_lapic_init();
> >
> > Just wondering, why do we need that lapic_init() there?
> >
> I think I picked it up from x86_pmu_enable(). I don't think
> you necessarily need it here. Not clear to me why it was
> in x86_pmu_enable() to begin with.

Right, wondering about the same ;-)

> I will post a new version of the patch which fixes some bugs and
> also implements true fast path (reuse of previous assignment). It turned
> out, things were a bit more complicated than what I had in v5.

Could you post a diff against your previous version, I just picked up
-v5 and tidied a few things up and send it mingo wards (although it
appears he hasn't picked it up yet).

Frederic Weisbecker

unread,
Jan 21, 2010, 5:50:02 AM1/21/10
to
On Thu, Jan 21, 2010 at 11:08:12AM +0100, Stephane Eranian wrote:
> >> > Do you mean this:
> >> >
> >> > hw_perf_group_sched_in_begin(&x86_pmu);
> >> >
> >> > for_each_event(event, group) {
> >> >         event->enable();        //do the collection here
> >> > }
> >> >
> >> >
> >> > if (hw_perf_group_sched_in_end(&x86_pmu)) {
> >> >         rollback...
> >> > }
> >> >
> >> > That requires to know in advance if we have hardware pmu
> >> > in the list though (can be a flag in the group).
> >>
>
> I don't think this model can work without scheduling for each event.
>
> Imagine the situation where you have more events than you have
> counters. At each tick you:
> - disable all events
> - rotate the list
> - collect events from the list
> - schedule events
> - activate
>
> Collection is the accumulation of events until you have as many as you
> have counters
> given you defer scheduling until the end (see loop above).
>
> But that does not mean you can schedule what you have accumulated. And then what
> do you do, i.e., rollback to what?

If the scheduling validation fails, then you just need to rollback
the whole group.

That's sensibly what you did in your patch, right? Except the loop
is now handled by the core code.


>
> With incremental, you can skip a group that is conflicting with the
> groups already
> accumulated. What hw_perf_group_sched_in() gives you is simply a way to do
> incremental on a whole event group at once.


I don't understand why that can't be done with the above model.
In your patch we iterate through the whole group, collect events,
and schedule them.

With the above, the collection is just done on enable(), and the scheduling
is done with the new pmu callbacks.

The thing is sensibly the same, where is the obstacle?


>
> Given the perf_event model, I believe you have no other way but to do
> incremental
> scheduling of events. That is the only way you guarantee you maximize the use of
> the PMU. Regardless of that, the scheduling model has a bias towards smaller
> and less constrained event groups.


But the incremental is still the purpose of the above model. I feel
confused.

Stephane Eranian

unread,
Jan 21, 2010, 5:50:02 AM1/21/10
to
On Thu, Jan 21, 2010 at 11:36 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Mon, 2010-01-18 at 10:58 +0200, Stephane Eranian wrote:
>>  void hw_perf_enable(void)
>>  {
>
>
>> +               cpuc->n_added = 0;
>> +               perf_events_lapic_init();
>
> Just wondering, why do we need that lapic_init() there?
>
I think I picked it up from x86_pmu_enable(). I don't think
you necessarily need it here. Not clear to me why it was
in x86_pmu_enable() to begin with.

I will post a new version of the patch which fixes some bugs and
also implements true fast path (reuse of previous assignment). It turned
out, things were a bit more complicated than what I had in v5.

Stephane Eranian

unread,
Jan 21, 2010, 6:50:02 AM1/21/10
to
Ok, I think I missed where you were actually placing that loop.
So you want to do this in group_sched_in(), right?

>
> I don't understand why that can't be done with the above model.
> In your patch we iterate through the whole group, collect events,
> and schedule them.
>
> With the above, the collection is just done on enable(), and the scheduling
> is done with the new pmu callbacks.
>
> The thing is sensibly the same, where is the obstacle?
>

There is none. You've just hoisted some of the code from
hw_perf_group_sched_in().

Frederic Weisbecker

unread,
Jan 21, 2010, 7:10:01 AM1/21/10
to
On Thu, Jan 21, 2010 at 12:44:03PM +0100, Stephane Eranian wrote:
> > If the scheduling validation fails, then you just need to rollback
> > the whole group.
> >
> > That's sensibly what you did in your patch, right? Except the loop
> > is now handled by the core code.
> >
> >
> Ok, I think I missed where you were actually placing that loop.
> So you want to do this in group_sched_in(), right?

Exactly!



> >
> > I don't understand why that can't be done with the above model.
> > In your patch we iterate through the whole group, collect events,
> > and schedule them.
> >
> > With the above, the collection is just done on enable(), and the scheduling
> > is done with the new pmu callbacks.
> >
> > The thing is sensibly the same, where is the obstacle?
> >
> There is none. You've just hoisted some of the code from
> hw_perf_group_sched_in().


Exactly :)

tip-bot for Stephane Eranian

unread,
Jan 21, 2010, 9:00:02 AM1/21/10
to
Commit-ID: b27d515a49169e5e2a92d621faac761074a8c5b1
Gitweb: http://git.kernel.org/tip/b27d515a49169e5e2a92d621faac761074a8c5b1
Author: Stephane Eranian <era...@google.com>
AuthorDate: Mon, 18 Jan 2010 10:58:01 +0200
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Thu, 21 Jan 2010 13:40:41 +0100

perf: x86: Add support for the ANY bit

Propagate the ANY bit into the fixed counter config for v3 and higher.

Signed-off-by: Stephane Eranian <era...@google.com>
[a.p.zi...@chello.nl: split from larger patch]
Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
LKML-Reference: <4b5430c6.0f975e...@mx.google.com>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---
arch/x86/include/asm/perf_event.h | 1 +
arch/x86/kernel/cpu/perf_event.c | 7 +++++++
2 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8d9f854..1380367 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -19,6 +19,7 @@
#define MSR_ARCH_PERFMON_EVENTSEL1 0x187

#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21)
#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index d616c06..8c1c070 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1343,6 +1343,13 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
bits |= 0x2;
if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
bits |= 0x1;
+
+ /*
+ * ANY bit is supported in v3 and up
+ */
+ if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+ bits |= 0x4;
+
bits <<= (idx * 4);
mask = 0xfULL << (idx * 4);

Stephane Eranian

unread,
Jan 21, 2010, 9:10:02 AM1/21/10
to
On Thu, Jan 21, 2010 at 11:46 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Thu, 2010-01-21 at 11:43 +0100, Stephane Eranian wrote:
>> On Thu, Jan 21, 2010 at 11:36 AM, Peter Zijlstra <pet...@infradead.org> wrote:
>> > On Mon, 2010-01-18 at 10:58 +0200, Stephane Eranian wrote:
>> >>  void hw_perf_enable(void)
>> >>  {
>> >
>> >
>> >> +               cpuc->n_added = 0;
>> >> +               perf_events_lapic_init();
>> >
>> > Just wondering, why do we need that lapic_init() there?
>> >
>> I think I picked it up from x86_pmu_enable(). I don't think
>> you necessarily need it here. Not clear to me why it was
>> in x86_pmu_enable() to begin with.
>
> Right, wondering about the same ;-)
>
I suspect this is because on Intel, you need to re-initialize
the APIC LVT entry on PMU interrupt. You don't have to do this
on AMD.

But then, this should be done when you initialize the
PMU for the first event and on every CPU and in the Intel
interrupt handler.


>> I will post a new version of the patch which fixes some bugs and
>> also implements true fast path (reuse of previous assignment). It turned
>> out, things were a bit more complicated than what I had in v5.
>
> Could you post a diff against your previous version, I just picked up
> -v5 and tidied a few things up and send it mingo wards (although it
> appears he hasn't picked it up yet).
>

Will do that shortly.

tip-bot for Stephane Eranian

unread,
Jan 29, 2010, 4:30:03 AM1/29/10
to
Commit-ID: 1da53e023029c067ba1277a33038c65d6e4c99b3
Gitweb: http://git.kernel.org/tip/1da53e023029c067ba1277a33038c65d6e4c99b3

Author: Stephane Eranian <era...@google.com>
AuthorDate: Mon, 18 Jan 2010 10:58:01 +0200
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Fri, 29 Jan 2010 09:01:33 +0100

perf_events, x86: Improve x86 event scheduling

This patch improves event scheduling by maximizing the use of PMU
registers regardless of the order in which events are created in a group.

The algorithm takes into account the list of counter constraints for each
event. It assigns events to counters from the most constrained, i.e.,
works on only one counter, to the least constrained, i.e., works on any
counter.

Intel Fixed counter events and the BTS special event are also handled via
this algorithm which is designed to be fairly generic.

The patch also updates the validation of an event to use the scheduling
algorithm. This will cause early failure in perf_event_open().

The 2nd version of this patch follows the model used by PPC, by running
the scheduling algorithm and the actual assignment separately. Actual
assignment takes place in hw_perf_enable() whereas scheduling is
implemented in hw_perf_group_sched_in() and x86_pmu_enable().

Signed-off-by: Stephane Eranian <era...@google.com>
[ fixup whitespace and style nits as well as adding is_x86_event() ]
Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Arnaldo Carvalho de Melo <ac...@redhat.com>
Cc: Frederic Weisbecker <fwei...@gmail.com>


LKML-Reference: <4b5430c6.0f975e...@mx.google.com>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

arch/x86/include/asm/perf_event.h | 16 +-
arch/x86/kernel/cpu/perf_event.c | 775 +++++++++++++++++++++++++++----------
2 files changed, 574 insertions(+), 217 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8d9f854..dbc0826 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -26,7 +26,14 @@
/*
* Includes eventsel and unit mask as well:
*/
-#define ARCH_PERFMON_EVENT_MASK 0xffff
+
+
+#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL
+#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL
+#define INTEL_ARCH_EDGE_MASK 0x00040000ULL
+#define INTEL_ARCH_INV_MASK 0x00800000ULL
+#define INTEL_ARCH_CNT_MASK 0xFF000000ULL
+#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)

/*
* filter mask to validate fixed counter events.
@@ -37,7 +44,12 @@
* The other filters are supported by fixed counters.
* The any-thread option is supported starting with v3.
*/
-#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000
+#define INTEL_ARCH_FIXED_MASK \
+ (INTEL_ARCH_CNT_MASK| \
+ INTEL_ARCH_INV_MASK| \
+ INTEL_ARCH_EDGE_MASK|\
+ INTEL_ARCH_UNIT_MASK|\
+ INTEL_ARCH_EVENT_MASK)

#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed1998b..995ac4a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzij...@redhat.com>
* Copyright (C) 2009 Intel Corporation, <markus.t...@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
*
* For licencing details see kernel-base/COPYING
*/
@@ -68,26 +69,37 @@ struct debug_store {
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};

+#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
+
+struct event_constraint {
+ u64 idxmsk[BITS_TO_U64(X86_PMC_IDX_MAX)];
+ int code;
+ int cmask;
+};
+
struct cpu_hw_events {
- struct perf_event *events[X86_PMC_IDX_MAX];
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
int enabled;
struct debug_store *ds;
-};

-struct event_constraint {
- unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int code;
+ int n_events;
+ int n_added;
+ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+ struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
};

-#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
-#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
+#define EVENT_CONSTRAINT(c, n, m) { \
+ .code = (c), \
+ .cmask = (m), \
+ .idxmsk[0] = (n) }

-#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->idxmsk[0]; (e)++)
+#define EVENT_CONSTRAINT_END \
+ { .code = 0, .cmask = 0, .idxmsk[0] = 0 }

+#define for_each_event_constraint(e, c) \
+ for ((e) = (c); (e)->cmask; (e)++)

/*
* struct x86_pmu - generic x86 pmu
@@ -114,8 +126,9 @@ struct x86_pmu {
u64 intel_ctrl;
void (*enable_bts)(u64 config);
void (*disable_bts)(void);
- int (*get_event_idx)(struct cpu_hw_events *cpuc,
- struct hw_perf_event *hwc);
+ void (*get_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event, u64 *idxmsk);
+ void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event);
+ const struct event_constraint *event_constraints;
};

static struct x86_pmu x86_pmu __read_mostly;
@@ -124,7 +137,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};

-static const struct event_constraint *event_constraints;
+static int x86_perf_event_set_period(struct perf_event *event,
+ struct hw_perf_event *hwc, int idx);

/*
* Not sure about some of these
@@ -171,14 +185,14 @@ static u64 p6_pmu_raw_event(u64 hw_event)
return hw_event & P6_EVNTSEL_MASK;
}

-static const struct event_constraint intel_p6_event_constraints[] =
+static struct event_constraint intel_p6_event_constraints[] =
{
- EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
- EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
- EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT(0xc1, 0x1, INTEL_ARCH_EVENT_MASK), /* FLOPS */
+ EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */
+ EVENT_CONSTRAINT(0x11, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */
+ EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */
+ EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */
+ EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */
EVENT_CONSTRAINT_END
};

@@ -196,32 +210,43 @@ static const u64 intel_perfmon_event_map[] =
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
};

-static const struct event_constraint intel_core_event_constraints[] =
-{
- EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
- EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
- EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
- EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+static struct event_constraint intel_core_event_constraints[] =
+{
+ EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */
+ EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */
+ EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */
+ EVENT_CONSTRAINT(0x11, 0x2, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */
+ EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */
+ EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */
+ EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT(0x18, 0x1, INTEL_ARCH_EVENT_MASK), /* IDLE_DURING_DIV */
+ EVENT_CONSTRAINT(0x19, 0x2, INTEL_ARCH_EVENT_MASK), /* DELAYED_BYPASS */
+ EVENT_CONSTRAINT(0xa1, 0x1, INTEL_ARCH_EVENT_MASK), /* RS_UOPS_DISPATCH_CYCLES */
+ EVENT_CONSTRAINT(0xcb, 0x1, INTEL_ARCH_EVENT_MASK), /* MEM_LOAD_RETIRED */
EVENT_CONSTRAINT_END
};

-static const struct event_constraint intel_nehalem_event_constraints[] =
-{
- EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
- EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
- EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
- EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
- EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
- EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
- EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
- EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
- EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
- EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
+static struct event_constraint intel_nehalem_event_constraints[] =
+{
+ EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */
+ EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */
+ EVENT_CONSTRAINT(0x40, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LD */
+ EVENT_CONSTRAINT(0x41, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_ST */
+ EVENT_CONSTRAINT(0x42, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK */
+ EVENT_CONSTRAINT(0x43, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_ALL_REF */
+ EVENT_CONSTRAINT(0x4e, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_PREFETCH */
+ EVENT_CONSTRAINT(0x4c, 0x3, INTEL_ARCH_EVENT_MASK), /* LOAD_HIT_PRE */
+ EVENT_CONSTRAINT(0x51, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D */
+ EVENT_CONSTRAINT(0x52, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
+ EVENT_CONSTRAINT(0x53, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK_FB_HIT */
+ EVENT_CONSTRAINT(0xc5, 0x3, INTEL_ARCH_EVENT_MASK), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] =
+{
+ EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */
+ EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */
EVENT_CONSTRAINT_END
};

@@ -527,11 +552,11 @@ static u64 intel_pmu_raw_event(u64 hw_event)
#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL

#define CORE_EVNTSEL_MASK \
- (CORE_EVNTSEL_EVENT_MASK | \
- CORE_EVNTSEL_UNIT_MASK | \
- CORE_EVNTSEL_EDGE_MASK | \
- CORE_EVNTSEL_INV_MASK | \
- CORE_EVNTSEL_REG_MASK)
+ (INTEL_ARCH_EVTSEL_MASK | \
+ INTEL_ARCH_UNIT_MASK | \
+ INTEL_ARCH_EDGE_MASK | \
+ INTEL_ARCH_INV_MASK | \
+ INTEL_ARCH_CNT_MASK)

return hw_event & CORE_EVNTSEL_MASK;
}
@@ -1120,9 +1145,15 @@ static void amd_pmu_disable_all(void)

void hw_perf_disable(void)
{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
if (!x86_pmu_initialized())
return;
- return x86_pmu.disable_all();
+
+ if (cpuc->enabled)
+ cpuc->n_added = 0;
+
+ x86_pmu.disable_all();
}

static void p6_pmu_enable_all(void)
@@ -1189,10 +1220,237 @@ static void amd_pmu_enable_all(void)
}
}

+static const struct pmu pmu;
+
+static inline int is_x86_event(struct perf_event *event)
+{
+ return event->pmu == &pmu;
+}
+
+static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ int i, j , w, num;
+ int weight, wmax;
+ unsigned long *c;
+ u64 constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct hw_perf_event *hwc;
+
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+
+ for (i = 0; i < n; i++) {
+ x86_pmu.get_event_constraints(cpuc,
+ cpuc->event_list[i],
+ constraints[i]);
+ }
+
+ /*
+ * weight = number of possible counters
+ *
+ * 1 = most constrained, only works on one counter
+ * wmax = least constrained, works on any counter
+ *
+ * assign events to counters starting with most
+ * constrained events.
+ */
+ wmax = x86_pmu.num_events;
+
+ /*
+ * when fixed event counters are present,
+ * wmax is incremented by 1 to account
+ * for one more choice
+ */
+ if (x86_pmu.num_events_fixed)
+ wmax++;
+
+ num = n;
+ for (w = 1; num && w <= wmax; w++) {
+ /* for each event */
+ for (i = 0; i < n; i++) {
+ c = (unsigned long *)constraints[i];
+ hwc = &cpuc->event_list[i]->hw;
+
+ weight = bitmap_weight(c, X86_PMC_IDX_MAX);
+ if (weight != w)
+ continue;
+
+ /*
+ * try to reuse previous assignment
+ *
+ * This is possible despite the fact that
+ * events or events order may have changed.
+ *
+ * What matters is the level of constraints
+ * of an event and this is constant for now.
+ *
+ * This is possible also because we always
+ * scan from most to least constrained. Thus,
+ * if a counter can be reused, it means no more
+ * constrained events needed it. And
+ * next events will either compete for it
+ * (which cannot be solved anyway) or they
+ * have fewer constraints, and they can use
+ * another counter.
+ */
+ j = hwc->idx;
+ if (j != -1 && !test_bit(j, used_mask))
+ goto skip;
+
+ for_each_bit(j, c, X86_PMC_IDX_MAX) {
+ if (!test_bit(j, used_mask))
+ break;
+ }
+
+ if (j == X86_PMC_IDX_MAX)
+ break;
+skip:
+ set_bit(j, used_mask);
+
+#if 0
+ pr_debug("CPU%d config=0x%llx idx=%d assign=%c\n",
+ smp_processor_id(),
+ hwc->config,
+ j,
+ assign ? 'y' : 'n');
+#endif
+
+ if (assign)
+ assign[i] = j;
+ num--;
+ }
+ }
+ /*
+ * scheduling failed or is just a simulation,
+ * free resources if necessary
+ */
+ if (!assign || num) {
+ for (i = 0; i < n; i++) {
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
+ }
+ }
+ return num ? -ENOSPC : 0;
+}
+
+/*
+ * dogrp: true if must collect siblings events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
+{
+ struct perf_event *event;
+ int n, max_count;
+
+ max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+
+ /* current number of events already accepted */
+ n = cpuc->n_events;
+
+ if (is_x86_event(leader)) {
+ if (n >= max_count)
+ return -ENOSPC;
+ cpuc->event_list[n] = leader;
+ n++;
+ }
+ if (!dogrp)
+ return n;
+
+ list_for_each_entry(event, &leader->sibling_list, group_entry) {
+ if (!is_x86_event(event) ||
+ event->state == PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (n >= max_count)
+ return -ENOSPC;
+
+ cpuc->event_list[n] = event;
+ n++;
+ }
+ return n;
+}
+
+
+static inline void x86_assign_hw_event(struct perf_event *event,
+ struct hw_perf_event *hwc, int idx)
+{
+ hwc->idx = idx;
+
+ if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
+ hwc->config_base = 0;
+ hwc->event_base = 0;
+ } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that event_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->event_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ } else {
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->event_base = x86_pmu.perfctr;
+ }
+}
+
void hw_perf_enable(void)
{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int i;
+
if (!x86_pmu_initialized())
return;
+ if (cpuc->n_added) {
+ /*
+ * apply assignment obtained either from
+ * hw_perf_group_sched_in() or x86_pmu_enable()
+ *
+ * step1: save events moving to new counters
+ * step2: reprogram moved events into new counters
+ */
+ for (i = 0; i < cpuc->n_events; i++) {
+
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx == -1 || hwc->idx == cpuc->assign[i])
+ continue;
+
+ x86_pmu.disable(hwc, hwc->idx);
+
+ clear_bit(hwc->idx, cpuc->active_mask);
+ barrier();
+ cpuc->events[hwc->idx] = NULL;
+
+ x86_perf_event_update(event, hwc, hwc->idx);
+
+ hwc->idx = -1;
+ }
+
+ for (i = 0; i < cpuc->n_events; i++) {
+
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx == -1) {
+ x86_assign_hw_event(event, hwc, cpuc->assign[i]);
+ x86_perf_event_set_period(event, hwc, hwc->idx);
+ }
+ /*
+ * need to mark as active because x86_pmu_disable()
+ * clear active_mask and eventsp[] yet it preserves
+ * idx
+ */
+ set_bit(hwc->idx, cpuc->active_mask);
+ cpuc->events[hwc->idx] = event;
+
+ x86_pmu.enable(hwc, hwc->idx);
+ perf_event_update_userpage(event);
+ }
+ cpuc->n_added = 0;
+ perf_events_lapic_init();
+ }
x86_pmu.enable_all();
}

@@ -1391,148 +1649,43 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
x86_pmu_enable_event(hwc, idx);
}

-static int fixed_mode_idx(struct hw_perf_event *hwc)
-{
- unsigned int hw_event;
-
- hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
- if (unlikely((hw_event ==
- x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
- (hwc->sample_period == 1)))
- return X86_PMC_IDX_FIXED_BTS;
-
- if (!x86_pmu.num_events_fixed)
- return -1;
-
- /*
- * fixed counters do not take all possible filters
- */
- if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
- return -1;
-
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
- return X86_PMC_IDX_FIXED_INSTRUCTIONS;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
- return X86_PMC_IDX_FIXED_CPU_CYCLES;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
- return X86_PMC_IDX_FIXED_BUS_CYCLES;
-
- return -1;
-}
-
-/*
- * generic counter allocator: get next free counter
- */
-static int
-gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- int idx;
-
- idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
- return idx == x86_pmu.num_events ? -1 : idx;
-}
-
/*
- * intel-specific counter allocator: check event constraints
- */
-static int
-intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- const struct event_constraint *event_constraint;
- int i, code;
-
- if (!event_constraints)
- goto skip;
-
- code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
-
- for_each_event_constraint(event_constraint, event_constraints) {
- if (code == event_constraint->code) {
- for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
- if (!test_and_set_bit(i, cpuc->used_mask))
- return i;
- }
- return -1;
- }
- }
-skip:
- return gen_get_event_idx(cpuc, hwc);
-}
-
-static int
-x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- int idx;
-
- idx = fixed_mode_idx(hwc);
- if (idx == X86_PMC_IDX_FIXED_BTS) {
- /* BTS is already occupied. */
- if (test_and_set_bit(idx, cpuc->used_mask))
- return -EAGAIN;
-
- hwc->config_base = 0;
- hwc->event_base = 0;
- hwc->idx = idx;
- } else if (idx >= 0) {
- /*
- * Try to get the fixed event, if that is already taken
- * then try to get a generic event:
- */
- if (test_and_set_bit(idx, cpuc->used_mask))
- goto try_generic;
-
- hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- /*
- * We set it so that event_base + idx in wrmsr/rdmsr maps to
- * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
- */
- hwc->event_base =
- MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
- hwc->idx = idx;
- } else {
- idx = hwc->idx;
- /* Try to get the previous generic event again */
- if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
-try_generic:
- idx = x86_pmu.get_event_idx(cpuc, hwc);
- if (idx == -1)
- return -EAGAIN;
-
- set_bit(idx, cpuc->used_mask);
- hwc->idx = idx;
- }
- hwc->config_base = x86_pmu.eventsel;
- hwc->event_base = x86_pmu.perfctr;
- }
-
- return idx;
-}
-
-/*
- * Find a PMC slot for the freshly enabled / scheduled in event:
+ * activate a single event
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scheduled with existing events.
+ *
+ * Called with PMU disabled. If successful and return value 1,
+ * then guaranteed to call perf_enable() and hw_perf_enable()
*/
static int x86_pmu_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
- int idx;
+ struct hw_perf_event *hwc;
+ int assign[X86_PMC_IDX_MAX];
+ int n, n0, ret;

- idx = x86_schedule_event(cpuc, hwc);
- if (idx < 0)
- return idx;
+ hwc = &event->hw;

- perf_events_lapic_init();
+ n0 = cpuc->n_events;
+ n = collect_events(cpuc, event, false);
+ if (n < 0)
+ return n;

- x86_pmu.disable(hwc, idx);
-
- cpuc->events[idx] = event;
- set_bit(idx, cpuc->active_mask);
+ ret = x86_schedule_events(cpuc, n, assign);
+ if (ret)
+ return ret;
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));

- x86_perf_event_set_period(event, hwc, idx);
- x86_pmu.enable(hwc, idx);
+ cpuc->n_events = n;
+ cpuc->n_added = n - n0;

- perf_event_update_userpage(event);
+ if (hwc->idx != -1)
+ x86_perf_event_set_period(event, hwc, hwc->idx);

return 0;
}
@@ -1576,7 +1729,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
}
- pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);

for (idx = 0; idx < x86_pmu.num_events; idx++) {
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1664,7 +1817,7 @@ static void x86_pmu_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
+ int i, idx = hwc->idx;

/*
* Must be done before we disable, otherwise the nmi handler
@@ -1690,8 +1843,19 @@ static void x86_pmu_disable(struct perf_event *event)
intel_pmu_drain_bts_buffer(cpuc);

cpuc->events[idx] = NULL;
- clear_bit(idx, cpuc->used_mask);

+ for (i = 0; i < cpuc->n_events; i++) {
+ if (event == cpuc->event_list[i]) {
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
+
+ while (++i < cpuc->n_events)
+ cpuc->event_list[i-1] = cpuc->event_list[i];
+
+ --cpuc->n_events;
+ }
+ }
perf_event_update_userpage(event);
}

@@ -1962,6 +2126,176 @@ perf_event_nmi_handler(struct notifier_block *self,
return NOTIFY_STOP;
}

+static struct event_constraint bts_constraint = {
+ .code = 0,
+ .cmask = 0,
+ .idxmsk[0] = 1ULL << X86_PMC_IDX_FIXED_BTS
+};
+
+static int intel_special_constraints(struct perf_event *event,
+ u64 *idxmsk)
+{
+ unsigned int hw_event;
+
+ hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+
+ if (unlikely((hw_event ==
+ x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+ (event->hw.sample_period == 1))) {
+
+ bitmap_copy((unsigned long *)idxmsk,
+ (unsigned long *)bts_constraint.idxmsk,
+ X86_PMC_IDX_MAX);
+ return 1;
+ }
+ return 0;
+}
+
+static void intel_get_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ u64 *idxmsk)
+{
+ const struct event_constraint *c;
+
+ /*
+ * cleanup bitmask
+ */
+ bitmap_zero((unsigned long *)idxmsk, X86_PMC_IDX_MAX);
+
+ if (intel_special_constraints(event, idxmsk))
+ return;
+
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if ((event->hw.config & c->cmask) == c->code) {
+
+ bitmap_copy((unsigned long *)idxmsk,
+ (unsigned long *)c->idxmsk,
+ X86_PMC_IDX_MAX);
+ return;
+ }
+ }
+ }
+ /* no constraints, means supports all generic counters */
+ bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events);
+}
+
+static void amd_get_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ u64 *idxmsk)
+{
+}
+
+static int x86_event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx, int cpu)
+{
+ int ret = 0;
+
+ event->state = PERF_EVENT_STATE_ACTIVE;
+ event->oncpu = cpu;
+ event->tstamp_running += event->ctx->time - event->tstamp_stopped;
+
+ if (!is_x86_event(event))
+ ret = event->pmu->enable(event);
+
+ if (!ret && !is_software_event(event))
+ cpuctx->active_oncpu++;
+
+ if (!ret && event->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ return ret;
+}
+
+static void x86_event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx, int cpu)
+{
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->oncpu = -1;
+
+ if (!is_x86_event(event))
+ event->pmu->disable(event);
+
+ event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu--;
+
+ if (event->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+
+/*
+ * Called to enable a whole group of events.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ *
+ * called with PMU disabled. If successful and return value 1,
+ * then guaranteed to call perf_enable() and hw_perf_enable()
+ */
+int hw_perf_group_sched_in(struct perf_event *leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct perf_event *sub;
+ int assign[X86_PMC_IDX_MAX];
+ int n0, n1, ret;
+
+ /* n0 = total number of events */
+ n0 = collect_events(cpuc, leader, true);
+ if (n0 < 0)
+ return n0;
+
+ ret = x86_schedule_events(cpuc, n0, assign);
+ if (ret)
+ return ret;
+
+ ret = x86_event_sched_in(leader, cpuctx, cpu);
+ if (ret)
+ return ret;
+
+ n1 = 1;
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ if (sub->state != PERF_EVENT_STATE_OFF) {
+ ret = x86_event_sched_in(sub, cpuctx, cpu);
+ if (ret)
+ goto undo;
+ ++n1;
+ }
+ }
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n0*sizeof(int));
+
+ cpuc->n_events = n0;
+ cpuc->n_added = n1;
+ ctx->nr_active += n1;
+
+ /*
+ * 1 means successful and events are active
+ * This is not quite true because we defer
+ * actual activation until hw_perf_enable() but
+ * this way we ensure caller won't try to enable
+ * individual events
+ */
+ return 1;
+undo:
+ x86_event_sched_out(leader, cpuctx, cpu);
+ n0 = 1;
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+ x86_event_sched_out(sub, cpuctx, cpu);
+ if (++n0 == n1)
+ break;
+ }
+ }
+ return ret;
+}
+
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.notifier_call = perf_event_nmi_handler,
.next = NULL,
@@ -1993,7 +2327,8 @@ static __initconst struct x86_pmu p6_pmu = {
*/
.event_bits = 32,
.event_mask = (1ULL << 32) - 1,
- .get_event_idx = intel_get_event_idx,
+ .get_event_constraints = intel_get_event_constraints,
+ .event_constraints = intel_p6_event_constraints
};

static __initconst struct x86_pmu intel_pmu = {
@@ -2017,7 +2352,7 @@ static __initconst struct x86_pmu intel_pmu = {
.max_period = (1ULL << 31) - 1,
.enable_bts = intel_pmu_enable_bts,
.disable_bts = intel_pmu_disable_bts,
- .get_event_idx = intel_get_event_idx,
+ .get_event_constraints = intel_get_event_constraints
};

static __initconst struct x86_pmu amd_pmu = {
@@ -2038,7 +2373,7 @@ static __initconst struct x86_pmu amd_pmu = {
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
- .get_event_idx = gen_get_event_idx,
+ .get_event_constraints = amd_get_event_constraints
};

static __init int p6_pmu_init(void)
@@ -2051,12 +2386,9 @@ static __init int p6_pmu_init(void)
case 7:
case 8:
case 11: /* Pentium III */
- event_constraints = intel_p6_event_constraints;
- break;
case 9:
case 13:
/* Pentium M */
- event_constraints = intel_p6_event_constraints;
break;
default:
pr_cont("unsupported p6 CPU model %d ",
@@ -2121,23 +2453,29 @@ static __init int intel_pmu_init(void)
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));

+ x86_pmu.event_constraints = intel_core_event_constraints;
pr_cont("Core2 events, ");
- event_constraints = intel_core_event_constraints;
break;
- default:
case 26:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));

- event_constraints = intel_nehalem_event_constraints;
+ x86_pmu.event_constraints = intel_nehalem_event_constraints;
pr_cont("Nehalem/Corei7 events, ");
break;
case 28:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));

+ x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("Atom events, ");
break;
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
}
return 0;
}
@@ -2234,36 +2572,43 @@ static const struct pmu pmu = {
.unthrottle = x86_pmu_unthrottle,
};

-static int
-validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct hw_perf_event fake_event = event->hw;
-
- if (event->pmu && event->pmu != &pmu)
- return 0;
-
- return x86_schedule_event(cpuc, &fake_event) >= 0;
-}
-
+/*
+ * validate a single event group
+ *
+ * validation includes:
+ * - check events are compatible with each other
+ * - events do not compete for the same counter
+ * - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
static int validate_group(struct perf_event *event)
{
- struct perf_event *sibling, *leader = event->group_leader;
- struct cpu_hw_events fake_pmu;
+ struct perf_event *leader = event->group_leader;
+ struct cpu_hw_events fake_cpuc;
+ int n;

- memset(&fake_pmu, 0, sizeof(fake_pmu));
+ memset(&fake_cpuc, 0, sizeof(fake_cpuc));

- if (!validate_event(&fake_pmu, leader))
+ /*
+ * the event is not yet connected with its
+ * siblings therefore we must first collect
+ * existing siblings, then add the new event
+ * before we can simulate the scheduling
+ */
+ n = collect_events(&fake_cpuc, leader, true);
+ if (n < 0)
return -ENOSPC;

- list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
- if (!validate_event(&fake_pmu, sibling))
- return -ENOSPC;
- }
-
- if (!validate_event(&fake_pmu, event))
+ fake_cpuc.n_events = n;
+ n = collect_events(&fake_cpuc, event, false);
+ if (n < 0)
return -ENOSPC;

- return 0;
+ fake_cpuc.n_events = n;
+
+ return x86_schedule_events(&fake_cpuc, n, NULL);
}

const struct pmu *hw_perf_event_init(struct perf_event *event)

Frederic Weisbecker

unread,
Feb 27, 2010, 12:40:03 PM2/27/10
to
2010/1/19 Peter Zijlstra <pet...@infradead.org>:

> On Tue, 2010-01-19 at 16:55 +0100, Frederic Weisbecker wrote:
>> > Also, I see you set an ->unthrottle, but then don't implement it, but
>> > comment it as todo, which is strange because that implies its broken. If
>> > there's an ->unthrottle method it will throttle, so if its todo, the
>> > safest thing is to not set it.
>>
>>
>> Yeah, that's because I have a too vague idea on what is the purpose
>> of the unthrottle() callback.
>>
>> I've read the concerned codes that call this, several times, and I still
>> can't figure out what happens there, not sure what is meant by throttle
>> or unthrottle there :-/
>
> OK, so not setting it is relatively safe.
>
> As to what it does, it has to undo everything you do when
> perf_event_overflow() returns true, which happens when ->unthrottle is
> set and we get more than sysctl_perf_event_sample_rate/HZ events in a
> jiffy.
>
> If you look at the x86 implementation, you'll see that we simply disable
> the hardware counter when the overflow call returns true, so the
> unthrottle() callback simply enables it again.


Ok, I understand better now what "throttle" and "unthrottle" mean here.

But looking at what happens when we reach an event storm point, the
pending and subsequent overflows until the next tick are just cancelled
and that's it (as it's using the software overflow handler, not the x86
one). So we throttle, which is what we want in order to avoid a hang.
If I remove my unthrottle stub, it won't throttle anymore.

Theoretically, the best would be to implement a true unthrottle
callback, which would just clear dr7, really trivial. But I don't like
this, as ptrace events must always be reported. A userspace app could
be polling a watchpointed variable, causing a lot of events; maybe that
could trigger a storm and then perf would throttle. Sure, this is not
supposed to happen, given the fact that ptrace breakpoints are signaled
to a debugger, which reduces the possible storm window. But just imagine
a silly user sets sysctl_perf_event_sample_rate to 1: then we would
throttle at each event, breaking the ptrace determinism rules.

So for now, I guess I need to remove my unthrottle stub; breakpoints
don't want to throttle because of the ptrace determinism requirements.

0 new messages