[RFC PATCH 4/6] bpf: Add a bpf program function argument constraint for PMU map

kaixu xia

unread,

Jul 17, 2015, 6:50:06 AM7/17/15

to

For the pmu that contains the pointer to struct perf_event,
add the corresponding bpf program function argument constraint
'ARG_PTR_TO_MAP_PERF_EVENT_VALUE'.

Signed-off-by: kaixu xia <xiak...@huawei.com>
---
include/linux/bpf.h | 1 +
kernel/bpf/verifier.c | 9 +++++++++
2 files changed, 10 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f593199..31a93fc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,7 @@ enum bpf_arg_type {
ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */
ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */
ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */
+ ARG_PTR_TO_MAP_PERF_EVENT_VALUE, /* pointer to stack used as map pmu value */

/* the following constraints used to prototype bpf_memcmp() and other
* functions that access data on eBPF program stack
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866..a04223b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -132,6 +132,7 @@ enum bpf_reg_type {
PTR_TO_CTX, /* reg points to bpf_context */
CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
PTR_TO_MAP_VALUE, /* reg points to map element value */
+ PTR_TO_PTR_PERF_EVENT, /* reg points to map element pmu value */
PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
FRAME_PTR, /* reg == frame_pointer */
PTR_TO_STACK, /* reg == frame_pointer + imm */
@@ -769,6 +770,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_PTR_TO_MAP_PERF_EVENT_VALUE) {
+ expected_type = PTR_TO_PTR_PERF_EVENT;
} else if (arg_type == ARG_CONST_STACK_SIZE) {
expected_type = CONST_IMM;
} else if (arg_type == ARG_CONST_MAP_PTR) {
@@ -817,6 +820,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
}
err = check_stack_boundary(env, regno, (*mapp)->value_size);

+ } else if (arg_type == ARG_PTR_TO_MAP_PERF_EVENT_VALUE) {
+ /* check for ARG_PTR_TO_MAP_PERF_EVENT_VALUE has been done before*/
+
} else if (arg_type == ARG_CONST_STACK_SIZE) {
/* bpf_xxx(..., buf, len) call will access 'len' bytes
* from stack pointer 'buf'. Check it
@@ -902,6 +908,9 @@ static int check_call(struct verifier_env *env, int func_id)
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = map;
+
+ if (map->flags & BPF_MAP_FLAG_PERF_EVENT)
+ regs[BPF_REG_0].type = PTR_TO_PTR_PERF_EVENT;
} else {
verbose("unknown return type %d of func %d\n",
fn->ret_type, func_id);
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

kaixu xia

unread,

Jul 17, 2015, 6:50:06 AM7/17/15

to

Sometimes we want to traverse the map elements and make use
of the map value one by one. So add new function
map->ops->map_traverse_elem() to traverse map elements.

Signed-off-by: kaixu xia <xiak...@huawei.com>
---

include/linux/bpf.h | 3 +++
kernel/bpf/arraymap.c | 17 +++++++++++++++++
kernel/bpf/hashtab.c | 27 +++++++++++++++++++++++++++
3 files changed, 47 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2634a25..f593199 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -13,6 +13,8 @@

struct bpf_map;

+typedef int (*bpf_map_traverse_callback)(void *value);
+
/* map is generic key/value storage optionally accesible by eBPF programs */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
@@ -24,6 +26,7 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
+ int (*map_traverse_elem)(bpf_map_traverse_callback func, struct bpf_map *map);
};

struct bpf_map {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..3306916 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -114,6 +114,22 @@ static int array_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}

+static int array_map_traverse_elem(bpf_map_traverse_callback func, struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *value;
+ int i;
+
+ for(i = 0; i < array->map.max_entries; i++) {
+ value = array->value + array->elem_size * i;
+
+ if (func(value) < 0)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
@@ -136,6 +152,7 @@ static const struct bpf_map_ops array_ops = {
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_traverse_elem = array_map_traverse_elem,
};

static struct bpf_map_type_list array_type __read_mostly = {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d..fa7887e 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -325,6 +325,32 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}

+static int htab_map_traverse_elem(bpf_map_traverse_callback func, struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 key_size;
+ void *value;
+ int i;
+
+ key_size = map->key_size;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ hlist_for_each_entry_rcu(l, head, hash_node) {
+ value = l->key + round_up(key_size, 8);
+
+ if (func(value) < 0)
+ return -EINVAL;
+ }
+
+ }
+
+ return 0;
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -352,6 +378,7 @@ static const struct bpf_map_ops htab_ops = {
.map_lookup_elem = htab_map_lookup_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_traverse_elem = htab_map_traverse_elem,
};

static struct bpf_map_type_list htab_type __read_mostly = {

kaixu xia

unread,

Jul 17, 2015, 6:50:06 AM7/17/15

to

The pointer to struct perf_event will be stored in map. So
add new flag BPF_MAP_FLAG_PERF_EVENT that specify the value
type stored in map. The high 8-bit of attr->map_type is used
to contain the new flags.

Signed-off-by: kaixu xia <xiak...@huawei.com>
---

include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 14 ++++++++++++++
kernel/bpf/syscall.c | 13 +++++++++++--
3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..2634a25 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -34,6 +34,7 @@ struct bpf_map {
u32 max_entries;
const struct bpf_map_ops *ops;
struct work_struct work;
+ unsigned int flags;
};

struct bpf_map_type_list {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..47d8516 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -131,6 +131,20 @@ enum bpf_prog_type {
#define BPF_NOEXIST 1 /* create new element if it didn't exist */
#define BPF_EXIST 2 /* update existing element */

+/* flags for BPF_MAP_CREATE command */
+enum {
+ BPF_PERF_EVENT = 0,
+ __NR_FLAG_BITS,
+};
+
+#define MAP_FLAG_BITS __NR_FLAG_BITS
+#define MAP_FLAG_SHIFT (32 - MAP_FLAG_BITS)
+#define MAP_FLAG_MASK (~0U << MAP_FLAG_SHIFT)
+
+#define BPF_FLAG_PERF_EVENT_BIT (32 - BPF_PERF_EVENT -1)
+
+#define BPF_MAP_FLAG_PERF_EVENT (1 << BPF_FLAG_PERF_EVENT_BIT) /* create a specific map for perf_event */
+
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d1..4c2d9e6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -20,7 +20,7 @@

static LIST_HEAD(bpf_map_types);

-static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr, unsigned int flags)
{
struct bpf_map_type_list *tl;
struct bpf_map *map;
@@ -32,6 +32,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
map->ops = tl->ops;
map->map_type = attr->map_type;
+ map->flags = flags;
return map;
}
}
@@ -95,14 +96,22 @@ static const struct file_operations bpf_map_fops = {
static int map_create(union bpf_attr *attr)
{
struct bpf_map *map;
+ unsigned int flags;
int err;

err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;

+ flags = attr->map_type & MAP_FLAG_MASK;
+ attr->map_type &= ~MAP_FLAG_MASK;
+
+ /* check if the map->value_size is suitable when creating PMU map*/
+ if ((flags & BPF_MAP_FLAG_PERF_EVENT) && (attr->value_size != sizeof(void *)))
+ return -EINVAL;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
- map = find_and_alloc_map(attr);
+ map = find_and_alloc_map(attr, flags);
if (IS_ERR(map))
return PTR_ERR(map);

kaixu xia

unread,

Jul 17, 2015, 6:50:06 AM7/17/15

to

The function bpf_read_pmu() can get the specific map key, convert
the corresponding map value to the pointer to struct perf_event and
return the Hardware PMU counter value.

Signed-off-by: kaixu xia <xiak...@huawei.com>
---

include/linux/bpf.h | 2 ++
include/uapi/linux/bpf.h | 2 ++
kernel/bpf/helpers.c | 27 +++++++++++++++++++++++++++
kernel/trace/bpf_trace.c | 2 ++
4 files changed, 33 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 31a93fc..6efff20 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,6 +190,8 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
extern const struct bpf_func_proto bpf_map_delete_elem_proto;

+extern const struct bpf_func_proto bpf_read_pmu_proto;
+
extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 47d8516..1431ec6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -263,6 +263,8 @@ enum bpf_func_id {
* Return: 0 on success
*/
BPF_FUNC_get_current_comm,
+
+ BPF_FUNC_read_pmu, /* u64 bpf_read_pmu(&value) */
__BPF_FUNC_MAX_ID,
};

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec0..6a0ed1b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/uidgid.h>
+#include <linux/perf_event.h>

/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -182,3 +183,29 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE,
};
+
+static u64 bpf_read_pmu(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *value = (void *) (unsigned long) r1;
+ struct perf_event *event;
+ u64 count;
+
+ if (!value || !(*(unsigned long *)value))
+ return 0;
+
+ event = (struct perf_event *)(*(unsigned long *)value);
+
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+
+ count = local64_read(&event->count);
+
+ return count;
+}
+
+const struct bpf_func_proto bpf_read_pmu_proto = {
+ .func = bpf_read_pmu,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MAP_PERF_EVENT_VALUE,
+};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041a..2343159 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -183,6 +183,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_read_pmu:
+ return &bpf_read_pmu_proto;
default:
return NULL;

kaixu xia

unread,

Jul 17, 2015, 6:50:06 AM7/17/15

to

The user space event FD from perf_event_open() syscall is converted
to the pointer to struct perf event and stored in map.

Signed-off-by: kaixu xia <xiak...@huawei.com>
---

include/linux/perf_event.h | 2 ++
kernel/bpf/syscall.c | 68 ++++++++++++++++++++++++++++++++++++++++++++
kernel/events/core.c | 22 ++++++++++++++
3 files changed, 92 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..2ea4067 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,7 @@ extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
@@ -979,6 +980,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
+static struct perf_event *perf_event_get(unsigned int fd) { return NULL; }
static inline void perf_event_print_debug(void) { }
static inline int perf_event_task_disable(void) { return -EINVAL; }
static inline int perf_event_task_enable(void) { return -EINVAL; }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4c2d9e6..ac76792 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -17,6 +17,7 @@
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
+#include <linux/perf_event.h>

static LIST_HEAD(bpf_map_types);

@@ -65,6 +66,19 @@ void bpf_map_put(struct bpf_map *map)
}
}

+static int bpf_map_perf_event_put(void *value)
+{

+ struct perf_event *event;
+

+ event = (struct perf_event *)(*(unsigned long *)value);

+ if (!event)
+ return -EBADF;
+
+ perf_event_release_kernel(event);
+

+ return 0;
+}
+

static int bpf_map_release(struct inode *inode, struct file *filp)
{
struct bpf_map *map = filp->private_data;
@@ -75,6 +89,13 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
*/
bpf_prog_array_map_clear(map);

+ if (map->flags & BPF_MAP_FLAG_PERF_EVENT) {

+ rcu_read_lock();
+ if (map->ops->map_traverse_elem(bpf_map_perf_event_put, map) < 0)
+ return -EINVAL;
+ rcu_read_unlock();
+ }
+
bpf_map_put(map);
return 0;
}
@@ -176,6 +197,10 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);

+ if (map->flags & BPF_MAP_FLAG_PERF_EVENT)

+ /* prevent user space from reading elem for PMU map */
+ return -EACCES;
+
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
@@ -215,6 +240,39 @@ err_put:
return err;
}

+static int replace_map_with_perf_event(void *value)
+{
+ struct perf_event *event;
+ u32 fd;
+
+ fd = *(u32 *)value;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ if (atomic_long_inc_not_zero(&event->refcount))
+ memcpy(value, &event, sizeof(struct perf_event *));
+ else
+ return -ENOENT;
+

+ return 0;
+}
+

+static bool check_map_perf_event_stored(struct bpf_map *map, void *key)
+{
+ void *value;
+ bool is_stored = false;
+
+ rcu_read_lock();
+ value = map->ops->map_lookup_elem(map, key);
+ if (value && (*(unsigned long *)value))
+ is_stored = true;
+ rcu_read_unlock();
+
+ return is_stored;
+}
+
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

static int map_update_elem(union bpf_attr *attr)
@@ -252,6 +310,16 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, map->value_size) != 0)
goto free_value;

+ if (map->flags & BPF_MAP_FLAG_PERF_EVENT) {

+ err = -EINVAL;
+ if (check_map_perf_event_stored(map, key))
+ goto free_value;
+
+ err = -EBADF;
+ if (replace_map_with_perf_event(value) != 0)
+ goto free_value;
+ }
+
/* eBPF program that use maps are running under rcu_read_lock(),
* therefore all map accessors rely on this fact, so do the same here
*/
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e965cfa..c4e34b7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8582,6 +8582,28 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

+struct perf_event *perf_event_get(unsigned int fd)
+{
+ struct perf_event *event;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ event = f.file->private_data;
+
+ fdput(f);
+
+ return event;
+}
+
/*
* inherit a event from parent task to child task:
*/

kaixu xia

unread,

Jul 17, 2015, 6:50:07 AM7/17/15

to

This series of patches introduce the new ability of eBPF programs
to access hardware PMU counter. Previous discussions on this subject:
https://lkml.org/lkml/2015/5/27/1027.

There are many useful PMUs provided by X86 and other architectures. By
combining PMU, kprobe and eBPF program together, many interesting things
can be done. For example, by probing at sched:sched_switch we can
measure IPC changing between different processes by watching 'cycle' PMU
counter; by probing at entry and exit points of a kernel function we are
able to compute cache miss rate for a function by collecting
'cache-misses' counter and see the differences. In summary, we can
define the begin and end points of a procedure, insert kprobes on them,
attach two BPF programs and let them collect specific PMU counter.
Further, by reading those PMU counter BPF program can bring some hints
to resource schedulers.

This patchset allows user read PMU events in the following way:
1. Open the PMU using perf_event_open() (for each CPUs or for
each processes he/she'd like to watch);
2. Create a BPF map with BPF_MAP_FLAG_PERF_EVENT set in its
type field;
3. Insert FDs into the map with some key-value mapping scheme
(i.e. cpuid -> event on that CPU);
4. Load and attach eBPF programs as usual;
5. In eBPF program, fetch the perf_event from map with key
(i.e. cpuid get from bpf_get_smp_processor_id()) then use
bpf_read_pmu() to read from it.
6. Do anything he/her want.

This patchset consists of necessary changes to the kernel space.
Perf will be the normal user space tool based on
https://lkml.org/lkml/2015/7/8/823 (perf tools: filtering events
using eBPF programs), https://lkml.org/lkml/2015/7/13/831
(Make eBPF programs output data to perf) and the corresonding
patches are on the way.

Patch 6/6 is a simple example and shows how to use this new eBPF
programs ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace.(the cycles counter value when
'kprobe/sys_write' sampling)

$ ./bpf_pmu_test
$ cat /sys/kernel/debug/tracing/trace
...
syslog-ng-555 [001] dn.1 10189.004626: : bpf count: CPU-0 9935764297
syslog-ng-555 [001] d..1 10189.053776: : bpf count: CPU-0 10000706398
syslog-ng-555 [001] dn.1 10189.102972: : bpf count: CPU-0 10067117321
syslog-ng-555 [001] d..1 10189.152925: : bpf count: CPU-0 10134551505
syslog-ng-555 [001] dn.1 10189.202043: : bpf count: CPU-0 10200869299
syslog-ng-555 [001] d..1 10189.251167: : bpf count: CPU-0 10267179481
syslog-ng-555 [001] dn.1 10189.300285: : bpf count: CPU-0 10333493522
syslog-ng-555 [001] d..1 10189.349410: : bpf count: CPU-0 10399808073
syslog-ng-555 [001] dn.1 10189.398528: : bpf count: CPU-0 10466121583
syslog-ng-555 [001] d..1 10189.447645: : bpf count: CPU-0 10532433368
syslog-ng-555 [001] d..1 10189.496841: : bpf count: CPU-0 10598841104
syslog-ng-555 [001] d..1 10189.546891: : bpf count: CPU-0 10666410564
syslog-ng-555 [001] dn.1 10189.596016: : bpf count: CPU-0 10732729739
syslog-ng-555 [001] d..1 10189.645146: : bpf count: CPU-0 12884941186
syslog-ng-555 [001] d..1 10189.694263: : bpf count: CPU-0 12951249903
syslog-ng-555 [001] dn.1 10189.743382: : bpf count: CPU-0 13017561470
syslog-ng-555 [001] d..1 10189.792506: : bpf count: CPU-0 13083873521
syslog-ng-555 [001] d..1 10189.841631: : bpf count: CPU-0 13150190416
syslog-ng-555 [001] d..1 10189.890749: : bpf count: CPU-0 13216505962
syslog-ng-555 [001] d..1 10189.939945: : bpf count: CPU-0 13282913062
...

The detail of patches is as follow:

Patch 1/6 introduces a flag of map. The flag bit is encoded into type
field passed through attr;

Patch 2/6 introduces a map_traverse_elem() function for further use;

Patch 3/6 convets event file descriptors into perf_event structure when
add new element to a map with the flag set;

Patch 4/6 introduces a bpf program function argument constraint for
PMU map;

Patch 5/6 implement function bpf_read_pmu() that get the selected
hardware PMU conuter;

Patch 6/6 give a simple example.

kaixu xia (6):
bpf: Add new flags that specify the value type stored in map
bpf: Add function map->ops->map_traverse_elem() to traverse map elems
bpf: Save the pointer to struct perf_event to map
bpf: Add a bpf program function argument constraint for PMU map
bpf: Implement function bpf_read_pmu() that get the selected hardware
PMU conuter
samples/bpf: example of get selected PMU counter value

include/linux/bpf.h | 7 +++
include/linux/perf_event.h | 2 +
include/uapi/linux/bpf.h | 16 +++++
kernel/bpf/arraymap.c | 17 ++++++
kernel/bpf/hashtab.c | 27 +++++++++
kernel/bpf/helpers.c | 27 +++++++++
kernel/bpf/syscall.c | 81 ++++++++++++++++++++++++-
kernel/bpf/verifier.c | 9 +++
kernel/events/core.c | 22 +++++++
kernel/trace/bpf_trace.c | 2 +
samples/bpf/bpf_helpers.h | 2 +
samples/bpf/bpf_pmu_test.c | 143 ++++++++++++++++++++++++++++++++++++++++++++
12 files changed, 353 insertions(+), 2 deletions(-)
create mode 100644 samples/bpf/bpf_pmu_test.c

kaixu xia

unread,

Jul 17, 2015, 6:50:07 AM7/17/15

to

This is a simple example and shows how to use the new ability
to get the selected PMU counter value.

The trace output:

$ ./bpf_pmu_test
$ cat /sys/kernel/debug/tracing/trace
...
syslog-ng-555 [001] dn.1 10189.004626: : bpf count: CPU-0 9935764297
syslog-ng-555 [001] d..1 10189.053776: : bpf count: CPU-0 10000706398
syslog-ng-555 [001] dn.1 10189.102972: : bpf count: CPU-0 10067117321
syslog-ng-555 [001] d..1 10189.152925: : bpf count: CPU-0 10134551505
syslog-ng-555 [001] dn.1 10189.202043: : bpf count: CPU-0 10200869299
syslog-ng-555 [001] d..1 10189.251167: : bpf count: CPU-0 10267179481
syslog-ng-555 [001] dn.1 10189.300285: : bpf count: CPU-0 10333493522
syslog-ng-555 [001] d..1 10189.349410: : bpf count: CPU-0 10399808073
syslog-ng-555 [001] dn.1 10189.398528: : bpf count: CPU-0 10466121583
syslog-ng-555 [001] d..1 10189.447645: : bpf count: CPU-0 10532433368
syslog-ng-555 [001] d..1 10189.496841: : bpf count: CPU-0 10598841104
syslog-ng-555 [001] d..1 10189.546891: : bpf count: CPU-0 10666410564
syslog-ng-555 [001] dn.1 10189.596016: : bpf count: CPU-0 10732729739
syslog-ng-555 [001] d..1 10189.645146: : bpf count: CPU-0 12884941186
syslog-ng-555 [001] d..1 10189.694263: : bpf count: CPU-0 12951249903
syslog-ng-555 [001] dn.1 10189.743382: : bpf count: CPU-0 13017561470
syslog-ng-555 [001] d..1 10189.792506: : bpf count: CPU-0 13083873521
syslog-ng-555 [001] d..1 10189.841631: : bpf count: CPU-0 13150190416
syslog-ng-555 [001] d..1 10189.890749: : bpf count: CPU-0 13216505962
syslog-ng-555 [001] d..1 10189.939945: : bpf count: CPU-0 13282913062
...

Signed-off-by: kaixu xia <xiak...@huawei.com>
---

samples/bpf/bpf_helpers.h | 2 +
samples/bpf/bpf_pmu_test.c | 143 ++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 145 insertions(+)
create mode 100644 samples/bpf/bpf_pmu_test.c

diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index bdf1c16..a421be1 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_read_pmu)(void *pmu) =
+ (void *) BPF_FUNC_read_pmu;

/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_pmu_test.c b/samples/bpf/bpf_pmu_test.c
new file mode 100644
index 0000000..aced233
--- /dev/null
+++ b/samples/bpf/bpf_pmu_test.c
@@ -0,0 +1,143 @@
+/* eBPF example program shows how to get hardware PMU counter
+ * - creates hashmap in kernel
+ *
+ * - save the pointer to struct perf_event to map
+ *
+ * - loads eBPF program:
+ * r0 = 0 (the chosen key: CPU-0)
+ * *(u32 *)(fp - 4) = r0
+ * value = bpf_map_lookup_elem(map_fd, fp - 4);
+ * count = bpf_read_pmu(value);
+ * bpf_trace_printk(fmt, fmt_size, key, count);
+ *
+ * - attaches this program to kprobes/sys_write
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <stddef.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include "libbpf.h"
+
+static int test_pmu(void)
+{
+ int kprobe_fd, prog_fd, i;
+ int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ int *pmu_fd = malloc(nr_cpus * sizeof(int));
+ int key, map_fd;
+ unsigned long value;
+
+ char fmt[] = "bpf count: CPU-%d %lld\n";
+ int fmt_size = sizeof(fmt);
+
+ int type = BPF_MAP_TYPE_HASH | BPF_MAP_FLAG_PERF_EVENT;
+
+ map_fd = bpf_create_map(type, sizeof(key), sizeof(value),
+ nr_cpus);
+ if(map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ return -1;
+ }
+
+ struct perf_event_attr attr_insn_kprobe = {
+ .sample_period = 1,
+ .type = PERF_TYPE_TRACEPOINT,
+ .sample_type = PERF_SAMPLE_RAW,
+ .wakeup_events = 1,
+ .config = 0x46B, /* sys_write (this ID maybe change) */
+ };
+
+ struct perf_event_attr attr_insn_pmu = {
+ .freq = 0,
+ .sample_period = 0x7fffffffffffffffULL,
+ .inherit = 0,
+ .type = PERF_TYPE_RAW,
+ .read_format = 0,
+ .sample_type = 0,
+ .config = 0x11,/* PMU: cycles (ARM) */
+
+ };
+
+ for(i = 0; i < nr_cpus; i++) {
+ pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+ if (pmu_fd[i] < 0) {
+ printf("event syscall failed ****\n");
+ return -1;
+ }
+
+ ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+
+ bpf_update_elem(map_fd, &i, (pmu_fd + i), BPF_ANY);
+ }
+
+ kprobe_fd = perf_event_open(&attr_insn_kprobe, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+ if (kprobe_fd < 0) {
+ printf("kprobe event syscall failed ****\n");
+ return -1;
+ }
+ ioctl(kprobe_fd, PERF_EVENT_IOC_ENABLE, 0);
+
+ struct bpf_insn prog[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = fp */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_read_pmu),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -8),
+ BPF_LD_IMM64(BPF_REG_1, 748842649785475172),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -16),
+ BPF_LD_IMM64(BPF_REG_1, 2678891156567243380),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -24),
+ BPF_LD_IMM64(BPF_REG_1, 7959390387983249506),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -32),
+ BPF_MOV64_IMM(BPF_REG_2, 25),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_trace_printk),
+ BPF_EXIT_INSN(),
+ };
+
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, prog, sizeof(prog),
+ "GPL", 0);
+ if (prog_fd < 0) {
+ printf("failed to load prog '%s'\n", strerror(errno));
+ return -1;
+ }
+
+ ioctl(kprobe_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+
+ system("ls");
+ system("pwd");
+ system("sleep 4");
+
+ for(i = 0; i < nr_cpus; i++) {
+ close(pmu_fd[i]);
+ }
+
+ close(map_fd);
+
+ free(pmu_fd);

+ return 0;
+}
+

+int main(int argc, char **argv)
+{
+ test_pmu();

+
+ return 0;
+}

Peter Zijlstra

unread,

Jul 17, 2015, 7:10:06 AM7/17/15

to

And what is stopping userspace from closing those FDs while you're using
them?

Peter Zijlstra

unread,

Jul 17, 2015, 7:10:06 AM7/17/15

to

On Fri, Jul 17, 2015 at 06:43:35PM +0800, kaixu xia wrote:
> The function bpf_read_pmu() can get the specific map key, convert
> the corresponding map value to the pointer to struct perf_event and
> return the Hardware PMU counter value.

Thanks for having me on Cc :/

> Signed-off-by: kaixu xia <xiak...@huawei.com>
> ---

> +static u64 bpf_read_pmu(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
> + void *value = (void *) (unsigned long) r1;
> + struct perf_event *event;
> + u64 count;
> +
> + if (!value || !(*(unsigned long *)value))
> + return 0;
> +
> + event = (struct perf_event *)(*(unsigned long *)value);
> +
> + if (event->state == PERF_EVENT_STATE_ACTIVE)
> + event->pmu->read(event);
> +
> + count = local64_read(&event->count);
> +
> + return count;
> +}

Hell no, that's way broken.

Wangnan (F)

unread,

Jul 17, 2015, 7:30:06 AM7/17/15

to

Please check replace_map_with_perf_event(). Users can close the FDs, but
the perf
event structure will still valid because we increase its reference
count. It won't be
close until the map is released. We have test that case.

Thank you.

Peter Zijlstra

unread,

Jul 17, 2015, 7:40:05 AM7/17/15

to

On Fri, Jul 17, 2015 at 06:43:33PM +0800, kaixu xia wrote:

> +static int replace_map_with_perf_event(void *value)
> +{
> + struct perf_event *event;
> + u32 fd;
> +
> + fd = *(u32 *)value;
> +
> + event = perf_event_get(fd);
> + if (IS_ERR(event))
> + return PTR_ERR(event);
> +

And userspace closes here,

> + if (atomic_long_inc_not_zero(&event->refcount))

And this goes *BOOM*

> + memcpy(value, &event, sizeof(struct perf_event *));
> + else
> + return -ENOENT;
> +
> + return 0;
> +}

Also, why do you think its OK to prod around the internals of perf_event
outside of kernel/events/ ?

Wangnan (F)

unread,

Jul 17, 2015, 7:40:05 AM7/17/15

to

Shall we put atomic_long_inc_not_zero() between fdget() and fdput()?

Peter Zijlstra

unread,

Jul 17, 2015, 7:40:06 AM7/17/15

to

You want something long these lines..

---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 22 +++++++++++++++++++---
2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809433b3..6e7be7345511 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -661,6 +661,7 @@ extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
+extern u64 perf_event_read(struct perf_event *event);

struct perf_sample_data {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..53521360c13d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3212,15 +3212,31 @@ static inline u64 perf_event_count(struct perf_event *event)
return __perf_event_count(event);
}

-static u64 perf_event_read(struct perf_event *event)
+u64 perf_event_read(struct perf_event *event)
{
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
if (event->state == PERF_EVENT_STATE_ACTIVE) {
- smp_call_function_single(event->oncpu,
- __perf_event_read, event, 1);
+ /*
+ * If the event is for the current task, its guaranteed that we
+ * never need the cross cpu call, and therefore can allow this
+ * to be called with IRQs disabled.
+ *
+ * Avoids the warning otherwise generated by
+ * smp_call_function_single().
+ */
+ if (event->ctx->task == current) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __perf_event_read(event);
+ local_irq_restore(flags);
+ } else {
+ smp_call_function_single(event->oncpu,
+ __perf_event_read, event, 1);
+ }
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;

Peter Zijlstra

unread,

Jul 17, 2015, 7:40:06 AM7/17/15

to

On Fri, Jul 17, 2015 at 07:29:07PM +0800, Wangnan (F) wrote:

> What about calling perf_event_read_value() then?

Depends on what all you need, if you need full perf events to work then
yes perf_event_read_value() is your only option.

But note that that requires scheduling, so you cannot actually use it
for tracing purposes etc..

Wangnan (F)

unread,

Jul 17, 2015, 7:40:06 AM7/17/15

to

On 2015/7/17 19:05, Peter Zijlstra wrote:

What about calling perf_event_read_value() then?

...
struct perf_event_context *ctx;
u64 enabled, u64 running

ctx = perf_event_ctx_lock(event);
if (!event->state == PERF_EVENT_STATE_ERROR) {
count = perf_event_read_value(event, &enable, &running);
}
perf_event_ctx_unlock(event, ctx);
...

Code is from perf_read().

Thank you.

Peter Zijlstra

unread,

Jul 17, 2015, 7:50:06 AM7/17/15

to

> Shall we put atomic_long_inc_not_zero() between fdget() and fdput()?

You pretty much _have_ to do that.

Wangnan (F)

unread,

Jul 17, 2015, 7:50:06 AM7/17/15

to

What you mean "full perf events"? Even with your code some event still
not work?

Thank you.

Peter Zijlstra

unread,

Jul 17, 2015, 8:00:06 AM7/17/15

to

On Fri, Jul 17, 2015 at 07:45:02PM +0800, Wangnan (F) wrote:

> >Depends on what all you need, if you need full perf events to work then
> >yes perf_event_read_value() is your only option.
> >
> >But note that that requires scheduling, so you cannot actually use it
> >for tracing purposes etc..

> What you mean "full perf events"? Even with your code some event still not
> work?

The code I posted only works for events that do not have inherit set.
And only works from IRQ/NMI context for events that monitor the current
task or the current CPU (although that needs a little extra code still).

Anything else and it does not work (correctly).

Peter Zijlstra

unread,

Jul 17, 2015, 8:00:06 AM7/17/15

to

On Fri, Jul 17, 2015 at 01:55:05PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 17, 2015 at 07:45:02PM +0800, Wangnan (F) wrote:
>
> > >Depends on what all you need, if you need full perf events to work then
> > >yes perf_event_read_value() is your only option.
> > >
> > >But note that that requires scheduling, so you cannot actually use it
> > >for tracing purposes etc..
>
> > What you mean "full perf events"? Even with your code some event still not
> > work?
>
> The code I posted only works for events that do not have inherit set.
> And only works from IRQ/NMI context for events that monitor the current
> task or the current CPU (although that needs a little extra code still).
>
> Anything else and it does not work (correctly).

Scratch that from NMI, for that to work we need more magic still.

Wangnan (F)

unread,

Jul 17, 2015, 8:00:07 AM7/17/15

to

Thanks. In next version we will introduce a new function which do
oppsite thing to
perf_event_release_kernel() in perf/event/core.c, then fetch the event
before fdput.

Thank you.

Peter Zijlstra

unread,

Jul 17, 2015, 8:10:08 AM7/17/15

to

On Fri, Jul 17, 2015 at 07:54:55PM +0800, Wangnan (F) wrote:
> Thanks. In next version we will introduce a new function which do oppsite
> thing to
> perf_event_release_kernel() in perf/event/core.c, then fetch the event
> before fdput.

perf_event_get() as proposed, with the addition of the refcount
increment inside the fdget/fdput() is fine.

Note that the _get() name already implies a refcount increment.

Wangnan (F)

unread,

Jul 17, 2015, 8:10:09 AM7/17/15

to

On 2015/7/17 20:01, Wangnan (F) wrote:

>
>
> On 2015/7/17 19:56, Peter Zijlstra wrote:
>> On Fri, Jul 17, 2015 at 01:55:05PM +0200, Peter Zijlstra wrote:
>>> On Fri, Jul 17, 2015 at 07:45:02PM +0800, Wangnan (F) wrote:
>>>
>>>>> Depends on what all you need, if you need full perf events to work
>>>>> then
>>>>> yes perf_event_read_value() is your only option.
>>>>>
>>>>> But note that that requires scheduling, so you cannot actually use it
>>>>> for tracing purposes etc..
>>>> What you mean "full perf events"? Even with your code some event
>>>> still not
>>>> work?
>>> The code I posted only works for events that do not have inherit set.
>>> And only works from IRQ/NMI context for events that monitor the current
>>> task or the current CPU (although that needs a little extra code
>>> still).
>>>
>>> Anything else and it does not work (correctly).
>> Scratch that from NMI, for that to work we need more magic still.

> The scheduling you said is caused by
>
> mutex_lock(&event->child_mutex)
>
> right?
>
> What about replacing it to mutex_trylock() and simply return an error
> if it read from a BPF program?
>

Sorry. Should be: "return an error if it doesn't get the lock and the
caller is a BPF program."

> Thank you.

Wangnan (F)

unread,

Jul 17, 2015, 8:10:09 AM7/17/15

to

On 2015/7/17 19:56, Peter Zijlstra wrote:
> On Fri, Jul 17, 2015 at 01:55:05PM +0200, Peter Zijlstra wrote:
>> On Fri, Jul 17, 2015 at 07:45:02PM +0800, Wangnan (F) wrote:
>>
>>>> Depends on what all you need, if you need full perf events to work then
>>>> yes perf_event_read_value() is your only option.
>>>>
>>>> But note that that requires scheduling, so you cannot actually use it
>>>> for tracing purposes etc..
>>> What you mean "full perf events"? Even with your code some event still not
>>> work?
>> The code I posted only works for events that do not have inherit set.
>> And only works from IRQ/NMI context for events that monitor the current
>> task or the current CPU (although that needs a little extra code still).
>>
>> Anything else and it does not work (correctly).
> Scratch that from NMI, for that to work we need more magic still.

The scheduling you said is caused by

mutex_lock(&event->child_mutex)

right?

What about replacing it to mutex_trylock() and simply return an error
if it read from a BPF program?

Thank you.

Wangnan (F)

unread,

Jul 17, 2015, 8:20:06 AM7/17/15

to

On 2015/7/17 20:02, Peter Zijlstra wrote:
> On Fri, Jul 17, 2015 at 07:54:55PM +0800, Wangnan (F) wrote:
>> Thanks. In next version we will introduce a new function which do oppsite
>> thing to
>> perf_event_release_kernel() in perf/event/core.c, then fetch the event
>> before fdput.
> perf_event_get() as proposed, with the addition of the refcount
> increment inside the fdget/fdput() is fine.
>
> Note that the _get() name already implies a refcount increment.

OK, you'll see it in v2.

Thank you.

Peter Zijlstra

unread,

Jul 17, 2015, 8:20:06 AM7/17/15

to

On Fri, Jul 17, 2015 at 08:01:07PM +0800, Wangnan (F) wrote:
>
>
> On 2015/7/17 19:56, Peter Zijlstra wrote:
> >On Fri, Jul 17, 2015 at 01:55:05PM +0200, Peter Zijlstra wrote:
> >>On Fri, Jul 17, 2015 at 07:45:02PM +0800, Wangnan (F) wrote:
> >>
> >>>>Depends on what all you need, if you need full perf events to work then
> >>>>yes perf_event_read_value() is your only option.
> >>>>
> >>>>But note that that requires scheduling, so you cannot actually use it
> >>>>for tracing purposes etc..
> >>>What you mean "full perf events"? Even with your code some event still not
> >>>work?
> >>The code I posted only works for events that do not have inherit set.
> >>And only works from IRQ/NMI context for events that monitor the current
> >>task or the current CPU (although that needs a little extra code still).
> >>
> >>Anything else and it does not work (correctly).
> >Scratch that from NMI, for that to work we need more magic still.
> The scheduling you said is caused by
>
> mutex_lock(&event->child_mutex)
>
> right?
>
> What about replacing it to mutex_trylock() and simply return an error
> if it read from a BPF program?

That is vile and unreliable.

I think you really want to put very strict limits on what kind of events
you accept, or create the events yourself.

Wangnan (F)

unread,

Jul 17, 2015, 8:40:06 AM7/17/15

to

On 2015/7/17 20:18, Peter Zijlstra wrote:
> On Fri, Jul 17, 2015 at 08:01:07PM +0800, Wangnan (F) wrote:
>>
>> On 2015/7/17 19:56, Peter Zijlstra wrote:
>>> On Fri, Jul 17, 2015 at 01:55:05PM +0200, Peter Zijlstra wrote:
>>>> On Fri, Jul 17, 2015 at 07:45:02PM +0800, Wangnan (F) wrote:
>>>>
>>>>>> Depends on what all you need, if you need full perf events to work then
>>>>>> yes perf_event_read_value() is your only option.
>>>>>>
>>>>>> But note that that requires scheduling, so you cannot actually use it
>>>>>> for tracing purposes etc..
>>>>> What you mean "full perf events"? Even with your code some event still not
>>>>> work?
>>>> The code I posted only works for events that do not have inherit set.
>>>> And only works from IRQ/NMI context for events that monitor the current
>>>> task or the current CPU (although that needs a little extra code still).
>>>>
>>>> Anything else and it does not work (correctly).
>>> Scratch that from NMI, for that to work we need more magic still.
>> The scheduling you said is caused by
>>
>> mutex_lock(&event->child_mutex)
>>
>> right?
>>
>> What about replacing it to mutex_trylock() and simply return an error
>> if it read from a BPF program?
> That is vile and unreliable.
>
> I think you really want to put very strict limits on what kind of events
> you accept, or create the events yourself.
>

I think we can check the limitation in BPF program. What about this:

event must on current CPU or must be on current process. If not,
bpf_read_pmu() should simply return an error.

With current design it is easy to implement, and users can still control
it through bpf map.

But what if we really want cross-cpu PMU accessing? Impossible?

Thank you.

Peter Zijlstra

unread,

Jul 17, 2015, 8:50:06 AM7/17/15

to

On Fri, Jul 17, 2015 at 02:45:38PM +0200, Peter Zijlstra wrote:

> On Fri, Jul 17, 2015 at 08:27:43PM +0800, Wangnan (F) wrote:

> > event must on current CPU or must be on current process. If not,
> > bpf_read_pmu() should simply return an error.

I would further restrict by saying !inherit (and maybe even !sample).

Peter Zijlstra

unread,

Jul 17, 2015, 8:50:06 AM7/17/15

to

On Fri, Jul 17, 2015 at 08:27:43PM +0800, Wangnan (F) wrote:

> I think we can check the limitation in BPF program.

You typically do not want to rely on your program for correctness.

> What about this:
>
> event must on current CPU or must be on current process. If not,
> bpf_read_pmu() should simply return an error.

OK, that's workable. That enforces the constraints outside of the
program itself.

> With current design it is easy to implement, and users can still control
> it through bpf map.
>
> But what if we really want cross-cpu PMU accessing? Impossible?

Under the assumption that the eBPF program is called from tracing, and
therefore from any context (task, softirq, irq and nmi), yes impossible.

You cannot do (synchronous) IPIs from either IRQ context or with IRQs
disabled. And both are valid trace contexts.

pi3orama

unread,

Jul 17, 2015, 9:10:06 AM7/17/15

to

发自我的 iPhone

> 在 2015年7月17日，下午8:45，Peter Zijlstra <pet...@infradead.org> 写道：
>
>> On Fri, Jul 17, 2015 at 08:27:43PM +0800, Wangnan (F) wrote:
>> I think we can check the limitation in BPF program.
>
> You typically do not want to rely on your program for correctness.

Sorry. NOT BPF program. What I want to express is bpf_read_pmu() function which is called from BPF program. User can put anything into the map during preparation but he or she gets only error code if the perf event he or she read from doesn't meet our restriction at runtime.

>> What about this:
>>
>> event must on current CPU or must be on current process. If not,
>> bpf_read_pmu() should simply return an error.
>
> OK, that's workable. That enforces the constraints outside of the
> program itself.
>
>> With current design it is easy to implement, and users can still control
>> it through bpf map.
>>
>> But what if we really want cross-cpu PMU accessing? Impossible?
>
> Under the assumption that the eBPF program is called from tracing, and
> therefore from any context (task, softirq, irq and nmi), yes impossible.
>
> You cannot do (synchronous) IPIs from either IRQ context or with IRQs
> disabled. And both are valid trace contexts.

What about software perf event? For example, tracepoints?

Thank you.

Peter Zijlstra

unread,

Jul 17, 2015, 9:30:06 AM7/17/15

to

On Fri, Jul 17, 2015 at 08:57:00PM +0800, pi3orama wrote:
> >> But what if we really want cross-cpu PMU accessing? Impossible?
> >
> > Under the assumption that the eBPF program is called from tracing, and
> > therefore from any context (task, softirq, irq and nmi), yes impossible.
> >
> > You cannot do (synchronous) IPIs from either IRQ context or with IRQs
> > disabled. And both are valid trace contexts.
>
> What about software perf event? For example, tracepoints?

Some of them, tracepoints would work. So you could exempt
TYPE_TRACEPOINT, but I would suggest starting as constrained as possible
and relaxing when we really need/want.

pi3orama

unread,

Jul 17, 2015, 9:50:06 AM7/17/15

to

发自我的 iPhone

> 在 2015年7月17日，下午9:26，Peter Zijlstra <pet...@infradead.org> 写道：
>

> On Fri, Jul 17, 2015 at 08:57:00PM +0800, pi3orama wrote:
>>>> But what if we really want cross-cpu PMU accessing? Impossible?
>>>
>>> Under the assumption that the eBPF program is called from tracing, and
>>> therefore from any context (task, softirq, irq and nmi), yes impossible.
>>>
>>> You cannot do (synchronous) IPIs from either IRQ context or with IRQs
>>> disabled. And both are valid trace contexts.
>>
>> What about software perf event? For example, tracepoints?
>
> Some of them, tracepoints would work. So you could exempt
> TYPE_TRACEPOINT, but I would suggest starting as constrained as possible
> and relaxing when we really need/want.
>
>

Thanks to your advise. We will follow them in v2. Do you have further comment on other part of this patch set?

Thank you.

Alexei Starovoitov

unread,

Jul 17, 2015, 7:00:05 PM7/17/15

to

On 7/17/15 3:43 AM, kaixu xia wrote:
> There are many useful PMUs provided by X86 and other architectures. By
> combining PMU, kprobe and eBPF program together, many interesting things
> can be done. For example, by probing at sched:sched_switch we can
> measure IPC changing between different processes by watching 'cycle' PMU
> counter; by probing at entry and exit points of a kernel function we are
> able to compute cache miss rate for a function by collecting
> 'cache-misses' counter and see the differences. In summary, we can
> define the begin and end points of a procedure, insert kprobes on them,
> attach two BPF programs and let them collect specific PMU counter.

that would be definitely a useful feature.
As far as overall design I think it should be done slightly differently.
The addition of 'flags' to all maps is a bit hacky and it seems has few
holes. It's better to reuse 'store fds into maps' code that prog_array
is doing. You can add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY
and reuse most of the arraymap.c code.
The program also wouldn't need to do lookup+read_pmu, so instead of:

r0 = 0 (the chosen key: CPU-0)

*(u32 *)(fp - 4) = r0

value = bpf_map_lookup_elem(map_fd, fp - 4);

count = bpf_read_pmu(value);
you will be able to do:
count = bpf_perf_event_read(perf_event_array_map_fd, index)
which will be faster.
note, I'd prefer 'bpf_perf_event_read' name for the helper.

Then inside helper we really cannot do mutex, sleep or smp_call,
but since programs are always executed in preempt disabled
and never from NMI, I think something like the following should work:
u64 bpf_perf_event_read(u64 r1, u64 index,...)
{
struct bpf_perf_event_array *array = (void *) (long) r1;
struct perf_event *event;

if (unlikely(index >= array->map.max_entries))
return -EINVAL;
event = array->events[index];
if (event->state != PERF_EVENT_STATE_ACTIVE)
return -EINVAL;
if (event->oncpu != raw_smp_processor_id())
return -EINVAL;
__perf_event_read(event);
return perf_event_count(event);
}
not sure whether we need to disable irq around __perf_event_read,
I think it should be ok without.
Also during store of FD into perf_event_array you'd need
to filter out all crazy events. I would limit it to few
basic types first.

btw, make sure you do your tests with lockdep and other debugs on.
and for the sample code please use C for the bpf program. Not many
people can read bpf asm ;)

pi3orama

unread,

Jul 17, 2015, 7:30:06 PM7/17/15

to

发自我的 iPhone

> 在 2015年7月18日，上午6:56，Alexei Starovoitov <a...@plumgrid.com> 写道：
>
>> On 7/17/15 3:43 AM, kaixu xia wrote:
>> There are many useful PMUs provided by X86 and other architectures. By
>> combining PMU, kprobe and eBPF program together, many interesting things
>> can be done. For example, by probing at sched:sched_switch we can
>> measure IPC changing between different processes by watching 'cycle' PMU
>> counter; by probing at entry and exit points of a kernel function we are
>> able to compute cache miss rate for a function by collecting
>> 'cache-misses' counter and see the differences. In summary, we can
>> define the begin and end points of a procedure, insert kprobes on them,
>> attach two BPF programs and let them collect specific PMU counter.
>
> that would be definitely a useful feature.
> As far as overall design I think it should be done slightly differently.
> The addition of 'flags' to all maps is a bit hacky and it seems has few
> holes. It's better to reuse 'store fds into maps' code that prog_array
> is doing. You can add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY
> and reuse most of the arraymap.c code.

Then we also need another BPF_MAP_TYPE_PERF_EVENT_HASHMAP for events in task context.

> The program also wouldn't need to do lookup+read_pmu, so instead of:
> r0 = 0 (the chosen key: CPU-0)
> *(u32 *)(fp - 4) = r0
> value = bpf_map_lookup_elem(map_fd, fp - 4);
> count = bpf_read_pmu(value);
> you will be able to do:
> count = bpf_perf_event_read(perf_event_array_map_fd, index)
> which will be faster.
> note, I'd prefer 'bpf_perf_event_read' name for the helper.

I though that. It makes things much simpler since we won't need care verifier too much.

I choose current implementation because I think we may need perf event not wrapped in map in future (for example, tracepoints). With the design you suggested in this case we have to create a map with only 1 element in it. Although it is acceptable we can make it a better look. However, let's think that in future.

>
> Then inside helper we really cannot do mutex, sleep or smp_call,
> but since programs are always executed in preempt disabled
> and never from NMI, I think something like the following should work:
> u64 bpf_perf_event_read(u64 r1, u64 index,...)
> {
> struct bpf_perf_event_array *array = (void *) (long) r1;
> struct perf_event *event;
>
> if (unlikely(index >= array->map.max_entries))
> return -EINVAL;
> event = array->events[index];
> if (event->state != PERF_EVENT_STATE_ACTIVE)
> return -EINVAL;
> if (event->oncpu != raw_smp_processor_id())
> return -EINVAL;
> __perf_event_read(event);
> return perf_event_count(event);
> }
> not sure whether we need to disable irq around __perf_event_read,
> I think it should be ok without.
> Also during store of FD into perf_event_array you'd need
> to filter out all crazy events. I would limit it to few
> basic types first.
>
> btw, make sure you do your tests with lockdep and other debugs on.
> and for the sample code please use C for the bpf program. Not many
> people can read bpf asm ;)
>

We still need some perf side code to make a c program work. Perf should create the perf events and put them into map. We post this kernel side code first to see your attitude on the basic design principle we choose. Although the implementation has some bugs, from you and Peter's response I think the user interface we choose is no too much problem, so we can start perf side coding now. Right? After perf side code done you won't see asm code in the example.

Thank you.

Alexei Starovoitov

unread,

Jul 17, 2015, 8:50:05 PM7/17/15

to

On 7/17/15 4:27 PM, pi3orama wrote:
> Then we also need another BPF_MAP_TYPE_PERF_EVENT_HASHMAP for events in task context.

hmm. why? don't see a use case yet.

> I choose current implementation because I think we may need perf event not wrapped in map in future (for example, tracepoints). With the design you suggested in this case we have to create a map with only 1 element in it.

what you had also needs a map of one element.
also I don't think perf_events can be 'detached'. User space always will
perf_event_open one first and only then program will use it.
So passing FD from user space to the program is inevitable.
Other than storing FD into map the other alternative is to use ld_imm64
mechanism. Then the helper will only have one argument,
but then you'd need to extend 'used_maps' logic with 'used_fds'.
It's doable as well, but I think the use case of only one pmu counter
per cpu is artificial. You'll always have an array of events. One for
each cpu. So perf_event_array mechanism fits the best.

>> >btw, make sure you do your tests with lockdep and other debugs on.
>> >and for the sample code please use C for the bpf program. Not many
>> >people can read bpf asm ;)
>> >
> We still need some perf side code to make a c program work.

no, what I meant is to do sample code as tracex[1-5]*
where there is distinct kernel and user space parts. Both in C.
At this stage perf patches are way too early.

pi3orama

unread,

Jul 17, 2015, 9:10:05 PM7/17/15

to

发自我的 iPhone

> 在 2015年7月18日，上午8:42，Alexei Starovoitov <a...@plumgrid.com> 写道：
>
>> On 7/17/15 4:27 PM, pi3orama wrote:
>> Then we also need another BPF_MAP_TYPE_PERF_EVENT_HASHMAP for events in task context.
>
> hmm. why? don't see a use case yet.

An example: we want to count the number of cycles between entry and exit point of a particular library function (glibc write() for example). Context switch is possible, but we don't care cycles consumed by other tasks. Then we need to create a perf event in task context using:

perf _event_open(&attr, pid, -1/* cpu */, ...);
Since it is a library function, we have to choose pids we interest. We should also probe sys_clone and create new perf event when thread creating, we haven't think how to do that now.

Then when inserting into map, the meaning of key should not 'cpuid'. Pid should be mush reasonable.

Although we can use a auxiliary map which maps pid to array index...

Thank you.

Alexei Starovoitov

unread,

Jul 17, 2015, 9:30:10 PM7/17/15

to

On 7/17/15 6:02 PM, pi3orama wrote:
> An example: we want to count the number of cycles between entry and exit point of a particular library function (glibc write() for example). Context switch is possible, but we don't care cycles consumed by other tasks. Then we need to create a perf event in task context using:
>
> perf _event_open(&attr, pid, -1/* cpu */, ...);
> Since it is a library function, we have to choose pids we interest.

sure. just store that fd under whatever index in perf_event_array
and use it from the program. index is not cpuid. it's just an index.

> We should also probe sys_clone and create new perf event when thread creating, we haven't think how to do that now.

opening a perf_event from the program? That will be very very hard.
Much easier to kprobe sys_clone and signal to user space via
bpf_output_trace_data() and user space will be
perf_event_open-ing new event for new task.

ps
please tweak your email client to wrap lines.

Kaixu Xia

unread,

Jul 22, 2015, 4:20:06 AM7/22/15

to

Previous patch v1 url:
https://lkml.org/lkml/2015/7/17/287

This patchset allows user read PMU events in the following way:
1. Open the PMU using perf_event_open() (for each CPUs or for
each processes he/she'd like to watch);
2. Create a BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map;
3. Insert FDs into the map with some key-value mapping scheme
(i.e. cpuid -> event on that CPU);
4. Load and attach eBPF programs as usual;
5. In eBPF program, get the perf_event_map_fd and key (i.e.
cpuid get from bpf_get_smp_processor_id()) then use
bpf_perf_event_read() to read from it.
6. Do anything he/her want.

changes in V2:
- put atomic_long_inc_not_zero() between fdget() and fdput();
- limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
- Only read the event counter on current CPU or on current
process;
- add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the
pointer to the struct perf_event;
- according to the perf_event_map_fd and key, the function
bpf_perf_event_read() can get the Hardware PMU counter value;

Patch 5/5 is a simple example and shows how to use this new eBPF
programs ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace(trace_pipe).(the cycles PMU
value when 'kprobe/sys_write' sampling)

$ cat /sys/kernel/debug/tracing/trace_pipe
$ ./tracex6
...
cat-677 [002] d..1 210.299270: : bpf count: CPU-2 5316659
cat-677 [002] d..1 210.299316: : bpf count: CPU-2 5378639
cat-677 [002] d..1 210.299362: : bpf count: CPU-2 5440654
cat-677 [002] d..1 210.299408: : bpf count: CPU-2 5503211
cat-677 [002] d..1 210.299454: : bpf count: CPU-2 5565438
cat-677 [002] d..1 210.299500: : bpf count: CPU-2 5627433
cat-677 [002] d..1 210.299547: : bpf count: CPU-2 5690033
cat-677 [002] d..1 210.299593: : bpf count: CPU-2 5752184
cat-677 [002] d..1 210.299639: : bpf count: CPU-2 5814543
<...>-548 [009] d..1 210.299667: : bpf count: CPU-9 605418074
<...>-548 [009] d..1 210.299692: : bpf count: CPU-9 605452692
cat-677 [002] d..1 210.299700: : bpf count: CPU-2 5896319
<...>-548 [009] d..1 210.299710: : bpf count: CPU-9 605477824
<...>-548 [009] d..1 210.299728: : bpf count: CPU-9 605501726
<...>-548 [009] d..1 210.299745: : bpf count: CPU-9 605525279
<...>-548 [009] d..1 210.299762: : bpf count: CPU-9 605547817
<...>-548 [009] d..1 210.299778: : bpf count: CPU-9 605570433
<...>-548 [009] d..1 210.299795: : bpf count: CPU-9 605592743
...

The detail of patches is as follow:

Patch 1/5 introduces a new bpf map type. This map only stores the
pointer to struct perf_event;

Patch 2/5 introduces a map_traverse_elem() function for further use;

Patch 3/5 convets event file descriptors into perf_event structure when
add new element to the map;

Patch 4/5 implement function bpf_perf_event_read() that get the selected
hardware PMU conuter;

Patch 5/5 give a simple example.

Kaixu Xia (5):
bpf: Add new bpf map type to store the pointer to struct perf_event
bpf: Add function map->ops->map_traverse_elem() to traverse map elems
bpf: Save the pointer to struct perf_event to map
bpf: Implement function bpf_perf_event_read() that get the selected
hardware PMU conuter
samples/bpf: example of get selected PMU counter value

include/linux/bpf.h | 6 +++
include/linux/perf_event.h | 5 ++-
include/uapi/linux/bpf.h | 3 ++
kernel/bpf/arraymap.c | 110 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/helpers.c | 42 +++++++++++++++++
kernel/bpf/syscall.c | 26 +++++++++++
kernel/events/core.c | 30 ++++++++++++-
kernel/trace/bpf_trace.c | 2 +
samples/bpf/Makefile | 4 ++
samples/bpf/bpf_helpers.h | 2 +
samples/bpf/tracex6_kern.c | 27 +++++++++++
samples/bpf/tracex6_user.c | 67 +++++++++++++++++++++++++++
12 files changed, 321 insertions(+), 3 deletions(-)
create mode 100644 samples/bpf/tracex6_kern.c
create mode 100644 samples/bpf/tracex6_user.c

--
1.8.3.4

Kaixu Xia

unread,

Jul 22, 2015, 4:20:06 AM7/22/15

to

Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map will only store the pointer to struct perf_event.

Signed-off-by: Kaixu Xia <xiak...@huawei.com>
---
include/linux/bpf.h | 2 ++
include/uapi/linux/bpf.h | 1 +
kernel/bpf/arraymap.c | 40 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 43 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..f6a2442 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -11,6 +11,8 @@
#include <linux/workqueue.h>
#include <linux/file.h>

+#define MAX_PERF_EVENT_ARRAY_ENTRY (2*NR_CPUS)
+
struct bpf_map;

/* map is generic key/value storage optionally accesible by eBPF programs */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..69a1f6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..183c1f7 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -255,3 +255,43 @@ static int __init register_prog_array_map(void)
return 0;
}
late_initcall(register_prog_array_map);
+
+static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
+{
+ /* only the pointer to struct perf_event can be stored in
+ * perf_event_array map
+ */
+ if (attr->value_size != sizeof(void *))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->max_entries > MAX_PERF_EVENT_ARRAY_ENTRY)
+ return ERR_PTR(-EINVAL);
+
+ return array_map_alloc(attr);
+}
+
+static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = perf_event_array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = perf_event_array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);

Kaixu Xia

unread,

Jul 22, 2015, 4:20:07 AM7/22/15

to

We want to traverse the map elements and make use
of the map value one by one. So add new function
map->ops->map_traverse_elem() to traverse map elements.

Signed-off-by: Kaixu Xia <xiak...@huawei.com>
---

include/linux/bpf.h | 3 +++
kernel/bpf/arraymap.c | 17 +++++++++++++++++
2 files changed, 20 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f6a2442..257149c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -15,6 +15,8 @@

struct bpf_map;

+typedef int (*bpf_map_traverse_callback)(void *value);
+

/* map is generic key/value storage optionally accesible by eBPF programs */

struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
@@ -26,6 +28,7 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
+ int (*map_traverse_elem)(bpf_map_traverse_callback func, struct bpf_map *map);
};

struct bpf_map {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 183c1f7..410bc40 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -276,12 +276,29 @@ static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}

+static int perf_event_array_map_traverse_elem(bpf_map_traverse_callback func,
+ struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *value;
+ int i;
+
+ for(i = 0; i < array->map.max_entries; i++) {
+ value = array->value + array->elem_size * i;
+
+ func(value);
+ }
+

+ return 0;
+}
+

static const struct bpf_map_ops perf_event_array_ops = {

.map_alloc = perf_event_array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = perf_event_array_map_get_next_key,
.map_lookup_elem = array_map_lookup_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_traverse_elem = perf_event_array_map_traverse_elem,
};

static struct bpf_map_type_list perf_event_array_type __read_mostly = {

Alexei Starovoitov

unread,

Jul 22, 2015, 8:50:08 PM7/22/15

to

On 7/22/15 1:09 AM, Kaixu Xia wrote:
> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
> This map will only store the pointer to struct perf_event.
>
> Signed-off-by: Kaixu Xia <xiak...@huawei.com>
> ---
> include/linux/bpf.h | 2 ++
> include/uapi/linux/bpf.h | 1 +
> kernel/bpf/arraymap.c | 40 ++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 43 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 4383476..f6a2442 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -11,6 +11,8 @@
> #include <linux/workqueue.h>
> #include <linux/file.h>
>
> +#define MAX_PERF_EVENT_ARRAY_ENTRY (2*NR_CPUS)

why this artificial limit?
Just drop it.

> +static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
> +{
> + /* only the pointer to struct perf_event can be stored in
> + * perf_event_array map
> + */
> + if (attr->value_size != sizeof(void *))
> + return ERR_PTR(-EINVAL);

hmm. that's odd. why reinvent things? please do the same as
prog_array does. Namely below:

> +static const struct bpf_map_ops perf_event_array_ops = {
> + .map_alloc = perf_event_array_map_alloc,
> + .map_free = array_map_free,
> + .map_get_next_key = perf_event_array_map_get_next_key,
> + .map_lookup_elem = array_map_lookup_elem,
> + .map_delete_elem = array_map_delete_elem,

this is broken. you don't want programs to manipulate
'struct perf_event *' pointers.
lookup/update/delete helpers shouldn't be accessible from the programs
then update/delete can be cleanly implemented and called via syscall.
See how prog_array does it.

Also please collapse patches 1-3 into one. Their logically one piece.
I'll comment on them as well, but it would have been easier for me
and you if their were part of one email thread.

Alexei Starovoitov

unread,

Jul 22, 2015, 9:10:05 PM7/22/15

to

On 7/22/15 1:09 AM, Kaixu Xia wrote:

> We want to traverse the map elements and make use
> of the map value one by one. So add new function
> map->ops->map_traverse_elem() to traverse map elements.

...

> @@ -26,6 +28,7 @@ struct bpf_map_ops {
> void *(*map_lookup_elem)(struct bpf_map *map, void *key);
> int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
> int (*map_delete_elem)(struct bpf_map *map, void *key);
> + int (*map_traverse_elem)(bpf_map_traverse_callback func, struct bpf_map *map);

I think 'traverse' is too error prone.
Better to add 'map_delete_all_elem' callback.
Then convert bpf_prog_array_map_clear() into such callback
and similar callback from perf_event_array.
Then call it with from bpf_map_release() and drop
if (map->map_type == PROG_ARRAY) there.
Just unconditionally map->ops->map_delete_all_elem().
Though looking at patch 3. I don't see why you need that.
prog_array has to do it due to potential circular dependency
between prog_fd of a program that uses a prog_array_fd and
stores prog_fd inside that prog_array.
In case of perf_event_array no such thing possible.
Just release them as part of map_free.

Wangnan (F)

unread,

Jul 22, 2015, 9:20:06 PM7/22/15

to

On 2015/7/23 8:48, Alexei Starovoitov wrote:
> On 7/22/15 1:09 AM, Kaixu Xia wrote:
>> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
>> This map will only store the pointer to struct perf_event.
>>
>> Signed-off-by: Kaixu Xia <xiak...@huawei.com>
>> ---
>> include/linux/bpf.h | 2 ++
>> include/uapi/linux/bpf.h | 1 +
>> kernel/bpf/arraymap.c | 40 ++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 43 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 4383476..f6a2442 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -11,6 +11,8 @@
>> #include <linux/workqueue.h>
>> #include <linux/file.h>
>>
>> +#define MAX_PERF_EVENT_ARRAY_ENTRY (2*NR_CPUS)
>
> why this artificial limit?
> Just drop it.
>

Then we should find another way to prevent user create a large map but
only use
a small portion of it, since normal array map can't report whether a
slot is used
or not. When releasing the map, we have to release each perf event.
Currently the only
way is to check map value in each slot.

Alexei Starovoitov

unread,

Jul 22, 2015, 9:40:09 PM7/22/15

to

On 7/22/15 6:08 PM, Wangnan (F) wrote:
> Then we should find another way to prevent user create a large map but
> only use
> a small portion of it, since normal array map can't report whether a
> slot is used
> or not. When releasing the map, we have to release each perf event.
> Currently the only
> way is to check map value in each slot.

yeah. In prog_array each element is either NULL ptr or valid
'struct bpf_prog *'. Why not to do the same for perf_event_array?

Kaixu Xia

unread,

Jul 23, 2015, 5:50:06 AM7/23/15

to

Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.

This map only stores the pointer to struct perf_event. The
user space event FDs from perf_event_open() syscall are converted
to the pointer to struct perf_event and stored in map.

Signed-off-by: Kaixu Xia <xiak...@huawei.com>
---
include/linux/bpf.h | 2 +

include/linux/perf_event.h | 2 +
include/uapi/linux/bpf.h | 1 +
kernel/bpf/arraymap.c | 113 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/verifier.c | 15 ++++++
kernel/events/core.c | 23 +++++++++
6 files changed, 156 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..9cf74c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
#include <uapi/linux/bpf.h>
#include <linux/workqueue.h>
#include <linux/file.h>
+#include <linux/perf_event.h>

struct bpf_map;

@@ -143,6 +144,7 @@ struct bpf_array {
union {
char value[0] __aligned(8);
struct bpf_prog *prog[0] __aligned(8);
+ struct perf_event *events[0] __aligned(8);
};
};
#define MAX_TAIL_CALL_CNT 32
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..2ea4067 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,7 @@ extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
@@ -979,6 +980,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
+static struct perf_event *perf_event_get(unsigned int fd) { return NULL; }
static inline void perf_event_print_debug(void) { }
static inline int perf_event_task_disable(void) { return -EINVAL; }
static inline int perf_event_task_enable(void) { return -EINVAL; }

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..69a1f6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

enum bpf_prog_type {

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..e97efbc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -255,3 +255,116 @@ static int __init register_prog_array_map(void)
return 0;
}
late_initcall(register_prog_array_map);
+

+static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
+{
+ /* only the pointer to struct perf_event can be stored in
+ * perf_event_array map
+ */

+ if (attr->value_size != sizeof(u32))

+ return ERR_PTR(-EINVAL);
+
+ return array_map_alloc(attr);
+}
+

+static void perf_event_array_map_free(struct bpf_map *map)

+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);

+ struct perf_event *event;
+ int i;
+
+ synchronize_rcu();
+
+ /* release the struct perf_event in perf_event_array_map */

+ for(i = 0; i < array->map.max_entries; i++) {

+ event = array->events[i];
+ if (event)
+ perf_event_release_kernel(event);
+ }
+ kvfree(array);

+}
+
+static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+

+static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static struct perf_event *convert_map_with_perf_event(void *value)

+{
+ struct perf_event *event;

+ u32 fd;
+
+ fd = *(u32 *)value;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return NULL;
+
+ /* limit the event type to PERF_TYPE_RAW
+ * and PERF_TYPE_HARDWARE.
+ */
+ if (event->attr.type != PERF_TYPE_RAW &&
+ event->attr.type != PERF_TYPE_HARDWARE)
+ return NULL;

+
+ return event;
+}

+
+/* only called from syscall */
+static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)

+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);

+ struct perf_event *event;
+ u32 index = *(u32 *)key;
+
+ if (map_flags != BPF_ANY)
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ /* check if the value is already stored */
+ if (array->events[index])
+ return -EINVAL;
+
+ /* convert the fd to the pointer to struct perf_event */
+ event = convert_map_with_perf_event(value);
+ if (!event)
+ return -EBADF;
+
+ xchg(array->events + index, event);

+ return 0;
+}
+

+static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)

+{
+ return -EINVAL;
+}
+

+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = perf_event_array_map_alloc,

+ .map_free = perf_event_array_map_free,
+ .map_get_next_key = perf_event_array_map_get_next_key,
+ .map_lookup_elem = perf_event_array_map_lookup_elem,
+ .map_update_elem = perf_event_array_map_update_elem,
+ .map_delete_elem = perf_event_array_map_delete_elem,

+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866..c70f7e7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -924,6 +924,21 @@ static int check_call(struct verifier_env *env, int func_id)
*/
return -EINVAL;

+ if (map && map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+ func_id != BPF_FUNC_perf_event_read)
+ /* perf_event_array map type needs extra care:
+ * only allow to pass it into bpf_perf_event_read() for now.
+ * bpf_map_update/delete_elem() must only be done via syscall
+ */
+ return -EINVAL;
+
+ if (func_id == BPF_FUNC_perf_event_read &&
+ map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ /* don't allow any other map type to be passed into
+ * bpf_perf_event_read()
+ */
+ return -EINVAL;
+
return 0;
}

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae34..08cb467 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8574,6 +8574,29 @@ void perf_event_delayed_put(struct task_struct *task)

WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

+struct perf_event *perf_event_get(unsigned int fd)
+{
+ struct perf_event *event;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);

+ return ERR_PTR(-EINVAL);
+ }
+

+ event = f.file->private_data;
+

+ atomic_long_inc(&event->refcount);

+ fdput(f);
+
+ return event;
+}

+
/*
* inherit a event from parent task to child task:
*/
--
1.8.3.4

Kaixu Xia

unread,

Jul 23, 2015, 5:50:07 AM7/23/15

to

This is a simple example and shows how to use the new ability
to get the selected Hardware PMU counter value.

Signed-off-by: Kaixu Xia <xiak...@huawei.com>
---

samples/bpf/Makefile | 4 +++
samples/bpf/bpf_helpers.h | 2 ++
samples/bpf/tracex6_kern.c | 26 ++++++++++++++++++
samples/bpf/tracex6_user.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 99 insertions(+)

create mode 100644 samples/bpf/tracex6_kern.c
create mode 100644 samples/bpf/tracex6_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4450fed..63e7d50 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex2
hostprogs-y += tracex3
hostprogs-y += tracex4
hostprogs-y += tracex5
+hostprogs-y += tracex6
hostprogs-y += lathist

test_verifier-objs := test_verifier.o libbpf.o
@@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
+tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
lathist-objs := bpf_load.o libbpf.o lathist_user.o

# Tell kbuild to always build the programs
@@ -37,6 +39,7 @@ always += tracex2_kern.o
always += tracex3_kern.o
always += tracex4_kern.o
always += tracex5_kern.o
+always += tracex6_kern.o
always += tcbpf1_kern.o
always += lathist_kern.o

@@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
HOSTLOADLIBES_tracex4 += -lelf -lrt
HOSTLOADLIBES_tracex5 += -lelf
+HOSTLOADLIBES_tracex6 += -lelf
HOSTLOADLIBES_lathist += -lelf

# point this to your LLVM backend with bpf support
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index bdf1c16..c8a3594 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_perf_event_read)(void *map, int index) =
+ (void *) BPF_FUNC_perf_event_read;

/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 0000000..d213161
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,26 @@
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(unsigned long),
+ .max_entries = 32,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ u64 count;
+ u32 key = bpf_get_smp_processor_id();
+ char fmt[] = "CPU-%d %llu\n";
+
+ count = bpf_perf_event_read(&my_map, &key);
+ bpf_trace_printk(fmt, sizeof(fmt), key, count);
+

+ return 0;
+}
+

+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 0000000..30307c9
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,67 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static void test_bpf_perf_event(void)
+{
+ int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ int *pmu_fd = malloc(nr_cpus * sizeof(int));
+ unsigned long value;
+ int i;
+
+ struct perf_event_attr attr_insn_pmu = {
+ .freq = 0,
+ .sample_period = 0x7fffffffffffffffULL,
+ .inherit = 0,
+ .type = PERF_TYPE_HARDWARE,
+ .read_format = 0,
+ .sample_type = 0,
+ .config = 0,/* PMU: cycles */
+ };
+
+ for(i = 0; i < nr_cpus; i++) {
+ pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+ if (pmu_fd[i] < 0)
+ printf("event syscall failed ****\n");
+
+ bpf_update_elem(map_fd[0], &i, (pmu_fd + i), BPF_ANY);
+
+ ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+ }
+
+ system("ls");
+ system("pwd");
+ system("sleep 2");
+
+ for(i = 0; i < nr_cpus; i++)
+ close(pmu_fd[i]);
+
+ close(map_fd);
+
+ free(pmu_fd);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ test_bpf_perf_event();

+
+ return 0;
+}

Kaixu Xia

unread,

Jul 23, 2015, 5:50:07 AM7/23/15

to

Previous patch v2 url:
https://lkml.org/lkml/2015/7/22/104

This patchset allows user read PMU events in the following way:
1. Open the PMU using perf_event_open() (for each CPUs or for
each processes he/she'd like to watch);
2. Create a BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map;
3. Insert FDs into the map with some key-value mapping scheme
(i.e. cpuid -> event on that CPU);
4. Load and attach eBPF programs as usual;

5. In eBPF program, get the perf_event_map_fd and index (i.e.

cpuid get from bpf_get_smp_processor_id()) then use
bpf_perf_event_read() to read from it.
6. Do anything he/her want.

changes in V3:
- collapse V2 patches 1-3 into one;
- drop the function map->ops->map_traverse_elem() and release
the struct perf_event in map_free;
- only allow to access bpf_perf_event_read() from programs;
- update the perf_event_array_map elem via xchg();
- pass index directly to bpf_perf_event_read() instead of
MAP_KEY;

changes in V2:
- put atomic_long_inc_not_zero() between fdget() and fdput();
- limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
- Only read the event counter on current CPU or on current
process;
- add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the
pointer to the struct perf_event;
- according to the perf_event_map_fd and key, the function
bpf_perf_event_read() can get the Hardware PMU counter value;

Patch 3/3 is a simple example and shows how to use this new eBPF

programs ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace(trace_pipe).(the cycles PMU
value when 'kprobe/sys_write' sampling)

$ cat /sys/kernel/debug/tracing/trace_pipe
$ ./tracex6
...

cat-674 [000] d..1 146.413405: : CPU-0 2558223
<...>-699 [003] d..1 146.413441: : CPU-3 2663985
cat-674 [000] d..1 146.413480: : CPU-0 2659705
<...>-699 [003] d..1 146.413516: : CPU-3 2765199
cat-674 [000] d..1 146.413555: : CPU-0 2761277
<...>-699 [003] d..1 146.413600: : CPU-3 2877051
cat-674 [000] d..1 146.413651: : CPU-0 2889668
<...>-699 [003] d..1 146.413696: : CPU-3 3006447
cat-674 [000] d..1 146.413749: : CPU-0 3021285
<...>-699 [003] d..1 146.413787: : CPU-3 3131459
cat-674 [000] d..1 146.413838: : CPU-0 3141959
<...>-699 [003] d..1 146.413886: : CPU-3 3264188
cat-674 [000] d..1 146.413930: : CPU-0 3266461
<...>-699 [003] d..1 146.413973: : CPU-3 3381038
cat-674 [000] d..1 146.414025: : CPU-0 3393730
<...>-699 [003] d..1 146.414071: : CPU-3 3514676

...

The detail of patches is as follow:

Patch 1/3 introduces a new bpf map type. This map only stores the
pointer to struct perf_event;

Patch 2/3 implement function bpf_perf_event_read() that get the selected
hardware PMU conuter;

Patch 3/3 give a simple example.

Kaixu Xia (3):
bpf: Add new bpf map type to store the pointer to struct perf_event

bpf: Implement function bpf_perf_event_read() that get the selected
hardware PMU conuter
samples/bpf: example of get selected PMU counter value

kernel/trace/bpf_trace.c | 2 +
samples/bpf/Makefile | 4 ++
samples/bpf/bpf_helpers.h | 2 +

samples/bpf/tracex6_kern.c | 26 +++++++++++
samples/bpf/tracex6_user.c | 67 +++++++++++++++++++++++++++
12 files changed, 299 insertions(+), 3 deletions(-)

create mode 100644 samples/bpf/tracex6_kern.c
create mode 100644 samples/bpf/tracex6_user.c

Kaixu Xia

unread,

Jul 23, 2015, 5:50:07 AM7/23/15

to

According to the perf_event_map_fd and index, the function
bpf_perf_event_read() can convert the corresponding map
value to the pointer to struct perf_event and return the

Hardware PMU counter value.

Signed-off-by: Kaixu Xia <xiak...@huawei.com>
---

include/linux/bpf.h | 1 +
include/linux/perf_event.h | 3 ++-
include/uapi/linux/bpf.h | 1 +
kernel/bpf/helpers.c | 36 ++++++++++++++++++++++++++++++++++++
kernel/events/core.c | 4 ++--
kernel/trace/bpf_trace.c | 2 ++
6 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9cf74c0..0954b8f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
extern const struct bpf_func_proto bpf_map_delete_elem_proto;

+extern const struct bpf_func_proto bpf_perf_event_read_proto;
extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2ea4067..899abcb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -662,7 +662,8 @@ extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
-
+extern void __perf_event_read(void *info);
+extern u64 perf_event_count(struct perf_event *event);

struct perf_sample_data {
/*
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69a1f6b..b9b13ce 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -250,6 +250,7 @@ enum bpf_func_id {
* Return: 0 on success
*/
BPF_FUNC_get_current_comm,
+ BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */
__BPF_FUNC_MAX_ID,
};

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec0..aab219d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -182,3 +182,39 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE,
};
+
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;

+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+

+ if (index >= array->map.max_entries)
+ return -E2BIG;
+

+ event = array->events[index];

+ if (!event)
+ return -EBADF;
+

+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return -ENOENT;
+
+ if (event->oncpu != raw_smp_processor_id() &&
+ event->ctx->task != current)
+ return -EINVAL;
+
+ if (event->attr.inherit)
+ return -EINVAL;
+
+ __perf_event_read(event);
+
+ return perf_event_count(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+ .func = bpf_perf_event_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 08cb467..c59d9c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3177,7 +3177,7 @@ void perf_event_exec(void)
/*
* Cross CPU call to read the hardware event
*/
-static void __perf_event_read(void *info)
+void __perf_event_read(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -3204,7 +3204,7 @@ static void __perf_event_read(void *info)
raw_spin_unlock(&ctx->lock);
}

-static inline u64 perf_event_count(struct perf_event *event)
+u64 perf_event_count(struct perf_event *event)
{
if (event->pmu->count)
return event->pmu->count(event);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041a..9cf094f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -183,6 +183,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
default:
return NULL;

Alexei Starovoitov

unread,

Jul 23, 2015, 7:00:07 PM7/23/15

to

On 7/23/15 2:42 AM, Kaixu Xia wrote:
> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
> This map only stores the pointer to struct perf_event. The
> user space event FDs from perf_event_open() syscall are converted
> to the pointer to struct perf_event and stored in map.

...

> +static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
> +{
> + /* only the pointer to struct perf_event can be stored in
> + * perf_event_array map
> + */
> + if (attr->value_size != sizeof(u32))
> + return ERR_PTR(-EINVAL);
> +
> + return array_map_alloc(attr);
> +}

since it's exactly the same as prog_array_map_alloc(),
just rename it to something like 'fd_array_map_alloc'
and use for both types.

> +static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
> + void *next_key)
> +{
> + return -EINVAL;
> +}
> +
> +static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> + return NULL;
> +}

same for the above two.
rename prog_array_map_* into fd_array_map_* and use for both map types.

> +static struct perf_event *convert_map_with_perf_event(void *value)
> +{
> + struct perf_event *event;
> + u32 fd;
> +
> + fd = *(u32 *)value;
> +
> + event = perf_event_get(fd);
> + if (IS_ERR(event))
> + return NULL;

don't lose error code, do 'return event' instead.

> +
> + /* limit the event type to PERF_TYPE_RAW
> + * and PERF_TYPE_HARDWARE.
> + */
> + if (event->attr.type != PERF_TYPE_RAW &&
> + event->attr.type != PERF_TYPE_HARDWARE)
> + return NULL;

perf_event refcnt leak? need to do put_event.
and return ERR_PTR(-EINVAL)

> +

> + return event;
> +}
> +
> +/* only called from syscall */
> +static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
> + void *value, u64 map_flags)
> +{
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + struct perf_event *event;
> + u32 index = *(u32 *)key;
> +
> + if (map_flags != BPF_ANY)
> + return -EINVAL;
> +
> + if (index >= array->map.max_entries)
> + return -E2BIG;
> +
> + /* check if the value is already stored */
> + if (array->events[index])
> + return -EINVAL;
> +
> + /* convert the fd to the pointer to struct perf_event */
> + event = convert_map_with_perf_event(value);

imo helper name is misleading and it's too short to be separate
function. Just inline it and you can reuse 'index' variable.

> + if (!event)
> + return -EBADF;
> +
> + xchg(array->events + index, event);

refcnt leak of old event! Please think it through.
This type of bugs I shouldn't be finding.

> +static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
> +{
> + return -EINVAL;
> +}

no way to dec refcnt of perf_event from user space?
why not to do the same as prog_array_delete?

Alexei Starovoitov

unread,

Jul 23, 2015, 7:00:07 PM7/23/15

to

On 7/23/15 2:42 AM, Kaixu Xia wrote:

> According to the perf_event_map_fd and index, the function
> bpf_perf_event_read() can convert the corresponding map
> value to the pointer to struct perf_event and return the
> Hardware PMU counter value.
>
> Signed-off-by: Kaixu Xia <xiak...@huawei.com>

...

> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
> +{
> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + struct perf_event *event;
> +
> + if (index >= array->map.max_entries)
> + return -E2BIG;
> +
> + event = array->events[index];
> + if (!event)
> + return -EBADF;

probably ENOENT makes more sense here.

> +
> + if (event->state != PERF_EVENT_STATE_ACTIVE)
> + return -ENOENT;

and -EINVAL here?

Alexei Starovoitov

unread,

Jul 23, 2015, 7:10:05 PM7/23/15

to

On 7/23/15 2:42 AM, Kaixu Xia wrote:

> This is a simple example and shows how to use the new ability
> to get the selected Hardware PMU counter value.
>
> Signed-off-by: Kaixu Xia <xiak...@huawei.com>

...

> +struct bpf_map_def SEC("maps") my_map = {
> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> + .key_size = sizeof(int),
> + .value_size = sizeof(unsigned long),
> + .max_entries = 32,
> +};

wait. how did it work here? value_size should be u32.

Daniel Borkmann

unread,

Jul 23, 2015, 7:40:06 PM7/23/15

to

On 07/22/2015 10:09 AM, Kaixu Xia wrote:
> Previous patch v1 url:
> https://lkml.org/lkml/2015/7/17/287

[ Sorry to chime in late, just noticed this series now as I wasn't in Cc for
the core BPF changes. More below ... ]

So far all the map backends are of generic nature, knowing absolutely nothing
about a particular consumer/subsystem of eBPF (tc, socket filters, etc). The
tail call is a bit special, but nevertheless generic for each user and [very]
useful, so it makes sense to inherit from the array map and move the code there.

I don't really like that we start add new _special_-cased maps here into the
eBPF core code, it seems quite hacky. :( From your rather terse commit description
where you introduce the maps, I failed to see a detailed elaboration on this i.e.
why it cannot be abstracted any different?

xiakaixu

unread,

Jul 23, 2015, 10:00:05 PM7/23/15

to

于 2015/7/24 6:59, Alexei Starovoitov 写道:
> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>> This is a simple example and shows how to use the new ability
>> to get the selected Hardware PMU counter value.
>>
>> Signed-off-by: Kaixu Xia <xiak...@huawei.com>
> ...
>> +struct bpf_map_def SEC("maps") my_map = {
>> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>> + .key_size = sizeof(int),
>> + .value_size = sizeof(unsigned long),
>> + .max_entries = 32,
>> +};
>
> wait. how did it work here? value_size should be u32.

I tested the whole thing on ARM board. You are ringt, it
should be u32.
When create the array map, we choose the array->elem_size as
round_up(attr->value_size, 8), why 8?

Thanks!

xiakaixu

unread,

Jul 23, 2015, 10:00:06 PM7/23/15

to

Yeah, the errno is better.

Thanks!
>
>
> .

Alexei Starovoitov

unread,

Jul 23, 2015, 10:30:06 PM7/23/15

to

On 7/23/15 7:22 PM, xiakaixu wrote:
>>> + /* check if the value is already stored */
>>> >>+ if (array->events[index])
>>> >>+ return -EINVAL;
>>> >>+
>>> >>+ /* convert the fd to the pointer to struct perf_event */
>>> >>+ event = convert_map_with_perf_event(value);
>> >
>> >imo helper name is misleading and it's too short to be separate
>> >function. Just inline it and you can reuse 'index' variable.
>> >
>>> >>+ if (!event)
>>> >>+ return -EBADF;
>>> >>+
>>> >>+ xchg(array->events + index, event);
>> >
>> >refcnt leak of old event! Please think it through.
>> >This type of bugs I shouldn't be finding.

> Maybe the commit message is not elaborate. Here I prevent
> user space from updating the existed event, so the return
> value of xchg() is NULL and no refcnt leak of old event.
> I will do the same as prog_array in next version.

I see then it's even worse.
You think that above check:

+ if (array->events[index])
+ return -EINVAL;

will protect the double insert?
It won't, since there are no locks here.
You can have two processes both seeing empty slot and
racing to do xchg.

Alexei Starovoitov

unread,

Jul 23, 2015, 10:30:08 PM7/23/15

to

On 7/23/15 6:54 PM, xiakaixu wrote:
> 于 2015/7/24 6:59, Alexei Starovoitov 写道:
>> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>>> This is a simple example and shows how to use the new ability
>>> to get the selected Hardware PMU counter value.
>>>
>>> Signed-off-by: Kaixu Xia <xiak...@huawei.com>
>> ...
>>> +struct bpf_map_def SEC("maps") my_map = {
>>> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>>> + .key_size = sizeof(int),
>>> + .value_size = sizeof(unsigned long),
>>> + .max_entries = 32,
>>> +};
>>
>> wait. how did it work here? value_size should be u32.
>
> I tested the whole thing on ARM board. You are ringt, it
> should be u32.
> When create the array map, we choose the array->elem_size as
> round_up(attr->value_size, 8), why 8?

because from user space point of view we're storing FDs
which are u32, but kernel stores pointers.
but round_up(attr->value_size, 8) is done because there
can be 8 byte fields in there and we have 8-byte load/store insns.
So whether pointer is 32 or 64-bit they still fit.

xiakaixu

unread,

Jul 23, 2015, 10:30:08 PM7/23/15

to

Maybe the commit message is not elaborate. Here I prevent
user space from updating the existed event, so the return
value of xchg() is NULL and no refcnt leak of old event.
I will do the same as prog_array in next version.
>

>> +static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
>> +{
>> + return -EINVAL;
>> +}
>
> no way to dec refcnt of perf_event from user space?
> why not to do the same as prog_array_delete?

Will follow them in V4.
>
>
> .

xiakaixu

unread,

Jul 24, 2015, 10:20:06 PM7/24/15

to

于 2015/7/24 7:33, Daniel Borkmann 写道:
> On 07/22/2015 10:09 AM, Kaixu Xia wrote:
>> Previous patch v1 url:
>> https://lkml.org/lkml/2015/7/17/287
>
> [ Sorry to chime in late, just noticed this series now as I wasn't in Cc for
> the core BPF changes. More below ... ]

Sorry about this, will add you to the CC list:) Welcome your comments.

It will be very useful that giving the eBPF programs the ablility to access
hardware PMU counter, just as I mentioned in V1 commit message.
Of course, there are some special code when creating the perf_event type map
in V2, but you will find less special code in the next version(V3). I have
reused most of the prog_array map implementation. We can make the perf_event
array map more generic in the future.

BR.

> .

Peter Zijlstra

unread,

Aug 3, 2015, 5:40:06 AM8/3/15

to

Please no poking of event internal state outside of perf code.

Peter Zijlstra

unread,

Aug 3, 2015, 5:40:07 AM8/3/15

to

On Thu, Jul 23, 2015 at 09:42:40AM +0000, Kaixu Xia wrote:
> +static struct perf_event *convert_map_with_perf_event(void *value)
> +{
> + struct perf_event *event;
> + u32 fd;
> +
> + fd = *(u32 *)value;
> +
> + event = perf_event_get(fd);
> + if (IS_ERR(event))
> + return NULL;
> +
> + /* limit the event type to PERF_TYPE_RAW
> + * and PERF_TYPE_HARDWARE.
> + */
> + if (event->attr.type != PERF_TYPE_RAW &&
> + event->attr.type != PERF_TYPE_HARDWARE)
> + return NULL;

Aside from the ref-leak already mentioned; please introduce something
like:

const struct perf_event_attr *perf_event_attrs(struct perf_event *event);

To avoid having to poke inside of the event outside of perf code.

> +
> + return event;
> +}

xiakaixu

unread,

Aug 3, 2015, 6:40:06 AM8/3/15

to

Thanks for your review. I will move it to kernel/events/core.c.