[RFC][PATCH v2 06/11] perf: core, export pmus via sysfs

Lin Ming

unread,

May 18, 2010, 1:50:03 PM5/18/10

to

For now, this patch only exports CPU hardware events.

For each PMU, there are 2 sysfs dirs: event_source and events.

For example,
/sys/devices/system/cpu/event_source/
/sys/devices/system/cpu/events/

$ tree /sys/devices/system/cpu/event_source/
/sys/devices/system/cpu/event_source/
`-- id

Signed-off-by: Lin Ming <ming....@intel.com>
---
drivers/base/node.c | 3 +-
include/linux/node.h | 2 +
include/linux/perf_event.h | 10 +++
kernel/perf_event.c | 169 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 183 insertions(+), 1 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 057979a..3b77585 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -19,10 +19,11 @@

static struct sysdev_class_attribute *node_state_attrs[];

-static struct sysdev_class node_class = {
+struct sysdev_class node_class = {
.name = "node",
.attrs = node_state_attrs,
};
+EXPORT_SYMBOL(node_class);

static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf)
diff --git a/include/linux/node.h b/include/linux/node.h
index 06292da..43e4422 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -50,6 +50,8 @@ extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk);
extern void register_hugetlbfs_with_node(node_registration_func_t doregister,
node_registration_func_t unregister);
#endif
+
+extern struct sysdev_class node_class;
#else
static inline int register_one_node(int nid)
{
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ad2aea3..ef519d2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -552,12 +552,18 @@ struct perf_event;
#define PMU_TYPE_CPU 0
#define PMU_TYPE_NODE 1

+struct perf_event_kobject {
+ struct kobject kobj;
+ u64 id;
+};
+
/**
* struct pmu - generic performance monitoring unit
*/
struct pmu {
int id;
struct list_head entry;
+ struct kobject kobj;

int (*enable) (struct perf_event *event);
void (*disable) (struct perf_event *event);
@@ -577,6 +583,7 @@ struct pmu {
int (*commit_txn) (struct pmu *pmu);

int (*init_event) (struct perf_event *event);
+ int (*register_events) (struct pmu *pmu, struct kobject *events_kobj);
};

/**
@@ -1022,6 +1029,9 @@ extern void perf_event_disable(struct perf_event *event);

extern int perf_event_register_pmu(struct pmu *pmu);
extern void perf_event_unregister_pmu(int id);
+char *perf_hw_event_name(int id);
+char *perf_hw_cache_event_name(u8 type, u8 op, u8 result);
+extern struct kobj_type event_ktype;
#else
static inline void
perf_event_task_sched_in(struct task_struct *task) { }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 44e35ad..f6df0f8 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -12,6 +12,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/node.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
@@ -5767,6 +5768,174 @@ static int __init perf_event_sysfs_init(void)
}
device_initcall(perf_event_sysfs_init);

+static char *hw_event_names[] = {
+ "cycles",
+ "instructions",
+ "cache-references",
+ "cache-misses",
+ "branches",
+ "branch-misses",
+ "bus-cycles",
+};
+
+static char *hw_cache[] = {
+ "L1-dcache",
+ "L1-icache",
+ "LLC",
+ "dTLB",
+ "iTLB",
+ "branch",
+};
+
+static char *hw_cache_op[] = {
+ "load",
+ "store",
+ "prefetch",
+};
+
+static char *hw_cache_result[] = {
+ "refs",
+ "misses",
+};
+
+char *perf_hw_event_name(int id)
+{
+ if (id >= ARRAY_SIZE(hw_event_names))
+ return NULL;
+
+ return hw_event_names[id];
+}
+
+char *perf_hw_cache_event_name(u8 cache_type, u8 cache_op, u8 cache_result)
+{
+ static char name[50];
+
+ sprintf(name, "%s-%s-%s", hw_cache[cache_type],
+ hw_cache_op[cache_op],
+ hw_cache_result[cache_result]);
+
+ return name;
+}
+
+static ssize_t event_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ int n;
+ struct perf_event_kobject *event_kobj =
+ container_of(kobj, struct perf_event_kobject, kobj);
+
+ n = sprintf(buf, "0x%llx\n", event_kobj->id);
+
+ return n;
+}
+
+static const struct sysfs_ops event_sysfs_ops = {
+ .show = event_show,
+ .store = NULL,
+};
+
+struct kobj_type event_ktype = {
+ .sysfs_ops = &event_sysfs_ops,
+};
+
+static ssize_t event_source_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ int n;
+ struct pmu *pmu = container_of(kobj, struct pmu, kobj);
+
+ n = sprintf(buf, "%d\n", pmu->id);
+
+ return n;
+}
+
+static const struct sysfs_ops event_source_sysfs_ops = {
+ .show = event_source_show,
+ .store = NULL,
+};
+
+static struct kobj_type event_source_ktype = {
+ .sysfs_ops = &event_source_sysfs_ops,
+};
+
+static struct attribute event_source_id_attr = {
+ .name = "id",
+ .mode = 0444,
+};
+
+static int __init perf_pmu_sysfs_init(void)
+{
+ struct pmu *pmu;
+ struct kobject *parent_kobj, *events_kobj;
+ int err = 0;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ switch (pmu->id) {
+ case PMU_TYPE_CPU:
+ parent_kobj = &cpu_sysdev_class.kset.kobj;
+ break;
+#ifdef CONFIG_NUMA
+ case PMU_TYPE_NODE:
+ parent_kobj = &node_class.kset.kobj;
+ break;
+#endif
+
+ /* TBD: add other pmu types later */
+ default:
+ parent_kobj = NULL;
+ break;
+ }
+
+ if (!parent_kobj)
+ continue;
+
+ /*
+ * Create event_source sysfs dir, for example
+ * /sys/devices/system/cpu/event_source
+ */
+ err = kobject_init_and_add(&pmu->kobj,
+ &event_source_ktype, parent_kobj, "event_source");
+ if (err)
+ break;
+
+ /*
+ * Create event_source/id attribute, for example
+ * /sys/devices/system/cpu/event_source/id
+ */
+ err = sysfs_create_file(&pmu->kobj, &event_source_id_attr);
+ if (err)
+ break;
+
+ /*
+ * Create events sysfs dir, for example
+ * /sys/devices/system/cpu/events
+ */
+ events_kobj = kobject_create_and_add("events", parent_kobj);
+ if (!events_kobj) {
+ err = -ENOMEM;
+ break;
+ }
+
+ /*
+ * Register all events under events dir, for example
+ * /sys/devices/system/cpu/events/cycles
+ * /sys/devices/system/cpu/events/instructions
+ */
+ if (pmu->register_events) {
+ err = pmu->register_events(pmu, events_kobj);
+ if (err)
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return err;
+}
+device_initcall(perf_pmu_sysfs_init);
+
int perf_event_register_pmu(struct pmu *pmu)
{
struct pmu *tmp;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Greg KH

unread,

May 18, 2010, 4:20:03 PM5/18/10

to

On Wed, May 19, 2010 at 01:46:42AM +0000, Lin Ming wrote:
> Now only exports cpu hardware events.
>
> For each PMU, there are 2 sysfs dirs: event_source and events.
>
> For example,
> /sys/devices/system/cpu/event_source/
> /sys/devices/system/cpu/events/
>
> $ tree /sys/devices/system/cpu/event_source/
> /sys/devices/system/cpu/event_source/
> `-- id
>
> $ tree /sys/devices/system/cpu/events/
> /sys/devices/system/cpu/events/
> |-- L1-dcache-load-misses
> | |-- event_source -> ../../event_source

What is this symlink for? Is it needed as they all seem to point to the
same thing.

Also, as you are trying to add new sysfs interfaces, please document
them in Documentation/ABI.

And, do you have to use "raw" kobjects here? Any chance you can use a
struct device instead?

I'm still not really understanding what exactly you are trying to show
in the sysfs tree here anyway, perhaps the documentation update will
clear that up for me.

thanks,

greg k-h

Greg KH

unread,

May 18, 2010, 4:20:02 PM5/18/10

to

On Wed, May 19, 2010 at 01:46:42AM +0000, Lin Ming wrote:

> +struct kobj_type event_ktype = {
> + .sysfs_ops = &event_sysfs_ops,
> +};

No release function? Hm, are you setting yourself up to be publicly
made fun of? You have read the kobject documentation about this, right?
Have you looked at the kernel logs for when you try to free this object?

thanks,

greg k-h

Lin Ming

unread,

May 18, 2010, 10:40:01 PM5/18/10

to

On Wed, 2010-05-19 at 04:05 +0800, Greg KH wrote:
> On Wed, May 19, 2010 at 01:46:42AM +0000, Lin Ming wrote:
> > Now only exports cpu hardware events.
> >
> > For each PMU, there are 2 sysfs dirs: event_source and events.
> >
> > For example,
> > /sys/devices/system/cpu/event_source/
> > /sys/devices/system/cpu/events/
> >
> > $ tree /sys/devices/system/cpu/event_source/
> > /sys/devices/system/cpu/event_source/
> > `-- id
> >
> > $ tree /sys/devices/system/cpu/events/
> > /sys/devices/system/cpu/events/
> > |-- L1-dcache-load-misses
> > | |-- event_source -> ../../event_source
>
> What is this symlink for? Is it needed as they all seem to point to the
> same thing.

The symlink is used to find the "event_source" of the "event".

/sys/devices/system/cpu/event_source/

/sys/devices/system/cpu/events/
|-- L1-dcache-load-misses
| |-- event_source -> ../../event_source

For above example, "event_source" is the cpu pmu and the "event" is
L1-dcache-load-misses.

Yes, they point to the same thing, because all the events
under /sys/devices/system/cpu/events/* are monitored by the same event
source, i.e., the cpu pmu (/sys/devices/system/cpu/event_source/).

>
> Also, as you are trying to add new sysfs interfaces, please document
> them in Documentation/ABI.

Will add the document.

>
> And, do you have to use "raw" kobjects here? Any chance you can use a
> struct device instead?

Let me think about this.

>
> I'm still not really understanding what exactly you are trying to show
> in the sysfs tree here anyway, perhaps the documentation update will
> clear that up for me.

To support multiple pmus(or call them event source), I want to show all
the pmus and events in the sysfs tree.

For cpu pmu,
/sys/devices/system/cpu/event_source/*
/sys/devices/system/cpu/events/*

For node pmu,
/sys/devices/system/node/event_source/*
/sys/devices/system/node/events/*

For mce,
/sys/kernel/events/mce/*

and so on...the exact placement is not yet final.

Thanks,
Lin Ming

Lin Ming

unread,

May 18, 2010, 10:40:01 PM5/18/10

to

On Wed, 2010-05-19 at 04:07 +0800, Greg KH wrote:
> On Wed, May 19, 2010 at 01:46:42AM +0000, Lin Ming wrote:
> > +struct kobj_type event_ktype = {
> > + .sysfs_ops = &event_sysfs_ops,
> > +};
>
> No release function? Hm, are you setting yourself up to be publically
> made fun of? You have read the kobject documentation about this, right?
> Have you looked at the kernel logs for when you try to free this object?

Ah, will fix this "fun".

Greg KH

unread,

May 18, 2010, 10:50:01 PM5/18/10

to

Then, why have a symlink if they are all going to be the same?

> > I'm still not really understanding what exactly you are trying to show
> > in the sysfs tree here anyway, perhaps the documentation update will
> > clear that up for me.
>
> To support multiple pmus

What is a "pmu"?

> (or call them event source),

What is an "event source"?

> I want to show all
> the pmus and events in the sysfs tree.

Why do these things need to be in sysfs? What benefit is it going to
have?

> For cpu pmu,
> /sys/devices/system/cpu/event_source/*
> /sys/devices/system/cpu/events/*

What is a "cpu pmu"?

> For node pmu,
> /sys/devices/system/node/event_source/*
> /sys/devices/system/node/events/*

What is a "node pmu"?

> For mce,
> /sys/kernel/events/mce/*

What? No, don't create /sys/kernel/events/. Is that in this patchset?
Ick, that's such a "general" name that it is going to get confusing very
quickly. Heck, I still don't know what this "event" thing is yet :)

> and so on...the exact placement is not yet final.

Again, why do you need/want anything in sysfs in the first place?
What problem is it going to solve? Who is going to benefit? Why do
they care? What is this whole thing about?

Lin Ming

unread,

May 18, 2010, 11:50:02 PM5/18/10

to

So how about moving the symlink to the parent directory, like

/sys/devices/system/cpu/event_source/

/sys/devices/system/cpu/events/
|-- event_source -> ../event_source

>
> > > I'm still not really understanding what exactly you are trying to show
> > > in the sysfs tree here anyway, perhaps the documentation update will
> > > clear that up for me.
> >
> > To support multiple pmus
>
> What is a "pmu"?

Performance Monitoring Unit

>
> > (or call them event source),
>
> What is an "event source"?

The source who generates the event.
The event includes hardware events, cache-misses, bus-cycles,
interrupts, error conditions, hotplug......
and software events, context-switches, page-faults, sched events......

>
> > I want to show all
> > the pmus and events in the sysfs tree.
>
> Why do these things need to be in sysfs? What benifit is it going to
> have?

The kernel provides information about the event sources and their
hardware/software events via sysfs, and user-space can read
this info from sysfs.

>
> > For cpu pmu,
> > /sys/devices/system/cpu/event_source/*
> > /sys/devices/system/cpu/events/*
>
> What is a "cpu pmu"?

The performance monitoring unit in cpu.

>
> > For node pmu,
> > /sys/devices/system/node/event_source/*
> > /sys/devices/system/node/events/*
>
> What is a "node pmu"?

The pmu shared by logical cpus in a package, for example, Nehalem uncore
pmu, PowerPC nest units.

>
> > For mce,
> > /sys/kernel/events/mce/*
>
> What? No, don't create /sys/kernel/events/. Is that in this patchset?

No, not in this patchset.

Greg KH

unread,

May 19, 2010, 1:10:02 AM5/19/10

to

On Wed, May 19, 2010 at 11:40:29AM +0800, Lin Ming wrote:
> On Wed, 2010-05-19 at 10:48 +0800, Greg KH wrote:
> > On Wed, May 19, 2010 at 10:34:55AM +0800, Lin Ming wrote:
> > > /sys/devices/system/cpu/events/
> > > |-- L1-dcache-load-misses
> > > | |-- event_source -> ../../event_source
> > >
> > > For above example, "event_source" is the cpu pmu and the "event" is
> > > L1-dcache-load-misses.
> > >
> > > Yes, they point to the same thing, because all the events
> > > under /sys/devices/system/cpu/events/* is monitored by the same event
> > > source, ie, the cpu pmu(/sys/devices/system/cpu/event_source/).
> >
> > Then, why have a symlink if they are all going to be the same?
>
> So how about move the symlink to parent directory, like
>
> /sys/devices/system/cpu/event_source/
>
> /sys/devices/system/cpu/events/
> |-- event_source -> ../event_source

Again, why would that make any sense? You would be creating a symlink
to something that is always the same symlink. Why would that even be
needed?

> > > > I'm still not really understanding what exactly you are trying to show
> > > > in the sysfs tree here anyway, perhaps the documentation update will
> > > > clear that up for me.
> > >
> > > To support multiple pmus
> >
> > What is a "pmu"?
>
> Performance Monitoring Unit

What is a "unit"?

> > > (or call them event source),
> >
> > What is an "event source"?
>
> The source who generates the event.

The traditional way to define a word, or phrase, is to not use the word
or phrase in the definition, otherwise that definition makes no sense.

> The event includes hardware events, cache-misses, bus-cycles,
> interrupts, error conditions, hotplug......
> and software events, context-switches, page-faults, sched events......

So, stuff that happens to a CPU that is usually handled in the
performance counters portion of the CPU, right?

Why put this in sysfs? Why do you think that mapping this information
there makes sense?

> > > I want to show all
> > > the pmus and events in the sysfs tree.
> >
> > Why do these things need to be in sysfs? What benifit is it going to
> > have?
>
> Kernel to provide the information of the event source and
> hardware/software events via sysfs and user-space can get
> this info from sysfs.

But that's not what you were doing with your "open a sysfs file from
within the kernel and use the kobject there" logic, right? You were
doing everything within the kernel itself, so sysfs, and kobjects and
the like, aren't really needed at all, right?

How are you exporting this information to userspace today?

> > > For mce,
> > > /sys/kernel/events/mce/*
> >
> > What? No, don't create /sys/kernel/events/. Is that in this patchset?
>
> No, not in this patchset.

Good :)

> > Ick, that's such a "general" name that it is going to get confusing very
> > quickly. Heck, I still don't know what this "event" thing is yet :)
> >
> > > and so on...the exact placement is not yet final.
> >
> > Again, why do you need/want anything in sysfs in the first place?
> > What problem is it going to solve? Who is going to benifit? Why do
> > they care? What is this whole thing about?

You forgot to answer these questions...

Lin Ming

unread,

May 19, 2010, 2:40:01 AM5/19/10

to

On Wed, 2010-05-19 at 13:00 +0800, Greg KH wrote:
> On Wed, May 19, 2010 at 11:40:29AM +0800, Lin Ming wrote:
> > On Wed, 2010-05-19 at 10:48 +0800, Greg KH wrote:
> > > On Wed, May 19, 2010 at 10:34:55AM +0800, Lin Ming wrote:
> > > > /sys/devices/system/cpu/events/
> > > > |-- L1-dcache-load-misses
> > > > | |-- event_source -> ../../event_source
> > > >
> > > > For above example, "event_source" is the cpu pmu and the "event" is
> > > > L1-dcache-load-misses.
> > > >
> > > > Yes, they point to the same thing, because all the events
> > > > under /sys/devices/system/cpu/events/* is monitored by the same event
> > > > source, ie, the cpu pmu(/sys/devices/system/cpu/event_source/).
> > >
> > > Then, why have a symlink if they are all going to be the same?
> >
> > So how about move the symlink to parent directory, like
> >
> > /sys/devices/system/cpu/event_source/
> >
> > /sys/devices/system/cpu/events/
> > |-- event_source -> ../event_source
>
> Again, why would that make any sense? You would be creating a symlink
> to something that is always the same symlink. Why would that even be
> needed?

Do you mean the target of the symlink is also a symlink? No.

/sys/devices/system/cpu/events/event_source points
to /sys/devices/system/cpu/event_source.

/sys/devices/system/cpu/event_source/ is a sysfs dir, not a symlink.

>
> > > > > I'm still not really understanding what exactly you are trying to show
> > > > > in the sysfs tree here anyway, perhaps the documentation update will
> > > > > clear that up for me.
> > > >
> > > > To support multiple pmus
> > >
> > > What is a "pmu"?
> >
> > Performance Monitoring Unit
>
> What is a "unit"?
>
> > > > (or call them event source),
> > >
> > > What is an "event source"?
> >
> > The source who generates the event.
>
> The traditional way to define a word, or phrase, is to not use the word
> or phrase in the definition, otherwise that definition makes no sense.

Ingo's explanation is much clearer.

<---snip start--->
We _really_ don't want to call it a 'PMU' but 'events coming from an event
source'.

The reason is that a PMU is an existing term that is quite attached to a CPU -
while many hardware events come not from a PMU. Interrupts, error conditions,
hotplug events, etc. etc.

Furthermore, the name 'PMU' is even less correct for software events.

So let's stick with 'events' and with some container that originates them.
(event_source) Ok?
<---snip end--->

>
> > The event includes hardware events, cache-misses, bus-cycles,
> > interrupts, error conditions, hotplug......
> > and software events, context-switches, page-faults, sched events......
>
> So, stuff that happens to a CPU that are usually handled in the
> performance counters portion of the CPU, right?

For cpu hardware events(cycles, branch-misses, L1-dcache-loads etc...),
yes.

>
> Why put this in sysfs? Why do you think that mapping this information
> there makes sense?

sysfs, "It provides a means to export kernel data structures, their
attributes, and the linkages between them to userspace."

pmus and events have attributes that userspace tools want to know, for
example, pmu id, event config value.

And they also have linkage, for example, cycles event uses cpu pmu to
handle it, so

/sys/devices/system/cpu/events/cycles/event_source
---> /sys/devices/system/cpu/event_source

>
> > > > I want to show all
> > > > the pmus and events in the sysfs tree.
> > >
> > > Why do these things need to be in sysfs? What benifit is it going to
> > > have?
> >
> > Kernel to provide the information of the event source and
> > hardware/software events via sysfs and user-space can get
> > this info from sysfs.
>
> But that's not what you were doing with your "open a sysfs file from
> within the kernel and use the kobject there" logic, right? You were
> doing everything within the kernel itself, so sysfs, and kobjects and
> the like, aren't really needed at all, right?

No.

In userspace, open a sysfs file, see patch 11.
Then pass the open file's fd to the syscall sys_perf_event_open.

In kernel, use the passed in fd to find the pmu.

>
> How are you exporting this information to userspace today?

It is not exported today.

>
> > > > For mce,
> > > > /sys/kernel/events/mce/*
> > >
> > > What? No, don't create /sys/kernel/events/. Is that in this patchset?
> >
> > No, not in this patchset.
>
> Good :)
>
> > > Ick, that's such a "general" name that it is going to get confusing very
> > > quickly. Heck, I still don't know what this "event" thing is yet :)
> > >
> > > > and so on...the exact placement is not yet final.
> > >
> > > Again, why do you need/want anything in sysfs in the first place?
> > > What problem is it going to solve? Who is going to benifit? Why do
> > > they care? What is this whole thing about?
>
> You forgot to answer these questions...

We want to solve the problem of how to address an "event source".
I thought it was clear to export it via sysfs, but I may be totally wrong...

Thanks,
Lin Ming

Borislav Petkov

unread,

May 19, 2010, 3:10:02 AM5/19/10

to

From: Lin Ming <ming....@intel.com>
Date: Wed, May 19, 2010 at 10:34:55AM +0800

Hi,

[..]

>
> For mce,
> /sys/kernel/events/mce/*

have we considered the per-cpu granularity of MCEs here and if yes, how
to represent that?

Something like

/sys/kernel/events/mce/cpus/cpuX/..

or similar maybe. One use case I can think of right now is being able to
inject an MCE on a certain cpu...

Thanks.

--
Regards/Gruss,
Boris.

Operating Systems Research Center
Advanced Micro Devices, Inc.

Peter Zijlstra

unread,

May 19, 2010, 3:20:02 AM5/19/10

to

On Tue, 2010-05-18 at 19:48 -0700, Greg KH wrote:
> Again, why do you need/want anything in sysfs in the first place?
> What problem is it going to solve? Who is going to benifit? Why do
> they care? What is this whole thing about?

OK, so all of this is about perf_event. The story starts with CPUs
adding a PMU (Performance Monitor Unit) which allows the user to
count/sample cpu state.

The whole perf_counter subsystem was created to abstract this piece of
hardware and provide a kernel interface to it.

Then we realized that a generalization of the PMU exists in pretty much
everything that generates 'events' of interest and so we started adding
software PMUs that allowed us to do the same for tracepoints etc.

So we ended up with perf_events. A subsystem dedicated to counting
events and event based sampling.

Now the problem this patch set tries to solve; more hardware than the
CPU has such capabilities. There are memory controllers, bus controllers
and devices with similar capabilities.

So we need a way to identify and locate these things, and since sysfs
has the full machine topology in it, the idea was to represent these
things in sysfs as an event_source class.

Since the CPU and memory controllers are (assumed) symmetric on the
system, we get to add things like:

/sys/devices/system/cpu/cpu_event_source/
/sys/devices/system/node/node_event_source/

Devices like GPUs can do:

/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/radeon_event_source/

Hooking them into sysfs at the proper device/machine topology location
allows us to quickly locate and identify these 'event_sources'.

Since all hardware wants to keep life interesting they all work
differently and programming PMUs is no different, they count different
things, have different ways to program them etc. But for each class
there is a useful subset of things that is pretty uniform.

CPU based PMUs all can count things like clock-cycles and instructions,
Memory controllers can count things like local/remote memory accesses
etc.

So each class has a number of actual events that are worthy of
abstracting. The idea was to place these events in the event_source,
like:

/sys/devices/system/cpu/cpu_event_source/cycles/
/sys/devices/system/cpu/cpu_event_source/instructions/

And then there are the software event_sources that expose kernel events
(through tracepoints), currently tracepoints live
in /debug/tracing/events/ (or /sys/kernel/debug/tracing/events/ for
those so inclined). But the above abstraction would suggest we expose
them similarly.

I'm not sure where we'd want them to live, we could add them to:

/sys/kernel/tracepoint_event_source/

and have them live there, but I'm open to alternatives :-)

[ With event_source's being a sysfs-class, we also get a nice flat
collection in /sys/class/event_source/ helping those who get lost
in the device topology, me :-) ]

The next issue seems to be the interface between this sysfs
representation and the perf_event syscall, how do we go about creating
an actual perf_event object from this rich sysfs event_source class
object.

The sys_perf_event_open() call takes a struct perf_event_attr pointer
which describes the event and its properties. The current event
classification goes through:

struct perf_event_attr {
__u32 type;
__u64 config;

...
};

So my initial idea was to let each event_source have a type_id and let
each of its events have a config field and read those and insert them in
your structure.

So we'd get:

/sys/devices/system/cpu/cpu_event_source/type_id
/sys/devices/system/cpu/cpu_event_source/instructions/config

cat those to get: .type = 0, .config = 1
(PERF_TYPE_HARDWARE:PERF_COUNT_HW_INSTRUCTIONS).

Then Ingo objected and said, if we need to open and read those files, you
might as well just open one file and pass the fd along, saves some
syscalls.

So you'd end up doing:

fd = open("/sys/devices/system/cpu/cpu_event_source/instructions/config");
attr->type = fd | PERF_TYPE_FD;
event_fd = perf_event_open(attr, ... );
close(fd);

From that one fd we can find to which 'event_source' it belongs and what
particular config we need to use.

Plenty of opinions to be had on that I guess.

Anyway, this was the what, why and how of it.

Peter Zijlstra

unread,

May 19, 2010, 3:20:02 AM5/19/10

to

On Wed, 2010-05-19 at 09:06 +0200, Borislav Petkov wrote:
>
> have we considered the per-cpu granularity of MCEs here and if yes, how
> to represent that?

Uhm, by opening the MCE event on a particular cpu? Remember that
sys_perf_event_open() has a cpu target.

The thing is, CPUs are assumed symmetric, thus also all MCE events are
symmetric; one CPU cannot generate MCEs that another cannot. So the only
thing that differs is where you want to listen for them.

Ingo Molnar

unread,

May 19, 2010, 3:30:01 AM5/19/10

to

* Peter Zijlstra <pet...@infradead.org> wrote:

> On Wed, 2010-05-19 at 09:06 +0200, Borislav Petkov wrote:
> >
> > have we considered the per-cpu granularity of MCEs
> > here and if yes, how to represent that?
>
> Uhm, by opening the MCE event on a particular cpu?
> Remember that sys_perf_event_open() has a cpu target.
>
> The thing is, CPUs are assumed symmetric, thus also all
> MCE events are symmetric, one CPU cannot generate other
> MCE than another. So the only thing that differs is
> where you want to listen for them.

Note, this does not preclude Linux from supporting
asymmetric MP, should the need arise: if MP asymmetry is
supported then that can (and should) be expressed in the
sysfs topology accordingly - and the moment the asymmetric
MP topology is enumerated in sysfs it gives a place for
the different event_source's to live there as well.

Thanks,

Ingo

Greg KH

unread,

May 20, 2010, 2:50:02 PM5/20/10

to

On Wed, May 19, 2010 at 09:14:36AM +0200, Peter Zijlstra wrote:
> On Tue, 2010-05-18 at 19:48 -0700, Greg KH wrote:
> > Again, why do you need/want anything in sysfs in the first place?
> > What problem is it going to solve? Who is going to benifit? Why do
> > they care? What is this whole thing about?
>
>
> OK, so all of this is about perf_event. The story starts with CPUs
> adding a PMU (Performance Monitor Unit) which allows the user to
> count/sample cpu state.
>
> The whole perf_counter subsystem was created to abstract this piece of
> hardware and provide an kernel interface to it.
>
> Then we realized that a generalization of the PMU exists in pretty much
> everything that generates 'events' of interest and so we started adding
> software PMUs that allowed us to do the same for tracepoints etc.
>
> So we ended up with perf_events. A subsystem dedicated to counting
> events and event based sampling.
>
> Now the problem this patch set tries to solve; more hardware than the
> CPU has such capabilities. There are memory controllers, bus controllers
> and devices with similar capabilities.
>
> So we need a way to identify and locate these things, and since sysfs
> has the full machine topology in it, the idea was to represent these
> things in sysfs as an event_source class.
>
> Since the CPU and memory controllers are (assumed) symmetric on the
> system, we get to add things like:
>
>
> /sys/devices/system/cpu/cpu_event_source/

Wouldn't that really be:
/sys/devices/system/cpu/cpu0/cpu_event_source/
?

/sys/devices/system/cpu is a "type" of devices in the system here, and
isn't an event source specific to the device itself?

Or is it for all cpus together?

> /sys/devices/system/node/node_event_source/
>
> Devices like GPUs can do:
>
> /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/radeon_event_source/
>
> Hooking them into sysfs at the proper device/machine topology location
> allows us to quickly locate and identify these 'event_sources'.

Ok, this all makes a lot more sense now, thanks.

> Since all hardware wants to keep life interesting they all work
> differently and programming PMUs is no different, they count different
> things, have different ways to program them etc. But for each class
> there is a useful subset of things that is pretty uniform.
>
> CPU based PMUs all can count things like clock-cycles and instructions,
> Memory controllers can count things like local/remote memory accesses
> etc.
>
> So each class has a number of actual events that are worthy of
> abstracting. The idea was to place these events in the event_source,
> like:
>
> /sys/devices/system/cpu/cpu_event_source/cycles/
> /sys/devices/system/cpu/cpu_event_source/instructions/
>
>
>
> And then there are the software event_sources that expose kernel events
> (through tracepoints), currently tracepoints live
> in /debug/tracing/events/ (or /sys/kernel/debug/tracing/events/ for
> those so inclined). But the above abstraction would suggest we expose
> them similarly.
>
> I'm not sure where we'd want them to live, we could add them to:
>
> /sys/kernel/tracepoint_event_source/
>
> and have them live there, but I'm open to alternatives :-)

Once you go outside of /sys/devices/ you aren't playing with devices
properly, so you might just want to stick to a "class" and have
/sys/class/tracepoint_event_source/ where all of the devices would end
up symlinking to.

> [ With event_source's being a sysfs-class, we also get a nice flat
> collection in /sys/class/event_source/ helping those who get lost
> in the device topology, me :-) ]

Yes, but doesn't the fact that you can have different types of
event sources lend itself to different classes of event sources?

Ah, pass the fd of a sysfs file to sysfs to get the kobject. Ick,
that's just, well, something that I never even considered someone would
need/want to do...

sysfs exports single values just fine. If you are starting to do more
complex things, like you currently are, maybe you shouldn't be in
sysfs...

I can always knock up an eventfs for you to mount at /sys/kernel/events/
or something if you want :)

thanks,

greg k-h

Peter Zijlstra

unread,

May 20, 2010, 4:00:03 PM5/20/10

to

On Thu, 2010-05-20 at 11:42 -0700, Greg KH wrote:
> On Wed, May 19, 2010 at 09:14:36AM +0200, Peter Zijlstra wrote:

> > Since the CPU and memory controllers are (assumed) symmetric on the
> > system, we get to add things like:
> >
> >
> > /sys/devices/system/cpu/cpu_event_source/
>
> Wouldn't that really be:
> /sys/devices/system/cpu/cpu0/cpu_event_source/
> ?
>
> /sys/devices/system/cpu is a "type" of devices in the system here, and
> isn't an event source specific to the device itself?
>
> Or is it for all cpus together?

All CPUs are assumed identical, and the perf syscall has task/cpu
monitor targets. If the CPUs would not be identical (like Paul Mundt
said SH might do) then it would make sense to have different
event_sources for each cpu.

> > I'm not sure where we'd want them to live, we could add them to:
> >
> > /sys/kernel/tracepoint_event_source/
> >
> > and have them live there, but I'm open to alternatives :-)
>
> Once you go outside of /sys/devices/ you aren't playing with devices
> properly, so you might just want to stick to a "class" and have
> /sys/class/tracepoint_event_source/ where all of the devices would end
> up symlinking to.

Sure, that would work.

> > [ With event_source's being a sysfs-class, we also get a nice flat
> > collection in /sys/class/event_source/ helping those who get lost
> > in the device topology, me :-) ]
>
> Yes, but isn't the fact that you can have different types of
> event sources lend itself to different classes of event sources?

I'm not quite sure adding another abstraction level buys us much.

> > fd = open("/sys/devices/system/cpu/cpu_event_source/instructions/config");
> > attr->type = fd | PERF_TYPE_FD;
> > event_fd = perf_event_open(attr, ... );
> > close(fd);
> >
> > From that one fd we can find to which 'event_source' it belongs and what
> > particular config we need to use.
>
> Ah, pass the fd of a sysfs file to sysfs to get the kobject. Ick,
> that's just, well, something that I never even considered someone would
> need/want to do...

No, we don't pass the fd to sysfs, we pass the fd into a syscall.

> sysfs exports single values just fine. If you are starting to do more
> complex things, like you currently are, maybe you shouldn't be in
> sysfs...

Well, like I said, I'm fine with it actually being single values, it's just
that Ingo suggested skipping a few syscalls.

Robert just suggested we could use the sysfs files as device nodes and
have then open() return a perf_event fd. It's just that that would
require we add a ioctl to change the perf_event_attr structure and
attach it to a context.

> I can always knock up a eventfs for you do mount at /sys/kernel/events/
> or something if you want :)

But that won't get us the nice device linkage, right?

Greg KH

unread,

May 20, 2010, 4:20:02 PM5/20/10

to

On Thu, May 20, 2010 at 09:52:32PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-05-20 at 11:42 -0700, Greg KH wrote:
> > On Wed, May 19, 2010 at 09:14:36AM +0200, Peter Zijlstra wrote:
>
> > > Since the CPU and memory controllers are (assumed) symmetric on the
> > > system, we get to add things like:
> > >
> > >
> > > /sys/devices/system/cpu/cpu_event_source/
> >
> > Wouldn't that really be:
> > /sys/devices/system/cpu/cpu0/cpu_event_source/
> > ?
> >
> > /sys/devices/system/cpu is a "type" of devices in the system here, and
> > isn't an event source specific to the device itself?
> >
> > Or is it for all cpus together?
>
> All CPUs are assumed identical, and the perf syscall has task/cpu
> monitor targets. If the CPUs would not be identical (like Paul Mundt
> said SH might do) then it would make sense to have different
> event_sources for each cpu.

Ah, ok.

> > > fd = open("/sys/devices/system/cpu/cpu_event_source/instructions/config");
> > > attr->type = fd | PERF_TYPE_FD;
> > > event_fd = perf_event_open(attr, ... );
> > > close(fd);
> > >
> > > From that one fd we can find to which 'event_source' it belongs and what
> > > particular config we need to use.
> >
> > Ah, pass the fd of a sysfs file to sysfs to get the kobject. Ick,
> > that's just, well, something that I never even considered someone would
> > need/want to do...
>
> No, we don't pass the fd to sysfs, we pass the fd into a syscall.

Sorry, yes, that is what I was trying to say. You then take that fd,
pass it to the sysfs core within the kernel, and get a kobject back.

> > sysfs exports single values just fine. If you are starting to do more
> > complex things, like you currently are, maybe you shouldn't be in
> > sysfs...
>
> Well, like said, I'm fine with it actually being single values, its just
> that Ingo suggested skipping a few syscalls.
>
> Robert just suggested we could use the sysfs files as device nodes and
> have then open() return a perf_event fd. Its just that that would
> require we add a ioctl to change the perf_event_attr structure and
> attach it to a context.

Nope, sorry, that's not going to happen, that is not what sysfs is for.
No ioctls or device nodes in there please.

> > I can always knock up a eventfs for you do mount at /sys/kernel/events/
> > or something if you want :)
>
> But that won't get us the nice device linkage, right?

True, you would lose that.

thanks,

greg k-h

Ingo Molnar

unread,

May 20, 2010, 4:20:02 PM5/20/10

to

* Greg KH <gr...@kroah.com> wrote:

> [...]

>
> I can always knock up a eventfs for you do mount at /sys/kernel/events/ or
> something if you want :)

eventfs was my first idea, until Peter convinced me that we want sysfs :-)

One important aspect would be to move it into the physical topology. Graphics
card? It might have events. PCI device? It might have events. Southbridge? It
might have a PMU and events. CPU? It has a PMU.

Especially when it comes to complex physical topologies on larger systems, we
eventually want to visualize things in tooling as well - as a tree of the
physical topology. Also, physical topologies will only become more complex, so
we don't want to detach events from them.

> sysfs exports single values just fine. If you are starting to do more
> complex things, like you currently are, maybe you shouldn't be in sysfs...

This is really like a read-only attribute, and it would be multi-line only
for the event format descriptor - a genuinely new aspect: a flexible ABI
descriptor.

It's an attribute for a very good purpose: flexible ABI with a user-space that
interprets new format descriptions automatically. This is not just theory, for
example perf trace does this today, and you can write scripts with old tools
for a new event that shows up in a new kernel, without rebuilding the tools.

Here is an example of a format descriptor:

# cat /debug/tracing/events/sched/sched_wakeup/format
name: sched_wakeup
ID: 59
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
field:int common_lock_depth; offset:8; size:4; signed:1;

field:char comm[TASK_COMM_LEN]; offset:12; size:16; signed:1;
field:pid_t pid; offset:28; size:4; signed:1;
field:int prio; offset:32; size:4; signed:1;
field:int success; offset:36; size:4; signed:1;
field:int target_cpu; offset:40; size:4; signed:1;

print fmt: "comm=%s pid=%d prio=%d success=%d target_cpu=%03d", REC->comm, REC->pid, REC->prio, REC->success, REC->target_cpu

Also, we already have quite a few multi-line files in sysfs, for example:

$ cat /sys/devices/pnp0/00:09/options
Dependent: 00 - Priority preferred
port 0x378-0x378, align 0x0, size 0x8, 16-bit address decoding
port 0x778-0x778, align 0x0, size 0x8, 16-bit address decoding
irq 7 High-Edge
dma 3 8-bit compatible
Dependent: 01 - Priority acceptable
port 0x378-0x378, align 0x0, size 0x8, 16-bit address decoding
port 0x778-0x778, align 0x0, size 0x8, 16-bit address decoding
irq 3,4,5,6,7,10,11,12 High-Edge
dma 0,1,2,3 8-bit compatible
Dependent: 02 - Priority acceptable
port 0x278-0x278, align 0x0, size 0x8, 16-bit address decoding
port 0x678-0x678, align 0x0, size 0x8, 16-bit address decoding
irq 3,4,5,6,7,10,11,12 High-Edge
dma 0,1,2,3 8-bit compatible
Dependent: 03 - Priority acceptable
port 0x3bc-0x3bc, align 0x0, size 0x4, 16-bit address decoding
port 0x7bc-0x7bc, align 0x0, size 0x4, 16-bit address decoding
irq 3,4,5,6,7,10,11,12 High-Edge
dma 0,1,2,3 8-bit compatible

$ cat /sys/devices/pci0000:00/0000:00:1a.7/pools
poolinfo - 0.1
ehci_sitd 0 0 96 0
ehci_itd 0 0 160 0
ehci_qh 4 42 96 1
ehci_qtd 4 42 96 1
buffer-2048 0 0 2048 0
buffer-512 0 0 512 0
buffer-128 0 0 128 0
buffer-32 1 128 32 1

In fact uevents have multi-line attributes as well:

$ cat /sys/devices/pci0000:00/0000:00:1a.1/usb4/uevent
MAJOR=189
MINOR=384
DEVNAME=bus/usb/004/001
DEVTYPE=usb_device
DRIVER=usb
DEVICE=/proc/bus/usb/004/001
PRODUCT=1d6b/1/206
TYPE=9/0/0
BUSNUM=004
DEVNUM=001

Thanks,

Ingo

Greg KH

unread,

May 20, 2010, 7:30:01 PM5/20/10

to

On Thu, May 20, 2010 at 10:14:18PM +0200, Ingo Molnar wrote:
>
> * Greg KH <gr...@kroah.com> wrote:
>
> > [...]
> >
> > I can always knock up a eventfs for you do mount at /sys/kernel/events/ or
> > something if you want :)
>
> eventfs was my first idea, until Peter convinced me that we want sysfs :-)
>
> One important aspect would be to move it into the physical topology. Graphics
> card? It might have events. PCI device? It might have events. Southbridge? It
> might have a PMU and events. CPU? It has a PMU.
>
> Especially when it comes to complex physical topologies on larger systems, we
> eventually want to visualize things in tooling as well - as a tree of the
> physical topology. Also, physical topologies will only become more complex, so
> we dont want to detach events from them.

Ok, yes, physical topology would be nice to have, I agree.

> > sysfs exports single values just fine. If you are starting to do more
> > complex things, like you currently are, maybe you shouldn't be in sysfs...
>
> This is really like a read-only attributes, and it would be multi-line only
> for the event format descriptor - a genuinely new aspect: a flexible ABI
> descriptor.

Oh no...

> It's an attribute for a very good purpose: flexible ABI with a user-space that
> interprets new format descriptions automatically. This is not just theory, for
> example perf trace does this today, and you can write scripts with old tools
> for a new event that shows up in a new kernel, without rebuilding the tools.
>
> Here is an example of a format descriptor:
>
> # cat /debug/tracing/events/sched/sched_wakeup/format
> name: sched_wakeup
> ID: 59
> format:
> field:unsigned short common_type; offset:0; size:2; signed:0;
> field:unsigned char common_flags; offset:2; size:1; signed:0;
> field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
> field:int common_pid; offset:4; size:4; signed:1;
> field:int common_lock_depth; offset:8; size:4; signed:1;
>
> field:char comm[TASK_COMM_LEN]; offset:12; size:16; signed:1;
> field:pid_t pid; offset:28; size:4; signed:1;
> field:int prio; offset:32; size:4; signed:1;
> field:int success; offset:36; size:4; signed:1;
> field:int target_cpu; offset:40; size:4; signed:1;
>
> print fmt: "comm=%s pid=%d prio=%d success=%d target_cpu=%03d", REC->comm, REC->pid, REC->prio, REC->success, REC->target_cpu

Hm, kind of like a "sane" xml, right?

> Also, we already have quite a few multi-line files in sysfs, for example:

These are all aberrations, please don't perpetuate them.

> $ cat /sys/devices/pnp0/00:09/options
> Dependent: 00 - Priority preferred
> port 0x378-0x378, align 0x0, size 0x8, 16-bit address decoding
> port 0x778-0x778, align 0x0, size 0x8, 16-bit address decoding
> irq 7 High-Edge
> dma 3 8-bit compatible
> Dependent: 01 - Priority acceptable
> port 0x378-0x378, align 0x0, size 0x8, 16-bit address decoding
> port 0x778-0x778, align 0x0, size 0x8, 16-bit address decoding
> irq 3,4,5,6,7,10,11,12 High-Edge
> dma 0,1,2,3 8-bit compatible
> Dependent: 02 - Priority acceptable
> port 0x278-0x278, align 0x0, size 0x8, 16-bit address decoding
> port 0x678-0x678, align 0x0, size 0x8, 16-bit address decoding
> irq 3,4,5,6,7,10,11,12 High-Edge
> dma 0,1,2,3 8-bit compatible
> Dependent: 03 - Priority acceptable
> port 0x3bc-0x3bc, align 0x0, size 0x4, 16-bit address decoding
> port 0x7bc-0x7bc, align 0x0, size 0x4, 16-bit address decoding
> irq 3,4,5,6,7,10,11,12 High-Edge
> dma 0,1,2,3 8-bit compatible

That should be a debugfs file.

> $ cat /sys/devices/pci0000:00/0000:00:1a.7/pools
> poolinfo - 0.1
> ehci_sitd 0 0 96 0
> ehci_itd 0 0 160 0
> ehci_qh 4 42 96 1
> ehci_qtd 4 42 96 1
> buffer-2048 0 0 2048 0
> buffer-512 0 0 512 0
> buffer-128 0 0 128 0
> buffer-32 1 128 32 1

Odd, I hadn't noticed that one before. I can't figure out what that
file is, who creates it?

Ick, mm/dmapool.c? Hm, not good, that's a debugging file only, and
really does not belong in sysfs. It seems to predate 2.6.12, so it made
it in before debugfs was around. I'll work on moving it out of sysfs...

> In fact uevents have multi-line attributes as well:
>
> $ cat /sys/devices/pci0000:00/0000:00:1a.1/usb4/uevent
> MAJOR=189
> MINOR=384
> DEVNAME=bus/usb/004/001
> DEVTYPE=usb_device
> DRIVER=usb
> DEVICE=/proc/bus/usb/004/001
> PRODUCT=1d6b/1/206
> TYPE=9/0/0
> BUSNUM=004
> DEVNUM=001

Yes, that's the environment variables that are sent to userspace in the
uevent. I don't like the multi-line stuff for this one, but we couldn't
think of a better way at the time.

Anyway, back to your original issue, multi-line sysfs files.

I really don't want to do something like that, in sysfs, if at all
possible. We have been working very hard to keep the sysfs file format
simple, and to follow the one-value-per-file rule, so we don't end up
repeating the same mistakes we did in /proc.

Now one could argue that we are not entirely successful, especially
based on your examples above. However, those are the rare exception,
not the rule by far.

So, where do we do something like this? I don't know. I still like the
idea of eventfs, and we could pass in a kobject to it to have it create
the tree if needed. Yeah, that would be a replication of some of the
sysfs structure, but you could have a custom file format, like you show
above, which you could control and keep in step with your
userspace tools.

How deep in the device tree are you really going to be caring about? It
sounds like the large majority of events are only going to be coming
from the "system" type objects (cpu, nodes, memory, etc.) and very few
would be from things that we consider a 'struct device' today (like a
pci, usb, scsi, or input, etc.)

thanks,

greg k-h

Peter Zijlstra

unread,

May 21, 2010, 4:10:02 AM5/21/10

to

On Thu, 2010-05-20 at 16:12 -0700, Greg KH wrote:
> How deep in the device tree are you really going to be caring about? It
> sounds like the large majority of events are only going to be coming
> from the "system" type objects (cpu, nodes, memory, etc.) and very few
> would be from things that we consider a 'struct device' today (like a
> pci, usb, scsi, or input, etc.)

The general noise I hear from the hardware people is that we'll see more
and more device-level stuff - bus bridges/controller and actual devices
(GPUs, NICs etc.) will be wanting to export performance metrics.

Ingo Molnar

unread,

May 21, 2010, 5:50:02 AM5/21/10

to

* Peter Zijlstra <pet...@infradead.org> wrote:

> On Thu, 2010-05-20 at 16:12 -0700, Greg KH wrote:

> > How deep in the device tree are you really going to be
> > caring about? It sounds like the large majority of
> > events are only going to be coming from the "system"
> > type objects (cpu, nodes, memory, etc.) and very few
> > would be from things that we consider a 'struct
> > device' today (like a pci, usb, scsi, or input, etc.)
>
> The general noise I hear from the hardware people is
> that we'll see more and more device-level stuff - bus
> bridges/controller and actual devices (GPUs, NICs etc.)
> will be wanting to export performance metrics.

There's (much) more:

- laptops want to provide power level/usage metrics,

- we could express a lot of special, lower level
(transport specific) disk IO stats via events as well -
without having to push those stats to a higher level
(where it might not make sense). Currently such kinds
of stats/metrics are provided in a very device/subsystem
specific way, if they are provided at all.

Also, we already have quite a few per device tracepoints
upstream. Here are a few examples:

- GPU tracepoints (trace_i915_gem_request_submit(), etc.)
- WIFI tracepoints (trace_iwlwifi_dev_ioread32(), etc.)
- block tracepoints (trace_block_bio_complete())

So these would be attached to:

# GEM events of drm/card0:
/sys/devices/pci0000:00/0000:00:02.0/drm/card0/events/i915_gem_request_submit/

# Wifi-ioread events of wlan0:
/sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/net/wlan0/events/iwlwifi_dev_ioread32/

# whole sdb disk events:
/sys/block/sdb/events/block_bio_complete/

# sdb1 partition events:
/sys/block/sdb/sdb1/events/block_bio_complete/

And we also have 'software nodes' in /sys that have events
upstream here and today. For example for SLAB we already
have kmalloc/kfree tracepoints (trace_kmalloc() and
trace_kfree()):

# all kmalloc events:
/sys/kernel/slab/events/

# kmalloc events for sighand_cache:
/sys/kernel/slab/sighand_cache/events/kmalloc/

# kfree events for sighand_cache:
/sys/kernel/slab/sighand_cache/events/kfree/

In general the set of events we have upstream is growing
along an exponential curve (there's over a hundred now,
via tracepoints).

They are either logically attached to the hardware
topology of the system (as in the first set of examples
above), or are attached to the software/subsystem object
topology of the kernel (some examples of which are
described in the second set of examples above).

Sometimes there are aliasing/filtering relationship
between events, which is expressed very well via the
hierarchy and granularity of /sysfs.

New events would go into that topology there in a natural
way.

For example general hugepage tracepoints (should we
introduce any) would go into the existing hugepage node:

/sys/kernel/mm/hugepages/events/...

All in all, all these existing and future events, both of
hardware and software type, are literally begging to be
attached to nodes in /sys :-)

If we created a separate eventfs for it we'd have to start
with duplicating all the topology/hierarchy/structure that
is present in sysfs already. (and diluting /sys's
utility)

That would be a bad thing, so it would be nice if we found
a workable solution here. We could split up the record
format some more:

/sys/kernel/sched/events/sched_wakeup/format/
/sys/kernel/sched/events/sched_wakeup/format/common_type/
/sys/kernel/sched/events/sched_wakeup/format/common_flags/
/sys/kernel/sched/events/sched_wakeup/format/common_preempt_count/
/sys/kernel/sched/events/sched_wakeup/format/common_pid/
/sys/kernel/sched/events/sched_wakeup/format/common_lock_depth/
/sys/kernel/sched/events/sched_wakeup/format/comm/
/sys/kernel/sched/events/sched_wakeup/format/pid/
/sys/kernel/sched/events/sched_wakeup/format/prio/
/sys/kernel/sched/events/sched_wakeup/format/success/
/sys/kernel/sched/events/sched_wakeup/format/target_cpu/

Into single-value files. But this would add significant
parsing overhead (plus significant allocation overhead),
for no tangible benefit.

The problem with /proc was always the lack of standard
structure and the lack of performance - while the format
file is about _more_ structure.

Increasing structure parsing overhead does not look like
the right answer to that problem.

Hm?

Ingo

Lin Ming

unread,

May 31, 2010, 10:40:01 PM5/31/10

to

The difficulty is how to know where each event should be attached to.

struct ftrace_event_call *call;

for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
/* where will this event be attached to? */
}

Any idea?

Thanks,
Lin Ming

Ingo Molnar

unread,

Jun 8, 2010, 2:50:01 PM6/8/10

to

Well, it cannot be automatic - for each subsystem it's a different sysfs
point. So it has to be specified in the TRACE_EVENT() definition or so.

Ingo

Lin Ming

unread,

Jun 21, 2010, 5:00:02 AM6/21/10

to

> From: Ingo Molnar <mi...@elte.hu>

(sorry for late response...)

There are so many events via tracepoints; how about the sysfs points
below for each event?

1. kvm events
/sys/devices/system/kvm/kvm0/events/kvm_set_irq/
...
/sys/devices/system/kvm/kvm0/events/kvm_emulate_insn/

2. kvm_mmu events
/sys/devices/system/kvm/kvm0/events/kvm_mmu_pagetable_walk/
...
/sys/devices/system/kvm/kvm0/events/kvm_mmu_zap_page/

3. raw_syscalls events
/sys/kernel/events/sys_enter/
/sys/kernel/events/sys_exit/

4. mce events
/sys/devices/system/machinecheck/machinecheck0/mce_record/
...
/sys/devices/system/machinecheck/machinecheckN/mce_record/

5. sched events
/sys/kernel/events/sched_kthread_stop/
...
/sys/kernel/events/sched_stat_runtime/

6. irq events
/sys/kernel/events/irq_handler_entry/
...
/sys/kernel/events/softirq_exit/

7. timer events
/sys/kernel/events/timer_init/
...
/sys/kernel/events/itimer_expire/

8. signal events
/sys/kernel/events/signal_generate/
...
/sys/kernel/events/signal_lose_info/

9. workqueue events
/sys/kernel/events/workqueue_insertion/
...
/sys/kernel/events/workqueue_destruction/

10. lock events
/sys/kernel/events/lock_acquire/
...
/sys/kernel/events/lock_acquired/

11. module events
/sys/module/events/module_load/
...
/sys/module/events/module_request/

12. power events
/sys/power/events/power_start/
...
/sys/power/events/power_end/

13. kmem events
/sys/kernel/mm/events/kmalloc/
...
/sys/kernel/mm/events/mm_page_alloc_extfrag/

14. ext4 events
/sys/fs/events/ext4_free_inode/
...
/sys/fs/events/ext4_mb_buddy_bitmap_load/

15. jbd2 events
/sys/fs/events/jbd2_checkpoint/
...
/sys/fs/events/jbd2_cleanup_journal_tail/

16. xfs events
/sys/fs/events/xfs_attr_list_sf/
...
/sys/fs/events/xfs_log_recover_inode_skip/

17. gfs2 events
/sys/fs/events/gfs2_glock_state_change/
...
/sys/fs/events/gfs2_block_alloc/

18. block events
/sys/block/sda/events/block_rq_abort/
...
/sys/block/sdb/events/block_rq_remap/

19. bkl events
/sys/kernel/events/lock_kernel/
...
/sys/kernel/events/unlock_kernel/

20. scsi events
/sys/bus/scsi/devices/0:0:0:0/events/scsi_dispatch_cmd_start/
...
/sys/bus/scsi/devices/0:0:0:0/events/scsi_eh_wakeup/

21. iwlwifi_io events

/sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/net/wlan0/events/iwlwifi_dev_ioread32/

...
/sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/net/wlan0/events/iwlwifi_dev_ucode_event/

22. iwm events
<where should iwm events be in sysfs?>

23. skb events
/sys/class/net/events/kfree_skb/
/sys/class/net/events/skb_copy_datagram_iovec/

24. napi events
/sys/class/net/events/napi/

25. mac80211 events
/sys/class/net/events/drv_start/
...
/sys/class/net/events/stop_queue

Lin Ming

unread,

Jun 22, 2010, 3:30:02 AM6/22/10

to

On Tue, 2010-06-22 at 14:59 +0800, Johannes Berg wrote:
> On Tue, 2010-06-22 at 14:25 +0800, Lin Ming wrote:
> > On Mon, 2010-06-21 at 17:34 +0800, Johannes Berg wrote:

> > > On Mon, 2010-06-21 at 16:55 +0800, Lin Ming wrote:
> > >
> > > > 21. iwlwifi_io events
> > > > /sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/net/wlan0/events/iwlwifi_dev_ioread32/
> > > > ...
> > > > /sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/net/wlan0/events/iwlwifi_dev_ucode_event/
> > >
> > >

> > > That doesn't work, you could have multiple PCI devices in your system.
> >
> > Understood. This is just a "demo".
> >
> > Actually, I mean
> >
> > net/wlan0/events/
> > net/waln1/events/
> > ....
> > net/walnN/events/
>
> That's not appropriate either though since you may have multiple network
> interfaces on the same hardware :)

Doesn't net/wlan0...wlanN mean multiple network interfaces on the same
hardware?

If my understanding is wrong, would you show me an example in sysfs?

>
> > > > 23. skb events
> > > > /sys/class/net/events/kfree_skb/
> > > > /sys/class/net/events/skb_copy_datagram_iovec/
> > >

> > > > 25. mac80211 events
> > > > /sys/class/net/events/drv_start/
> > > > ...
> > > > /sys/class/net/events/stop_queue
> > >

> > > It doesn't really seem right to mix all these.
> >
> > Well, agree, skb events are totally different with mac80211 events.
> > Any idea?
>
> I suppose the most appropriate thing would
> be /sys/class/ieee80211/phyN/events/... for almost all of them.

Good idea.

Thanks,
Lin Ming

>
> johannes

Johannes Berg

unread,

Jun 22, 2010, 3:40:02 AM6/22/10

to

On Tue, 2010-06-22 at 15:22 +0800, Lin Ming wrote:

> > > net/wlan0/events/
> > > net/waln1/events/
> > > ....
> > > net/walnN/events/
> >
> > That's not appropriate either though since you may have multiple network
> > interfaces on the same hardware :)
>
> Doesn't net/wlan0...wlanN mean multiple network interfaces on the same
> hardware?

Yes, but the trace points aren't per network interface but rather per
hardware piece.

Johannes Berg

unread,

Jun 22, 2010, 3:50:02 AM6/22/10

to

On Tue, 2010-06-22 at 09:33 +0200, Johannes Berg wrote:

> > > > net/wlan0/events/
> > > > net/waln1/events/
> > > > ....
> > > > net/walnN/events/
> > >
> > > That's not appropriate either though since you may have multiple network
> > > interfaces on the same hardware :)
> >
> > Doesn't net/wlan0...wlanN mean multiple network interfaces on the same
> > hardware?
>
> Yes, but the trace points aren't per network interface but rather per
> hardware piece.

Which really just means that whoever writes the tracepoint needs to
provide a struct device for where to put it (at least in the case of
driver tracepoints), and then ideally some description of the device
also gets put into the ringbuffer.

Assuming you actually want to have the event show up in sysfs twice if
it has multiple producers? I'd like that, it would make sense for a lot
of cases since you might only care about one of the producers.

Lin Ming

unread,

Jun 22, 2010, 4:00:02 AM6/22/10

to

On Tue, 2010-06-22 at 15:33 +0800, Johannes Berg wrote:
> On Tue, 2010-06-22 at 15:22 +0800, Lin Ming wrote:
>
> > > > net/wlan0/events/
> > > > net/wlan1/events/
> > > > ....
> > > > net/wlanN/events/
> > >
> > > That's not appropriate either though since you may have multiple network
> > > interfaces on the same hardware :)
> >
> > Doesn't net/wlan0...wlanN mean multiple network interfaces on the same
> > hardware?
>
> Yes, but the trace points aren't per network interface but rather per
> hardware piece.

So it can only trace the whole hardware piece rather than a specific
interface?

If yes, then will change to
- /sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/net/wlan0/events/iwlwifi_dev_ioread32/
+ /sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/events/iwlwifi_dev_ioread32/

Johannes Berg

unread,

Jun 22, 2010, 4:00:02 AM6/22/10

to

On Tue, 2010-06-22 at 15:47 +0800, Lin Ming wrote:

> So it can only trace the whole hardware piece rather than a specific
> interface?
>
> If yes, then will change to
> - /sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/net/wlan0/events/iwlwifi_dev_ioread32/
> + /sys/devices/pci0000:00/0000:00:1c.1/0000:03:00.0/events/iwlwifi_dev_ioread32/

Yes, I guess you were right from the start and I was just thinking about
it the wrong way. So presumably you'll somehow have to "instantiate"
tracepoints for a given device?

Lin Ming

unread,

Jun 22, 2010, 4:10:02 AM6/22/10

to

On Tue, 2010-06-22 at 15:39 +0800, Johannes Berg wrote:
> On Tue, 2010-06-22 at 09:33 +0200, Johannes Berg wrote:
>
> > > > > net/wlan0/events/
> > > > > net/wlan1/events/
> > > > > ....
> > > > > net/wlanN/events/
> > > >
> > > > That's not appropriate either though since you may have multiple network
> > > > interfaces on the same hardware :)
> > >
> > > Doesn't net/wlan0...wlanN mean multiple network interfaces on the same
> > > hardware?
> >
> > Yes, but the trace points aren't per network interface but rather per
> > hardware piece.
>
> Which really just means that whoever writes the tracepoint needs to
> provide a struct device for where to put it (at least in the case of
> driver tracepoints), and then ideally some description of the device
> also gets put into the ringbuffer.

I'm not familiar with tracepoint code. Correct me if I'm wrong.
Do you mean that, for example, if iwlwifi_dev_ioread32 event is traced,
then the "device" info will get put into the ftrace ringbuffer?

>
> Assuming you actually want to have the event show up in sysfs twice if
> it has multiple producers? I'd like that, it would make sense for a lot
> of cases since you might only care about one of the producers.

Yes, each producer has an "events" sysfs dir.

Johannes Berg

unread,

Jun 22, 2010, 4:20:02 AM6/22/10

to

On Tue, 2010-06-22 at 16:04 +0800, Lin Ming wrote:

> > Which really just means that whoever writes the tracepoint needs to
> > provide a struct device for where to put it (at least in the case of
> > driver tracepoints), and then ideally some description of the device
> > also gets put into the ringbuffer.
>
> I'm not familiar with tracepoint code. Correct me if I'm wrong.
> Do you mean that, for example, if iwlwifi_dev_ioread32 event is traced,
> then the "device" info will get put into the ftrace ringbuffer?

No, right now it doesn't.

Ingo Molnar

unread,

Jun 24, 2010, 5:40:04 AM6/24/10

to

* Johannes Berg <joha...@sipsolutions.net> wrote:

> On Tue, 2010-06-22 at 15:22 +0800, Lin Ming wrote:
>
> > > > net/wlan0/events/
> > > > net/wlan1/events/
> > > > ....
> > > > net/wlanN/events/
> > >
> > > That's not appropriate either though since you may have multiple network
> > > interfaces on the same hardware :)
> >
> > Doesn't net/wlan0...wlanN mean multiple network interfaces on the same
> > hardware?
>
> Yes, but the trace points aren't per network interface but rather per
> hardware piece.

Yeah - we generally want events to live at their 'natural' source in sysfs.

So if it's a per device hardware event, it should live with the hardware
piece. If it's a higher level chipset event, it should live where the chipset
driver is in sysfs. If it's a subsystem level event then it should live there.

I think what you mentioned in your other posting makes the most sense: give
flexibility to tracepoint authors to place the event in the most sensible
sysfs place. It is them who define the tracepoints and the events so any
second guessing by a generic layer will probably get in the way. The generic
tool layer will be content with having the event_source class in sysfs, to see
'all' event sources in their topological structure.

That's probably best achieved via a TRACE_EVENT() variant, by passing in the
sysfs location.

It might even make sense to make this a part of TRACE_EVENT() itself and make
'NULL' the current default, non-sysfs-enumerated behavior. That way we can
gradually (and non-intrusively) find all the right sysfs places for events.

Thanks,

Ingo

Johannes Berg

unread,

Jun 24, 2010, 12:20:02 PM6/24/10

to

On Thu, 2010-06-24 at 11:36 +0200, Ingo Molnar wrote:

> That's probably best achieved via a TRACE_EVENT() variant, by passing in the
> sysfs location.
>
> It might even make sense to make this a part of TRACE_EVENT() itself and make
> 'NULL' the current default, non-sysfs-enumerated behavior. That way we can
> gradually (and non-intrusively) find all the right sysfs places for events.

No, this doesn't work. A lot of events are multi-instance. Say you have
an event for each USB device. This event would have to show up in many
places in sysfs, and each trace_foo() invocation needs to get the struct
device pointer, not just the TRACE_EVENT() definition. Additionally, to
create/destroy the sysfs pieces we need something like
init_trace_foo(dev) and destroy_trace_foo(dev) be called when the sysfs
points for the device should be created/destroyed.

The TRACE_EVENT() just defines the template, but such multi-instance
events really should be standardised in terms of their struct device (or
maybe kobject).

I think that needs some TRACE_DEVICE_EVENT macro that creates the
required inlines etc, and including the init/destroy that are called
when the event should show up in sysfs.

There's no way you can have the event show up in sysfs at the right spot
with _just_ a TRACE_EVENT macro, since at define time in the header file
you don't even have a valid struct device pointer.

johannes

Ingo Molnar

unread,

Jun 24, 2010, 1:40:02 PM6/24/10

to

* Johannes Berg <joha...@sipsolutions.net> wrote:

> On Thu, 2010-06-24 at 11:36 +0200, Ingo Molnar wrote:
>
> > That's probably best achieved via a TRACE_EVENT() variant, by passing in the
> > sysfs location.
> >
> > It might even make sense to make this a part of TRACE_EVENT() itself and make
> > 'NULL' the current default, non-sysfs-enumerated behavior. That way we can
> > gradually (and non-intrusively) find all the right sysfs places for events.
>
> No, this doesn't work. A lot of events are multi-instance. Say you have an
> event for each USB device. This event would have to show up in many places
> in sysfs, and each trace_foo() invocation needs to get the struct device
> pointer, not just the TRACE_EVENT() definition. Additionally, to
> create/destroy the sysfs pieces we need something like init_trace_foo(dev)
> and destroy_trace_foo(dev) be called when the sysfs points for the device
> should be created/destroyed.

Yes - but even this could be expressed via TRACE_EVENT(): by giving it a
device-specific function pointer and then instantiating individual events from
a single, central place in sysfs.

That is the place where we already know where it ends up in sysfs, and where
the event-specific function can match up whether that particular node belongs
to it and whether an additional event directory should be created for that
particular sysfs node.

> The TRACE_EVENT() just defines the template, but such multi-instance events
> really should be standardised in terms of their struct device (or maybe
> kobject).
>
> I think that needs some TRACE_DEVICE_EVENT macro that creates the required
> inlines etc, and including the init/destroy that are called when the event
> should show up in sysfs.
>
> There's no way you can have the event show up in sysfs at the right spot
> with _just_ a TRACE_EVENT macro, since at define time in the header file you
> don't even have a valid struct device pointer.

That would be another possible way to do it - to explicitly create the events
directory. It looks a bit simpler as we wouldn't have to touch TRACE_EVENT()
and because it directly expresses the 'this node has an events directory'
property at the place where we create the device node.

Thanks,

Ingo

Lin Ming

unread,

Jun 29, 2010, 2:20:01 AM6/29/10

to

Let me take i915 tracepoints as an example.
Do you mean something like below?

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 423dc90..9e7e4a0 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -28,6 +28,7 @@
*/

#include <linux/device.h>
+#include <linux/perf_event.h>
#include "drmP.h"
#include "drm.h"
#include "i915_drm.h"
@@ -413,7 +414,17 @@ int i965_reset(struct drm_device *dev, u8 flags)
static int __devinit
i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
- return drm_get_dev(pdev, ent, &driver);
+ struct kobject *kobj;
+ int ret;
+
+ ret = drm_get_dev(pdev, ent, &driver);
+
+ if (!ret) {
+ kobj = &pdev->dev.kobj;
+ perf_sys_register_tp(kobj, "i915");
+ }
+
+ return ret;
}

static void
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 716f99b..2a6d834 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1019,6 +1019,8 @@ extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
+
+extern void perf_sys_register_tp(struct kobject *kobj, char *tp_system);
#else
static inline void
perf_event_task_sched_in(struct task_struct *task) { }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 403d180..1b85dad 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5877,3 +5877,32 @@ static int __init perf_event_sysfs_init(void)
&perfclass_attr_group);
}
device_initcall(perf_event_sysfs_init);
+
+#define for_each_event(event, start, end) \
+ for (event = start; \
+ (unsigned long)event < (unsigned long)end; \
+ event++)
+
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
+
+void perf_sys_register_tp(struct kobject *kobj, char *tp_system)
+{
+ struct ftrace_event_call *call;
+ struct kobject *events_kobj;
+
+ events_kobj = kobject_create_and_add("events", kobj);
+ if (!events_kobj)
+ return;
+
+ for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
+ if (call->class->system && !strcmp(call->class->system, tp_system)) {
+
+ /* create events/<tracepoint> */
+ kobject_create_and_add(call->name, events_kobj);
+
+ /* create events/<tracepoint>/enable, filter, format, id */
+ /* TBD ... */
+ }
+ }
+}

Ingo Molnar

unread,

Jun 29, 2010, 5:00:03 AM6/29/10

to

* Lin Ming <ming....@intel.com> wrote:

Yeah, something like that - assuming that this means that we'll add the events
directory to the device directory, to all the
/sys/bus/pci/drivers/i915/*/events/ driver directories right? (i haven't
checked the DRM code)

Small detail, it could be written a bit more compactly, like:

> + int ret;
> +
> + ret = drm_get_dev(pdev, ent, &driver);

> + if (!ret)
> + perf_sys_register_tp(&pdev->dev.kobj, "i915");
> +
> + return ret;

Also, we can (optionally) consider 'generic', subsystem level events to also
show up under:

/sys/bus/pci/drivers/i915/events/

This would give a model to non-device-specific events to be listed one level
higher in the sysfs hierarchy.

This too would be done in the driver, not by generic code. It's generally the
driver which knows how the events should be categorized.

I'd imagine something similar for wireless drivers as well - most currently
defined events would show up on a per device basis there.

Can you see practical problems with this scheme?

Ingo

Lin Ming

unread,

Jun 29, 2010, 5:30:01 AM6/29/10

to

I haven't run the code, but I think yes.

>
> Small detail, it could be written a bit more compactly, like:

Thanks for the tip.

>
> > + int ret;
> > +
> > + ret = drm_get_dev(pdev, ent, &driver);
> > + if (!ret)
> > + perf_sys_register_tp(&pdev->dev.kobj, "i915");
> > +
> > + return ret;
>
> Also, we can (optionally) consider 'generic', subsystem level events to also
> show up under:
>
> /sys/bus/pci/drivers/i915/events/
>
> This would give a model to non-device-specific events to be listed one level
> higher in the sysfs hierarchy.
>
> This too would be done in the driver, not by generic code. It's generally the
> driver which knows how the events should be categorized.

This is a bit difficult. I'd like not to touch TRACE_EVENT().
How does the driver know if an event is 'generic' if TRACE_EVENT is not
touched?

>
> I'd imagine something similar for wireless drivers as well - most currently
> defined events would show up on a per device basis there.
>
> Can you see practical problems with this scheme?

Not now. I may find some problems when I write more detailed code.

Lin Ming

Ingo Molnar

unread,

Jun 29, 2010, 6:30:02 AM6/29/10

to

* Lin Ming <ming....@intel.com> wrote:

> > Also, we can (optionally) consider 'generic', subsystem level events to
> > also show up under:
> >
> > /sys/bus/pci/drivers/i915/events/
> >
> > This would give a model to non-device-specific events to be listed one
> > level higher in the sysfs hierarchy.
> >
> > This too would be done in the driver, not by generic code. It's generally
> > the driver which knows how the events should be categorized.
>

> This is a bit difficult. I'd like not to touch TRACE_EVENT(). [...]

We can certainly start with the simpler variant - it's also the more common
case.

> [...] How does the driver know if an event is 'generic' if TRACE_EVENT is
> not touched?

Well, it's per driver code which creates the 'events' directory anyway, so
that code decides where to link things. It can link it to the per driver kobj
- or to the per subsys kobj.

> > I'd imagine something similar for wireless drivers as well - most
> > currently defined events would show up on a per device basis there.
> >
> > Can you see practical problems with this scheme?
>
> Not now. I may find some problems when write more detail code.

Ok. Feel free to post RFC patches (even if they are not fully complete yet),
so that we can see how things are progressing.

I suspect the best approach would be to try to figure out the right sysfs
placement for one or two existing driver tracepoints, so that we can see it
all in practice. (Obviously any changes to drivers will have to go via the
relevant driver maintainer tree(s).)

Thanks,

Lin Ming

unread,

Jul 2, 2010, 4:10:04 AM7/2/10

to

Well, taking i915 tracepoints as an example, the sysfs structure is as below.

And below is a very rough draft patch to export i915 tracepoints in sysfs.
Is it the right direction?

---
drivers/gpu/drm/i915/i915_drv.c | 15 +++-
include/linux/perf_event.h | 2 +
kernel/perf_event.c | 168 +++++++++++++++++++++++++++++++++++++++
4 files changed, 186 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 423dc90..eb7fa9e 100644

--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -28,6 +28,7 @@
*/

#include <linux/device.h>
+#include <linux/perf_event.h>
#include "drmP.h"
#include "drm.h"
#include "i915_drm.h"

@@ -413,7 +414,19 @@ int i965_reset(struct drm_device *dev, u8 flags)

static int __devinit
i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
- return drm_get_dev(pdev, ent, &driver);
+ struct kobject *kobj;

+ struct drm_device *drm_dev;

+ int ret;
+
+ ret = drm_get_dev(pdev, ent, &driver);
+
+ if (!ret) {

+ drm_dev = pci_get_drvdata(pdev);
+ kobj = &drm_dev->primary->kdev.kobj;

+ perf_sys_register_tp(kobj, "i915");
+ }
+
+ return ret;
}

static void
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 716f99b..2a6d834 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1019,6 +1019,8 @@ extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
+
+extern void perf_sys_register_tp(struct kobject *kobj, char *tp_system);
#else
static inline void
perf_event_task_sched_in(struct task_struct *task) { }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c

index 403d180..068ee48 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5877,3 +5877,171 @@ static int __init perf_event_sysfs_init(void)

&perfclass_attr_group);
}
device_initcall(perf_event_sysfs_init);
+
+#define for_each_event(event, start, end) \
+ for (event = start; \
+ (unsigned long)event < (unsigned long)end; \
+ event++)
+
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];

+extern void print_event_filter(struct ftrace_event_call *call,
+ struct trace_seq *s);
+
+struct tp_kobject {
+ struct kobject *kobj;
+ struct ftrace_event_call *call;
+ struct tp_kobject *next;
+};
+
+static struct tp_kobject *tp_kobject_list;
+
+static struct ftrace_event_call *perf_sys_find_tp_call(struct kobject *kobj)
+{
+ struct tp_kobject *tp_kobj;
+
+ tp_kobj = tp_kobject_list;
+
+ while (tp_kobj) {
+ if (kobj == tp_kobj->kobj)
+ return tp_kobj->call;
+
+ tp_kobj = tp_kobj->next;
+ }
+
+ return NULL;
+}
+
+#define TP_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define TP_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t enable_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)

+{
+ struct ftrace_event_call *call;
+

+ call = perf_sys_find_tp_call(kobj);
+ return sprintf(buf, "%d\n", call->flags & TRACE_EVENT_FL_ENABLED);
+}
+
+static ssize_t enable_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ /* Not implemented yet */
+
+ return 0;
+}
+TP_ATTR(enable);
+
+static ssize_t filter_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)

+{
+ struct ftrace_event_call *call;

+ struct trace_seq *s;
+
+ call = perf_sys_find_tp_call(kobj);
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ print_event_filter(call, s);
+
+ memcpy(buf, s->buffer, s->len);
+
+ kfree(s);
+
+ return s->len;
+}
+
+static ssize_t filter_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ /* Not implemented yet */
+
+ return 0;
+}
+TP_ATTR(filter);
+
+static ssize_t format_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ /* Not implemented yet */
+
+ return 0;
+}
+TP_ATTR_RO(format);
+
+static ssize_t id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)

+{
+ struct ftrace_event_call *call;
+

+ call = perf_sys_find_tp_call(kobj);
+
+ return sprintf(buf, "%d\n", call->event.type);
+}
+TP_ATTR_RO(id);
+
+static struct attribute *tp_attrs[] = {
+ &enable_attr.attr,
+ &filter_attr.attr,
+ &format_attr.attr,
+ &id_attr.attr,
+ NULL,
+};
+
+static struct attribute_group tp_attr_group = {
+ .attrs = tp_attrs,
+};
+
+static int perf_sys_add_tp(struct kobject *parent, struct ftrace_event_call *call)
+{
+ struct tp_kobject *tp_kobj;
+ struct kobject *event_kobj;
+ int err;
+
+ event_kobj = kobject_create_and_add(call->name, parent);
+ if (!event_kobj)
+ return -ENOMEM;
+ err = sysfs_create_group(event_kobj, &tp_attr_group);
+ if (err) {
+ kobject_put(event_kobj);
+ return -ENOMEM;
+ }
+
+ tp_kobj = kmalloc(sizeof(*tp_kobj), GFP_KERNEL);
+ if (!tp_kobj) {
+ kobject_put(event_kobj);
+ return -ENOMEM;
+ }
+
+ tp_kobj->kobj = event_kobj;
+ tp_kobj->call = call;
+ tp_kobj->next = tp_kobject_list;
+ tp_kobject_list = tp_kobj;
+
+ return 0;
+}

+
+void perf_sys_register_tp(struct kobject *kobj, char *tp_system)
+{
+ struct ftrace_event_call *call;
+ struct kobject *events_kobj;
+
+ events_kobj = kobject_create_and_add("events", kobj);
+ if (!events_kobj)
+ return;
+
+ for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
+ if (call->class->system && !strcmp(call->class->system, tp_system)) {

+ perf_sys_add_tp(events_kobj, call);
+ }
+ }
+}

Ingo Molnar

unread,

Jul 3, 2010, 9:00:02 AM7/3/10

to

Yeah, i think so.

The per driver impact is small and to the point:

> drivers/gpu/drm/i915/i915_drv.c | 15 +++-

> i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> {
> - return drm_get_dev(pdev, ent, &driver);
> + struct kobject *kobj;
> + struct drm_device *drm_dev;
> + int ret;
> +
> + ret = drm_get_dev(pdev, ent, &driver);
> +
> + if (!ret) {
> + drm_dev = pci_get_drvdata(pdev);
> + kobj = &drm_dev->primary->kdev.kobj;
> + perf_sys_register_tp(kobj, "i915");
> + }
> +
> + return ret;

(It could be even shorter - the same compactness comment as i made last time
still holds for this function.)

Thanks,

Ingo

Corey Ashford

unread,

Jul 16, 2010, 8:30:02 PM7/16/10

to

...

Hi Lin,

Sorry for my late reply on this thread. I had missed these posts
earlier because I had an email filter that was set to look for messages
with "perf" in the subject, and so I missed this entire thread.

With your example here, let's say I want to open this event with the
perf_events ABI... how would I go about doing that? Have you figured
out whether the caller would read the id and pass that into the
interface, or perhaps pass in the fd of the id file (or perhaps the fd
of the specific event directory).

Also, I see the filter and format fields here. Would the caller write
to these fields to set them up? What's the format of the data that's
written to them? Would it be totally device dependent? It seems like
there should be a way for a user space tool to discover what can be
programmed into the filter and format fields.

- Corey

Lin Ming

unread,

Jul 20, 2010, 1:50:02 AM7/20/10

to

Sorry for my late reply too.
I have been busy with some other stuff. I hope I can send some more
functional patches this week.

>
> With your example here, let's say I want to open this event with the
> perf_events ABI... how would I go about doing that? Have you figured
> out whether the caller would read the id and pass that into the
> interface, or perhaps pass in the fd of the id file (or perhaps the fd
> of the specific event directory).

Please just ignore my example above. I now have some incomplete new
patches to export hardware/software/tracepoint events via sysfs, like
below.

The event path is passed in with perf's "-e" option, for example
perf record -e /sys/kernel/events/page-faults -- <some commands>

The caller reads config and type and passes them into perf_event_attr.

>
> Also, I see the filter and format fields here. Would the caller write
> to these fields to set them up? What's the format of the data that's
> written to them? Would it be totally device dependent? It seems like
> there should be a way for a user space tool to discover what can be
> programmed into the filter and format fields.

For now, only read-only event attributes (config and type) are exported.
I want to first make some minimal functional patches, and then implement
the more complex writable attributes.

Lin Ming

Robert Richter

unread,

Jul 20, 2010, 11:20:02 AM7/20/10

to

On 20.07.10 01:48:28, Lin Ming wrote:
The caller reads config and type and pass them into perf_event_attr.
>
> 1. Hardware events
> /sys/devices/system/cpu/cpu0...cpuN/events
> |-- L1-dcache-load-misses ===> event name
> | |-- config ===> config value for the event
> | `-- type ===> event type

Wouldn't it be much easier to have a unique sysfs id (could be an
u64):

> |-- L1-dcache-load-misses ===> event name

> | `-- id ===> event id

... and then extend the syscall to enable an event by its sysfs id:

memset(&attr, 0, sizeof(attr));
attr.type = PERF_TYPE_SYSFS;
attr.sysfs_id = sysfs_id;
attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
attr.config = config;
...

The kernel then knows which event is meant and the caller doesn't have to
provide event-specific parameters such as type/config that require an
event specific setup. The advantage would be that we can open an event
file descriptor of every kind of event in a standardized way.

-Robert

--
Advanced Micro Devices, Inc.
Operating System Research Center

Corey Ashford

unread,

Jul 20, 2010, 1:50:02 PM7/20/10

to

I'm not seeing the value of writable attributes in sysfs at this point.
Wouldn't that disconnect the event opening between the syscall and the
writing of attributes in user space, with no real way to tie them
together? For example, what if two users wrote to the same attribute
with different values... which one would take precedence when you go to
do the open syscall? I think all of the attribute data should be in the
open call, and sysfs should be read-only.

Earlier, I briefly presented an idea that would allow a caller to read
attribute formatting information, such as a shift and mask value, which
would allow the caller to build up a more complex .config value,
possibly extending into a new attr field - .config_extra[n] as dictated
by the shift value; shift values greater than 63 would place the
attribute into .config_extra[shift amount / 64] shifted by shift amount
% 64. It's not the prettiest interface, but I think it could work and
would be extensible.

- Corey

Corey Ashford

unread,

Jul 20, 2010, 2:00:02 PM7/20/10

to

On 07/20/2010 08:19 AM, Robert Richter wrote:
> On 20.07.10 01:48:28, Lin Ming wrote:
> The caller reads config and type and pass them into perf_event_attr.
>>
>> 1. Hardware events
>> /sys/devices/system/cpu/cpu0...cpuN/events
>> |-- L1-dcache-load-misses ===> event name
>> | |-- config ===> config value for the event
>> | `-- type ===> event type
>
> Wouldn't it be much easier to have a unique sysfs id (could be an
> u64):
>
>> |-- L1-dcache-load-misses ===> event name
>> | `-- id ===> event id
>
> ... and then extend the syscall to enable an event by its sysfs id:
>
> memset(&attr, 0, sizeof(attr));
> attr.type = PERF_TYPE_SYSFS;
> attr.sysfs_id = sysfs_id;
> attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
> attr.config = config;
> ...
>
> The kernel then knows which event is meant and the caller doesn't have to
> provide event-specific parameters such as type/config that require an
> event specific setup. The advantage would be that we can open an event
> file descriptor of every kind of event in a standardized way.

Your example above still shows the .config member being set. Was that
intentional?

Maybe another way to accomplish this would be to reuse the .config field
for the sysfs_id.

We still need a way to deal with event attributes though, so something
more than a single sysfs_id would be needed to specify the event completely.

- Corey

Robert Richter

unread,

Jul 20, 2010, 2:40:01 PM7/20/10

to

On 20.07.10 13:50:01, Corey Ashford wrote:

> > ... and then extend the syscall to enable an event by its sysfs id:
> >
> > memset(&attr, 0, sizeof(attr));
> > attr.type = PERF_TYPE_SYSFS;
> > attr.sysfs_id = sysfs_id;
> > attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
> > attr.config = config;
> > ...

> Your example above still shows the .config member being set. Was that

> intentional?
>
> Maybe another way to accomplish this would be to reuse the .config field
> for the sysfs_id.

This was intended as this could be used to configure the event,
otherwise there is no way to setup the event with certain
parameters. The config value will be event specific then and we can be
sure the parameter belongs to _this_ kind of event.

> We still need a way to deal with event attributes though, so something
> more than a single sysfs_id would be needed to specify the event completely.

It is true that you still need knowledge of what the event is
measuring and how it is set up or configured. Maybe the configuration
may be left blank if the event can be set up without it. But with this
approach you can get file descriptors for every event a user may be
interested in simply by looking into sysfs.

For example, I was thinking of perfctr events vs. ibs events. The cpu
could setup something like:

/sys/devices/system/cpu/cpu0...cpuN/events/perfctr/id
/sys/devices/system/cpu/cpu0...cpuN/events/ibs_op/id

Both events are setup with one 64 bit config value that is basically
the event's configuration msr (x86 perfctr or AMD IBS). These are
defined in the hardware specifications. Their formats differ. You could
then open the event file descriptor using the sysfs id and use the
config value to customize the event. You don't have a complicated
setup or implementation to detect which kind of event you want to use
as the id indicates the type of event.

Actually, we could setup e.g. also trace events with this mechanism.

-Robert

--
Advanced Micro Devices, Inc.
Operating System Research Center

--

Corey Ashford

unread,

Jul 20, 2010, 5:20:01 PM7/20/10

to

On 07/20/2010 11:30 AM, Robert Richter wrote:
> On 20.07.10 13:50:01, Corey Ashford wrote:
>
>>> ... and then extend the syscall to enable an event by its sysfs id:
>>>
>>> memset(&attr, 0, sizeof(attr));
>>> attr.type = PERF_TYPE_SYSFS;
>>> attr.sysfs_id = sysfs_id;
>>> attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
>>> attr.config = config;
>>> ...
>
>> Your example above still shows the .config member being set. Was that
>> intentional?
>>
>> Maybe another way to accomplish this would be to reuse the .config field
>> for the sysfs_id.
>
> This was intended as this could be used to configure the event,
> otherwise there is no way to setup the event with certain
> parameters. The config value will be event specific then and we can be
> sure the parameter belongs to _this_ kind of event.
>
>> We still need a way to deal with event attributes though, so something
>> more than a single sysfs_id would be needed to specify the event completely.
>
> It is true that you still need knowledge of what the event is
> measuring and how it is set up or configured. Maybe the configuration
> may left blank if the event can be setup without it. But with this
> approach you can get file descriptors for every event a user may be
> interested in simply by looking into sysfs.
>

Yes, that would be a nice feature.

> For example, I was thinking of perfctr events vs. ibs events. The cpu
> could setup something like:
>
> /sys/devices/system/cpu/cpu0...cpuN/events/perfctr/id
> /sys/devices/system/cpu/cpu0...cpuN/events/ibs_op/id
>
> Both events are setup with one 64 bit config value that is basically
> the event's configuration msr (x86 perfctr or AMD IBS). These are
> definded in the hardware specifications. Its formats differ. You could
> then open the event file descriptor using the sysfs id and use the
> config value to customize the event. You don't have a complicated
> setup or implementation to detect which kind of event you want to use
> as the id indicates the type of event.
>
> Actually, we could setup e.g. also trace events with this mechanism.

In perf_events, as I recall, they started out with a combined type and
config field, but it quickly became obvious that config was going to get
too crowded even with 64 bits available, so they were split up into
separate type and config fields. I fear that's what would happen to the
sysfs_id value as well... it would be too crowded.

Retaining the type and config nodes in sysfs makes it very clear for a
programmer as to how to use them.... just read and copy them into the
attr struct's corresponding members, and requires no changes to the
existing attr struct (at least for the moment).

- Corey