
[PATCH] x86/kvm: Show guest system/user cputime in cpustat


Sheng Yang

Mar 11, 2010, 2:20:01 AM
Currently we can only get the cpu_stat of the whole guest as a single value. This patch
enhances cpu_stat with more detail, adding guest_system and guest_user cpu time
statistics at the cost of a little overhead.

Signed-off-by: Sheng Yang <sh...@linux.intel.com>
---

This draft patch is based on KVM upstream to show the idea. I will split it into a
more kernel-friendly version later.

The overhead is the cost of a get_cpl() call after each exit from the guest.

Comments are welcome!
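
For illustration, after this patch the "cpu" and "cpuN" lines in /proc/stat carry two
extra columns appended after guest_nice (the numbers below are invented):

  cpu  84650 1258 39687 10872112 20821 1 1306 0 52316 0 38925 13391

i.e. user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice, and then
the new guest_user and guest_system fields, all in USER_HZ ticks like the existing columns.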

 arch/x86/kvm/x86.c          |   10 ++++++++++
 fs/proc/stat.c              |   22 ++++++++++++++++------
 include/linux/kernel_stat.h |    2 ++
 include/linux/kvm_host.h    |    1 +
 include/linux/sched.h       |    1 +
 kernel/sched.c              |    6 ++++++
 6 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 703f637..c8ea6e1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4290,6 +4290,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void kvm_update_guest_mode(struct kvm_vcpu *vcpu)
+{
+	int cpl = kvm_x86_ops->get_cpl(vcpu);
+
+	if (cpl != 0)
+		current->flags |= PF_VCPU_USER;
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -4377,6 +4385,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	trace_kvm_entry(vcpu->vcpu_id);
 	kvm_x86_ops->run(vcpu);
 
+	kvm_update_guest_mode(vcpu);
+
 	/*
 	 * If the guest has used debug registers, at least dr7
 	 * will be disabled while returning to the host.
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad..d07640a 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
 	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest, guest_nice;
+	cputime64_t guest, guest_nice, guest_user, guest_system;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
-	guest = guest_nice = cputime64_zero;
+	guest = guest_nice = guest_user = guest_system = cputime64_zero;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;
 
@@ -53,6 +53,10 @@ static int show_stat(struct seq_file *p, void *v)
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 		guest_nice = cputime64_add(guest_nice,
 			kstat_cpu(i).cpustat.guest_nice);
+		guest_user = cputime64_add(guest_user,
+			kstat_cpu(i).cpustat.guest_user);
+		guest_system = cputime64_add(guest_system,
+			kstat_cpu(i).cpustat.guest_system);
 		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
 		}
@@ -68,7 +72,7 @@ static int show_stat(struct seq_file *p, void *v)
 	sum += arch_irq_stat();
 
 	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
-		"%llu\n",
+		"%llu %llu %llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -78,7 +82,9 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(softirq),
 		(unsigned long long)cputime64_to_clock_t(steal),
 		(unsigned long long)cputime64_to_clock_t(guest),
-		(unsigned long long)cputime64_to_clock_t(guest_nice));
+		(unsigned long long)cputime64_to_clock_t(guest_nice),
+		(unsigned long long)cputime64_to_clock_t(guest_user),
+		(unsigned long long)cputime64_to_clock_t(guest_system));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -93,9 +99,11 @@ static int show_stat(struct seq_file *p, void *v)
 		steal = kstat_cpu(i).cpustat.steal;
 		guest = kstat_cpu(i).cpustat.guest;
 		guest_nice = kstat_cpu(i).cpustat.guest_nice;
+		guest_user = kstat_cpu(i).cpustat.guest_user;
+		guest_system = kstat_cpu(i).cpustat.guest_system;
 		seq_printf(p,
 			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
-			"%llu\n",
+			"%llu %llu %llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -106,7 +114,9 @@ static int show_stat(struct seq_file *p, void *v)
 			(unsigned long long)cputime64_to_clock_t(softirq),
 			(unsigned long long)cputime64_to_clock_t(steal),
 			(unsigned long long)cputime64_to_clock_t(guest),
-			(unsigned long long)cputime64_to_clock_t(guest_nice));
+			(unsigned long long)cputime64_to_clock_t(guest_nice),
+			(unsigned long long)cputime64_to_clock_t(guest_user),
+			(unsigned long long)cputime64_to_clock_t(guest_system));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index c059044..e43b2f7 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -26,6 +26,8 @@ struct cpu_usage_stat {
 	cputime64_t steal;
 	cputime64_t guest;
 	cputime64_t guest_nice;
+	cputime64_t guest_user;
+	cputime64_t guest_system;
 };

struct kernel_stat {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a3fd0f9..497e795 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -496,6 +496,7 @@ static inline void kvm_guest_exit(void)
 {
 	account_system_vtime(current);
 	current->flags &= ~PF_VCPU;
+	current->flags &= ~PF_VCPU_USER;
 }

static inline gpa_t gfn_to_gpa(gfn_t gfn)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78efe7c..49bf81d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1759,6 +1759,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_VCPU_USER	0x00000020	/* I'm a virtual CPU in usermode */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_MCE_PROCESS	0x00000080	/* process policy on mce errors */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3a8fb30..9cfc288 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5092,6 +5092,12 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	} else {
 		cpustat->user = cputime64_add(cpustat->user, tmp);
 		cpustat->guest = cputime64_add(cpustat->guest, tmp);
+		if (p->flags & PF_VCPU_USER)
+			cpustat->guest_user =
+				cputime64_add(cpustat->guest_user, tmp);
+		else
+			cpustat->guest_system =
+				cputime64_add(cpustat->guest_system, tmp);
 	}
 }

--
1.7.0.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Avi Kivity

Mar 11, 2010, 2:40:02 AM
On 03/11/2010 09:20 AM, Sheng Yang wrote:
> Currently we can only get the cpu_stat of whole guest as one. This patch
> enhanced cpu_stat with more detail, has guest_system and guest_user cpu time
> statistics with a little overhead.
>
> Signed-off-by: Sheng Yang<sh...@linux.intel.com>
> ---
>
> This draft patch based on KVM upstream to show the idea. I would split it into
> more kernel friendly version later.
>
> The overhead is, the cost of get_cpl() after each exit from guest.
>

This can be very expensive in the nested virtualization case, so I
wouldn't like this to be in normal paths. I think detailed profiling
like that can be left to 'perf kvm', which only has overhead if enabled
at runtime.

For example, you could put the code that notes the cpl into a tracepoint which
is enabled dynamically.
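
For illustration, a minimal sketch of such a tracepoint (the event name is hypothetical,
not an existing KVM tracepoint; it would live alongside the others in arch/x86/kvm/trace.h).
Since TP_fast_assign() only runs while the event is enabled, the get_cpl() vmcs access is
paid only when someone is actually tracing:

TRACE_EVENT(kvm_guest_cpl,
	TP_PROTO(struct kvm_vcpu *vcpu),
	TP_ARGS(vcpu),

	TP_STRUCT__entry(
		__field(unsigned int, vcpu_id)
		__field(unsigned int, cpl)
	),

	TP_fast_assign(
		__entry->vcpu_id = vcpu->vcpu_id;
		__entry->cpl = kvm_x86_ops->get_cpl(vcpu);
	),

	TP_printk("vcpu %u cpl %u", __entry->vcpu_id, __entry->cpl)
);

/* call site in vcpu_enter_guest(), right after kvm_x86_ops->run(vcpu): */
trace_kvm_guest_cpl(vcpu);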

--
error compiling committee.c: too many arguments to function

Sheng Yang

Mar 11, 2010, 2:50:03 AM
On Thursday 11 March 2010 15:36:01 Avi Kivity wrote:
> On 03/11/2010 09:20 AM, Sheng Yang wrote:
> > Currently we can only get the cpu_stat of whole guest as one. This patch
> > enhanced cpu_stat with more detail, has guest_system and guest_user cpu
> > time statistics with a little overhead.
> >
> > Signed-off-by: Sheng Yang<sh...@linux.intel.com>
> > ---
> >
> > This draft patch based on KVM upstream to show the idea. I would split it
> > into more kernel friendly version later.
> >
> > The overhead is, the cost of get_cpl() after each exit from guest.
>
> This can be very expensive in the nested virtualization case, so I
> wouldn't like this to be in normal paths. I think detailed profiling
> like that can be left to 'perf kvm', which only has overhead if enabled
> at runtime.

Yes, that's my concern too (though nested vmcs/vmcb reads are already too expensive
and should be optimized...). The other concern is that a perf-like mechanism would
bring a lot more overhead compared to this.

> For example you can put the code to note the cpl in a tracepoint which
> is enabled dynamically.

Yanmin has already implemented "perf kvm" to support this. We are just arguing
about whether a normal top-like mechanism is necessary.

I am also considering making it a feature that can be disabled, but that seems to
complicate things and result in uncertain cpustat output.

--
regards
Yang, Sheng

Avi Kivity

Mar 11, 2010, 3:00:03 AM
On 03/11/2010 09:46 AM, Sheng Yang wrote:
> On Thursday 11 March 2010 15:36:01 Avi Kivity wrote:
>
>> On 03/11/2010 09:20 AM, Sheng Yang wrote:
>>
>>> Currently we can only get the cpu_stat of whole guest as one. This patch
>>> enhanced cpu_stat with more detail, has guest_system and guest_user cpu
>>> time statistics with a little overhead.
>>>
>>> Signed-off-by: Sheng Yang<sh...@linux.intel.com>
>>> ---
>>>
>>> This draft patch based on KVM upstream to show the idea. I would split it
>>> into more kernel friendly version later.
>>>
>>> The overhead is, the cost of get_cpl() after each exit from guest.
>>>
>> This can be very expensive in the nested virtualization case, so I
>> wouldn't like this to be in normal paths. I think detailed profiling
>> like that can be left to 'perf kvm', which only has overhead if enabled
>> at runtime.
>>
> Yes, that's my concern too(though nested vmcs/vmcb read already too expensive,
> they should be optimized...).

Any ideas on how to do that? Perhaps use paravirt_ops to convert the
vmread into a memory read? We store the vmwrites in the vmcs anyway.
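
Purely to illustrate the idea (every name below is invented, this is not existing KVM
code): the trapping VMREAD executed by L1 could become a plain load from a software vmcs
page shared with L0:

#define NR_PV_VMCS_FIELDS	256

struct pv_vmcs {
	u64 field[NR_PV_VMCS_FIELDS];	/* software-defined slots, not the hardware layout */
};

static struct pv_vmcs *pv_vmcs_page;	/* mapped from a page shared with L0 */

static u64 pv_vmcs_read(unsigned int slot)
{
	/* ordinary memory read, no exit to L0 */
	return pv_vmcs_page->field[slot];
}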

> The other concern is, perf alike mechanism would
> bring a lot more overhead compared to this.
>

Ordinarily users won't care if time is spent in guest kernel mode or
guest user mode. They want to see which guest is imposing a load on a
system. I consider a user profiling a guest from the host an advanced
and rarer use case, so it's okay to require tools and additional
overhead for this.

>> For example you can put the code to note the cpl in a tracepoint which
>> is enabled dynamically.
>>
> Yanmin have already implement "perf kvm" to support this. We are just arguing
> if a normal top-alike mechanism is necessary.
>
> I am also considering to make it a feature that can be disabled. But seems it
> make things complicate and result in uncertain cpustat output.
>

I'm not even sure that guest time was a good idea.

--
error compiling committee.c: too many arguments to function


Zhang, Yanmin

Mar 11, 2010, 3:30:01 AM
On Thu, 2010-03-11 at 09:50 +0200, Avi Kivity wrote:
> On 03/11/2010 09:46 AM, Sheng Yang wrote:
> > On Thursday 11 March 2010 15:36:01 Avi Kivity wrote:
> >
> >> On 03/11/2010 09:20 AM, Sheng Yang wrote:
> >>

>
> >>> Currently we can only get the cpu_stat of whole guest as one. This patch
> >>> enhanced cpu_stat with more detail, has guest_system and guest_user cpu
> >>> time statistics with a little overhead.
> >>>
> >>> Signed-off-by: Sheng Yang<sh...@linux.intel.com>

It seems per-process guest cpu utilization is more useful than per-cpu utilization.


> >>> ---
> >>>
> >>> This draft patch based on KVM upstream to show the idea. I would split it
> >>> into more kernel friendly version later.
> >>>
> >>> The overhead is, the cost of get_cpl() after each exit from guest.
> >>>
> >> This can be very expensive in the nested virtualization case, so I
> >> wouldn't like this to be in normal paths. I think detailed profiling
> >> like that can be left to 'perf kvm', which only has overhead if enabled
> >> at runtime.
> >>
> > Yes, that's my concern too(though nested vmcs/vmcb read already too expensive,
> > they should be optimized...).
>
> Any ideas on how to do that? Perhaps use paravirt_ops to covert the
> vmread into a memory read? We store the vmwrites in the vmcs anyway.

Another method is to add a sysctl entry, such as /proc/sys/kernel/collect_guest_utilization,
set off by default. Or add a /sys/kernel/debug/kvm/collect_guest_utilization debugfs entry.
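
A minimal sketch of the debugfs variant (wiring is illustrative only, and the pointer
type taken by debugfs_create_bool() has varied across kernel versions), showing how the
knob would skip the get_cpl() cost entirely when off:

static u32 collect_guest_utilization;	/* off by default */

static int __init kvm_guest_util_init(void)	/* e.g. called from kvm's module init */
{
	debugfs_create_bool("collect_guest_utilization", 0644,
			    kvm_debugfs_dir, &collect_guest_utilization);
	return 0;
}

static void kvm_update_guest_mode(struct kvm_vcpu *vcpu)
{
	if (!collect_guest_utilization)
		return;			/* skip the vmcs read when disabled */

	if (kvm_x86_ops->get_cpl(vcpu) != 0)
		current->flags |= PF_VCPU_USER;
}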

>
> > The other concern is, perf alike mechanism would
> > bring a lot more overhead compared to this.
> >
>
> Ordinarily users won't care if time is spent in guest kernel mode or
> guest user mode. They want to see which guest is imposing a load on a
> system. I consider a user profiling a guest from the host an advanced
> and rarer use case, so it's okay to require tools and additional
> overhead for this.

Here is the story behind why Sheng worked out the patch. Some guys work on
KVM performance. They want us to extend top to show guest utilization
info, such as guest kernel and guest userspace cpu utilization. With
the new tool, they could find which VM (mapped to a qemu process id)
consumes too much cpu time in host space (kernel plus userspace)
and compare that with the guest kernel/userspace split. That information could
provide a first-hand, high-level overview of all the VMs running in the system
and help an admin quickly find the worst VM instance.

So we need per-process (guest) cpu utilization rather than per-cpu guest utilization.
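
A rough sketch of the per-process variant (guest_utime/guest_stime are hypothetical new
task_struct fields, and a top-like tool would need a new /proc/<pid>/stat column to read
them):

static void account_guest_time_per_task(struct task_struct *p, cputime_t cputime)
{
	/* attribute guest time to the qemu task itself, split by guest CPL */
	if (p->flags & PF_VCPU_USER)
		p->guest_utime = cputime_add(p->guest_utime, cputime);
	else
		p->guest_stime = cputime_add(p->guest_stime, cputime);
}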

>
> >> For example you can put the code to note the cpl in a tracepoint which
> >> is enabled dynamically.
> >>
> > Yanmin have already implement "perf kvm" to support this. We are just arguing
> > if a normal top-alike mechanism is necessary.

perf kvm is mostly used to find hot functions, and it might cause more overhead.
Sheng's patch has less overhead.


> >
> > I am also considering to make it a feature that can be disabled. But seems it
> > make things complicate and result in uncertain cpustat output.
> >
>
> I'm not even sure that guest time was a good idea.



Sheng Yang

Mar 11, 2010, 4:20:03 AM
On Thursday 11 March 2010 15:50:54 Avi Kivity wrote:
> On 03/11/2010 09:46 AM, Sheng Yang wrote:
> > On Thursday 11 March 2010 15:36:01 Avi Kivity wrote:
> >> On 03/11/2010 09:20 AM, Sheng Yang wrote:
> >>> Currently we can only get the cpu_stat of whole guest as one. This
> >>> patch enhanced cpu_stat with more detail, has guest_system and
> >>> guest_user cpu time statistics with a little overhead.
> >>>
> >>> Signed-off-by: Sheng Yang<sh...@linux.intel.com>
> >>> ---
> >>>
> >>> This draft patch based on KVM upstream to show the idea. I would split
> >>> it into more kernel friendly version later.
> >>>
> >>> The overhead is, the cost of get_cpl() after each exit from guest.
> >>
> >> This can be very expensive in the nested virtualization case, so I
> >> wouldn't like this to be in normal paths. I think detailed profiling
> >> like that can be left to 'perf kvm', which only has overhead if enabled
> >> at runtime.
> >
> > Yes, that's my concern too(though nested vmcs/vmcb read already too
> > expensive, they should be optimized...).
>
> Any ideas on how to do that? Perhaps use paravirt_ops to covert the
> vmread into a memory read? We store the vmwrites in the vmcs anyway.

When Qing (CCed) was working on nested VMX in the past, he found PV
vmread/vmwrite indeed works well (it writes to the virtual vmcs, so vmwrite
can also benefit). Though compared to older machines (where one of our internal patches
showed an improvement of more than 5%), NHM gets less benefit due to its reduced vmexit cost.

--
regards
Yang, Sheng

>
> > The other concern is, perf alike mechanism would
> > bring a lot more overhead compared to this.
>
> Ordinarily users won't care if time is spent in guest kernel mode or
> guest user mode. They want to see which guest is imposing a load on a
> system. I consider a user profiling a guest from the host an advanced
> and rarer use case, so it's okay to require tools and additional
> overhead for this.
>
> >> For example you can put the code to note the cpl in a tracepoint which
> >> is enabled dynamically.
> >
> > Yanmin have already implement "perf kvm" to support this. We are just
> > arguing if a normal top-alike mechanism is necessary.
> >
> > I am also considering to make it a feature that can be disabled. But
> > seems it make things complicate and result in uncertain cpustat output.
>
> I'm not even sure that guest time was a good idea.
>

Qing He

Mar 12, 2010, 4:00:02 AM

One of the hurdles to PV-izing vmread/vmwrite is the fact that the memory
layout of the physical vmcs remains unknown. Of course it could use the custom
vmcs layout utilized by nested virtualization, but that looks a little weird,
since different nested virtualization implementations may create different
custom layouts.

I once used another approach to partially accelerate vmread/vmwrite
in the nested virtualization case, which also gives a good performance gain (around
7% on pre-Nehalem; on top of this, PV vmread/vmwrite gave another 7%). The idea
is to add a shortcut that handles EXIT_REASON_VM{READ,WRITE} without
even turning on the IF.
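
For illustration, that shortcut might look roughly like the sketch below, checked right
after the hardware vmexit and before interrupts are re-enabled;
nested_emulate_vmread()/nested_emulate_vmwrite() are hypothetical stand-ins for whatever
the nested implementation uses to access its software vmcs:

static bool nested_vmaccess_fastpath(struct kvm_vcpu *vcpu)
{
	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);

	switch (exit_reason) {
	case EXIT_REASON_VMREAD:
		return nested_emulate_vmread(vcpu);	/* hypothetical */
	case EXIT_REASON_VMWRITE:
		return nested_emulate_vmwrite(vcpu);	/* hypothetical */
	default:
		return false;	/* fall back to the normal exit path */
	}
}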

Thanks,
Qing

Avi Kivity

Mar 13, 2010, 3:30:02 AM
On 03/12/2010 10:53 AM, Qing He wrote:
>
>> When Qing(CCed) was working on nested VMX in the past, he found PV
>> vmread/vmwrite indeed works well(it would write to the virtual vmcs so vmwrite
>> can also benefit). Though compared to old machine(one our internal patch shows
>> improve more than 5%), NHM get less benefit due to the reduced vmexit cost.
>>
>>
> One of the hurdles to PVize vmread/vmwrite is the fact that the memory
> layout of physical vmcs remains unknown. Of course it can use the custom
> vmcs layout utilized by nested virtualization, but that looks a little weird,
> since different nested virtualization implementation may create different
> custom layout.
>

Note we must use a custom layout and cannot depend on the physical
layout, due to live migration. The layout becomes an ABI.

> I once used another approach to partially accelerate the vmread/vmwrite
> in nested virtualization case, which also gives good performance gain (around
> 7% on pre-nehalem, based on this, PV vmread/vmwrite had another 7%). That
> is to make a shortcut to handle EXIT_REASON_VM{READ,WRITE}, without
> even turning on the IF.
>

Interesting. That means our exit path is inefficient; it seems to imply
half the time is spent outside the hardware vmexit path.

A quick profile (on non-Nehalem) shows many atomics and calls into the
lapic, as well as update_cr8_intercept which is sometimes unnecessary;
these could easily be optimized.

Definitely optimizing the non-paravirt path is preferred to adding more
paravirtualization.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
