This patch proposes to use a cpuid interface to detect if we are running on an
hypervisor.
The discovery of a hypervisor is determined by bit 31 of CPUID#1_ECX, which is
defined to be "hypervisor present bit". For a VM, the bit is 1, otherwise it is
set to 0. This bit is not officially documented by either Intel/AMD yet, but
they plan to do so some time soon, in the meanwhile they have promised to keep
it reserved for virtualization.
Also, Intel & AMD have reserved the cpuid levels 0x40000000 - 0x400000FF for
software use. Hypervisors can use these levels to provide an interface to pass
information from the hypervisor to the guest. This is similar to how we extract
information about a physical cpu by using cpuid.
XEN/KVM are already using the info leaf to get the hypervisor signature.
VMware hardware version 7 defines some of these cpuid levels, below is a brief
description about those. These levels can be implemented by other hypervisors
too so that Linux has a standard way of communicating to any hypervisor.
Leaf 0x40000000, Hypervisor CPUID information
# EAX: The maximum input value for hypervisor CPUID info (0x40000010).
# EBX, ECX, EDX: Hypervisor vendor ID signature. E.g. "VMwareVMware"
Leaf 0x40000010, Timing information.
# EAX: (Virtual) TSC frequency in kHz.
# EBX: (Virtual) Bus (local apic timer) frequency in kHz.
# ECX, EDX: RESERVED
This patch uses the timing leaf to get the tsc_frequency from the hypervisor.
Since the calibration algorithm can have errors in a virtualized environment,
the best way to calibrate TSC frequency would be to ask the hypervisor about it.
Along with it we also use the hypervisor information leaf to print info messages
at kernel bootup.
Signed-off-by: Alok N Kataria <akat...@vmware.com>
Cc: Jun Nakajima <Jun.Na...@Intel.Com>
---
arch/x86/kernel/setup.c | 17 +++++++++++++++++
arch/x86/kernel/tsc.c | 24 +++++++++++++++++++++++-
include/asm-x86/cpufeature.h | 2 ++
include/asm-x86/processor.h | 21 +++++++++++++++++++++
4 files changed, 63 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6133530..14a4f64 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -765,6 +765,21 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
{}
};
+static void __init detect_hypervisor(void)
+{
+ if (cpu_has_hypervisor) {
+ unsigned int eax, ebx, ecx, edx;
+ char hyper_vendor_id[13];
+
+ cpuid(HYPERVISOR_INFO_LEAF, &eax, &ebx, &ecx, &edx);
+ memcpy(hyper_vendor_id + 0, &ebx, 4);
+ memcpy(hyper_vendor_id + 4, &ecx, 4);
+ memcpy(hyper_vendor_id + 8, &edx, 4);
+ hyper_vendor_id[12] = '\0';
+ printk(KERN_INFO "Hypervisor vendor id %s\n", hyper_vendor_id);
+ }
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -915,6 +930,8 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled)
efi_init();
+ detect_hypervisor();
+
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 161bb85..605cf84 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -345,16 +345,38 @@ failed:
return 0;
}
+unsigned long hypervisor_tsc_freq(void)
+{
+ unsigned long tsc_khz;
+ unsigned int max_cpuid_leaf;
+
+ if (cpu_has_hypervisor) {
+ max_cpuid_leaf = cpuid_eax(HYPERVISOR_INFO_LEAF);
+ if (max_cpuid_leaf >= HYPERVISOR_TIMING_LEAF) {
+ tsc_khz = cpuid_eax(HYPERVISOR_TIMING_LEAF);
+ printk(KERN_INFO
+ "TSC frequency read from hypervisor\n");
+ return tsc_khz;
+ }
+ }
+ return 0;
+}
+
/**
* native_calibrate_tsc - calibrate the tsc on boot
+ * return value is the tsc frequency in khz.
*/
unsigned long native_calibrate_tsc(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate;
+ unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
int hpet = is_hpet_enabled(), i, loopmin;
+ tsc_khz = hypervisor_tsc_freq();
+ if (tsc_khz)
+ return tsc_khz;
+
local_irq_save(flags);
fast_calibrate = quick_pit_calibrate();
local_irq_restore(flags);
diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h
index 800ec03..d3aaff0 100644
--- a/include/asm-x86/cpufeature.h
+++ b/include/asm-x86/cpufeature.h
@@ -116,6 +116,7 @@
#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
@@ -236,6 +237,7 @@ extern const char * const x86_power_flags[32];
#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
+#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
# define cpu_has_invlpg 1
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index ee7cbb3..70ca49b 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -124,6 +124,27 @@ struct cpuinfo_x86 {
#define X86_VENDOR_UNKNOWN 0xff
/*
+ * Intel & AMD have reserved the cpuid levels 0x40000000 - 0x400000FF for
+ * software use. Hypervisors can use these levels to provide an interface
+ * to pass information from the hypervisor to the guest. This is similar
+ * to how we extract information about a physical cpu by using cpuid.
+ */
+
+/*
+ * This CPUID leaf returns the information about the hypervisor.
+ * EAX : maximum input value for CPUID supported by the hypervisor.
+ * EBX, ECX, EDX : Hypervisor vendor ID signature. E.g. VMwareVMware.
+ */
+#define HYPERVISOR_INFO_LEAF 0x40000000
+/*
+ * This leaf gets timing information from the hypervisor.
+ * EAX: (Virtual) TSC frequency in kHz.
+ * EBX: (Virtual) Bus (local apic timer) frequency in kHz.
+ * ECX, EDX: RESERVED
+ */
+#define HYPERVISOR_TIMING_LEAF 0x40000010
+
+/*
* capabilities of CPUs
*/
extern struct cpuinfo_x86 boot_cpu_data;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
This is great, obviously... although we'll have to deal with legacy
methods for a while if not indefinitely (just as we have to for
pre-CPUID processors).
>
> +static void __init detect_hypervisor(void)
> +{
> + if (cpu_has_hypervisor) {
> + unsigned int eax, ebx, ecx, edx;
> + char hyper_vendor_id[13];
> +
> + cpuid(HYPERVISOR_INFO_LEAF, &eax, &ebx, &ecx, &edx);
> + memcpy(hyper_vendor_id + 0, &ebx, 4);
> + memcpy(hyper_vendor_id + 4, &ecx, 4);
> + memcpy(hyper_vendor_id + 8, &edx, 4);
> + hyper_vendor_id[12] = '\0';
> + printk(KERN_INFO "Hypervisor vendor id %s\n", hyper_vendor_id);
> + }
> +}
> +
This should be broken out into a separate file in cpu/*, because we
*will* need to detect hypervisors by other means.
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -345,16 +345,38 @@ failed:
> return 0;
> }
>
> +unsigned long hypervisor_tsc_freq(void)
> +{
> + unsigned long tsc_khz;
> + unsigned int max_cpuid_leaf;
> +
> + if (cpu_has_hypervisor) {
> + max_cpuid_leaf = cpuid_eax(HYPERVISOR_INFO_LEAF);
> + if (max_cpuid_leaf >= HYPERVISOR_TIMING_LEAF) {
> + tsc_khz = cpuid_eax(HYPERVISOR_TIMING_LEAF);
> + printk(KERN_INFO
> + "TSC frequency read from hypervisor\n");
> + return tsc_khz;
> + }
> + }
> + return 0;
> +}
> +
I would call this "vmware_tsc_freq()" because it is a VMWare-defined
interface... you can't just poke at 0x40000010 and assume it is using
the VMWare definition.
In order for *that* to be safe, you'd have to have well-defined ranges
for different virtualization vendors where each of them can define their
own stuff.
<asm/processor.h> is the wrong place for this, and these constants
should have CPUID_ in them to tell what they fundamentally are.
My preference would be for <asm/cpuid.h>, but otherwise
<asm/cpufeature.h> wouldn't be entirely wrong.
-hpa
I don't, realistically, think we can phase them out for a very long
time, and then it's usually a "why bother". What we want to do is
abstract them so they don't make the rest of the code suck.
>
> I would like to see this as a generic hypervisor way to get frequency
> rather than a VMware specific thingy.
>> In order for *that* to be safe, you'd have to have well-defined ranges
>> for different virtualization vendors where each of them can define their
>> own stuff.
>
> My motivation for doing this is to have a standard across all the
> hypervisor's. If all the different hypervisor guys can come to some
> sought of consensus on the various hypervisor leafs that would help keep
> this simple and a lot more maintainable.
>
Agreed. However, that's obviously beyond our immediate control.
Thanks for the comments, please find my replies below.
On Fri, 2008-09-26 at 17:09 -0700, H. Peter Anvin wrote:
> Alok Kataria wrote:
> >
> > Leaf 0x40000000, Hypervisor CPUID information
> > # EAX: The maximum input value for hypervisor CPUID info (0x40000010).
> > # EBX, ECX, EDX: Hypervisor vendor ID signature. E.g. "VMwareVMware"
> >
>
> This is great, obviously... although we'll have to deal with legacy
> methods for a while if not indefinitely (just as we have to for
> pre-CPUID processors).
Ok, do you think we should keep those (legacy) interfaces separate so
that they can be phased out whenever the time is right.
>
> >
> > +static void __init detect_hypervisor(void)
> > +{
> > + if (cpu_has_hypervisor) {
> > + unsigned int eax, ebx, ecx, edx;
> > + char hyper_vendor_id[13];
> > +
> > + cpuid(HYPERVISOR_INFO_LEAF, &eax, &ebx, &ecx, &edx);
> > + memcpy(hyper_vendor_id + 0, &ebx, 4);
> > + memcpy(hyper_vendor_id + 4, &ecx, 4);
> > + memcpy(hyper_vendor_id + 8, &edx, 4);
> > + hyper_vendor_id[12] = '\0';
> > + printk(KERN_INFO "Hypervisor vendor id %s\n", hyper_vendor_id);
> > + }
> > +}
> > +
>
> This should be broken out into a separate file in cpu/*, because we
> *will* need to detect hypervisors by other means.
Ok, i will do that.
> > --- a/arch/x86/kernel/tsc.c
> > +++ b/arch/x86/kernel/tsc.c
> > @@ -345,16 +345,38 @@ failed:
> > return 0;
> > }
> >
> > +unsigned long hypervisor_tsc_freq(void)
> > +{
> > + unsigned long tsc_khz;
> > + unsigned int max_cpuid_leaf;
> > +
> > + if (cpu_has_hypervisor) {
> > + max_cpuid_leaf = cpuid_eax(HYPERVISOR_INFO_LEAF);
> > + if (max_cpuid_leaf >= HYPERVISOR_TIMING_LEAF) {
> > + tsc_khz = cpuid_eax(HYPERVISOR_TIMING_LEAF);
> > + printk(KERN_INFO
> > + "TSC frequency read from hypervisor\n");
> > + return tsc_khz;
> > + }
> > + }
> > + return 0;
> > +}
> > +
>
> I would call this "vmware_tsc_freq()" because it is a VMWare-defined
> interface... you can't just poke at 0x40000010 and assume it is using
> the VMWare definition.
I would like to see this as a generic hypervisor way to get frequency
rather than a VMware specific thingy.
>
> In order for *that* to be safe, you'd have to have well-defined ranges
> for different virtualization vendors where each of them can define their
> own stuff.
My motivation for doing this is to have a standard across all the
hypervisors. If all the different hypervisor guys can come to some
sort of consensus on the various hypervisor leafs, that would help keep
this simple and a lot more maintainable.
>
Ok makes sense, will do that.
Thanks,
Alok
I'm sympathetic to the idea, but it seems a bit under-defined.
Why are you leaving a gap between 0x40000000 and 0x40000010? Future
extension? Avoiding existing hypervisor-specific leaves?
I think there's a move towards doing a scan for a signature, such as
checking every 16 leaves after 0x40000000 for "a while" looking for
interesting signatures, so that a hypervisor can support multiple ABIs
at once. Given this, it would be better to define a "Generic Hypervisor
ABI" signature, and put all the related leaves together.
And then, rather than having a simple "maximum leaf", it would be better
to have cap bits for each specific feature. For example, how would the
"RESERVED" registers in "Timing information" ever get used? How would
you know that they were no longer reserved, but now meaningful?
That said, I'm a bit worried about the whole idea of having these kinds
of timing parameters. It does assume that they're constant for the
whole life of the VM. What if they change due to power management or
migration?
J
That's kind of iffy, although at least it does have a modicum of being
controlled.
There is already a de facto standard for doing this: on a (currently)
64K boundary, add a leaf with a vendor ID and a limit; the presence is
detectable by the limit in EAX having the proper upper bits.
Then have each vendor pick a range that they maintain. Intel uses
0x0000xxxx (although they claim control of the entire numberspace), AMD
uses 0x8000xxxx, VIA uses 0xC000xxxx, Transmeta used 0x8086xxxx, and
0x4000xxxx is being reserved for "virtualization". There are tools
which use this as a way to try to dump all of CPUID without knowing details.
See the problem here? This is in effect an unmanaged space. This means
that without the vendor ID it is going to be meaningless, unless at
least the major players in the virtualization industry could agree with
how to use it, and that would still leave other users out in the cold.
Now, that would still require a vendor numberspace registry. The
obvious one is to use the numbers issued by PCI-SIG, which would require
16 bits -- that would presumably mean numbers of the form 0x40SSSSxx
with SSSS being the vendor ID; this would require scanning on a 256-byte
granularity for a generic tool.
Overall, though, *any* generic solution requires buyin from all
significant players in the space, *AND* a way to distinguish
noncompliant implementations. Designing a functional solution is the
easy part of that[*]. Getting sufficient buyin is the hard part.
> And then, rather than having a simple "maximum leaf", it would be better
> to have cap bits for each specific feature. For example, how would the
> "RESERVED" registers in "Timing information" ever get used? How would
> you know that they were no longer reserved, but now meaningful?
Typically you'd define them to be zero unless usable, and define them so
that a meaningful value would be nonzero.
> That said, I'm a bit worried about the whole idea of having these kinds
> of timing parameters. It does assume that they're constant for the
> whole life of the VM. What if they change due to power management or
> migration?
Presumably you'd have to have some way to notify the VM, via an
interrupt of some sort.
-hpa
[*] Consider the following totally half-baked example:
CPUID leaf 0x40000000
ECX-EDX-EBX Vendor name
EAX Max CPUID level supported
Motivation: existing practice
CPUID leaf 0x40000001...
EAX leaf number Pointer
ECX DID:VID PCI-style
EDX 0xcc06ab0b Magic number
EBX 0x7ab3857a Magic number
This would use the PCI vendor ID and an arbitrary "device ID"
to point to a leaf number, which would then contain information
starting with an identification/count leaf. The DID:VID would
signal who defined the specification, not necessarily who wrote
the hypervisor. This is similar to how Intel uses AMD-defined
CPUID levels, for example.
-hpa
Uhm, no, they're defined by the _hypervisor_.
Please see my comments below.
Avoiding existing leaves,
Microsoft's Hypervisor is using levels 0x40000000 - 0x40000005.
The first 2 are standard levels and the rest of them are Microsoft
hypervisors specific levels. So started with 0x40000010.
>
> I think there's a move towards doing a scan for a signature, such as
> checking every 16 leaves after 0x40000000 for "a while" looking for
> interesting signatures, so that a hypervisor can support multiple ABIs
> at once. Given this, it would be better to define a "Generic Hypervisor
> ABI" signature, and put all the related leaves together.
Hmm interesting, do you have any pointers to this ?
>
> And then, rather than having a simple "maximum leaf", it would be better
> to have cap bits for each specific feature. For example, how would the
> "RESERVED" registers in "Timing information" ever get used? How would
> you know that they were no longer reserved, but now meaningful?
The unused (reserved) value is set to zero right now, whenever a need is
felt we can define a meaningful value and that can be used.
>
> That said, I'm a bit worried about the whole idea of having these kinds
> of timing parameters. It does assume that they're constant for the
> whole life of the VM. What if they change due to power management or
> migration?
For power management, the trend, even on native hardware, is toward a
constant rate TSC. So, I don't see this is a big concern; after all a
virtual cpu should be able to virtualize the TSC as constant rate even
when the underlying TSC is not (by trapping out). And since this is
only true for older processors, this seems acceptable. In other words,
my feeling is we should think of the cpu-scaling issues as a legacy
issue and not optimize the interface for it.
As far as live migration, for full-virt, we think that it should happen
invisibly to the guest. So even if we move to a host with different TSC
frequency it should be the job of the hypervisor to still emulate the
old frequency.
Thanks,
Alok
The first two? And standard according to whom?
-hpa
Maybe by standard they mean standard definitions for them, the meaning
of the rest of the definitions change according to the value returned by
leaf 0x40000001 (Hypervisor vendor-neutral interface id).
Thanks,
Alok
No. I think I saw a passing reference from Tim Deegan to it, but I
couldn't find it again when I looked.
> As far as live migration, for full-virt, we think that it should happen
> invisibly to the guest. So even if we move to a host with different TSC
> frequency it should be the job of the hypervisor to still emulate the
> old frequency.
>
Can the tsc be emulated without a performance hit, or does it trap into
the hypervisor?
--
Gleb.
In theory it can, but it would be a bad idea. cpuid is best used to
communicate cpu features; ACPI and DMI are (mostly) system features.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
> +unsigned long hypervisor_tsc_freq(void)
> +{
> + unsigned long tsc_khz;
> + unsigned int max_cpuid_leaf;
> +
> + if (cpu_has_hypervisor) {
> + max_cpuid_leaf = cpuid_eax(HYPERVISOR_INFO_LEAF);
> + if (max_cpuid_leaf >= HYPERVISOR_TIMING_LEAF) {
> + tsc_khz = cpuid_eax(HYPERVISOR_TIMING_LEAF);
> + printk(KERN_INFO
> + "TSC frequency read from hypervisor\n");
> + return tsc_khz;
> + }
> + }
> + return 0;
> +}
Shouldn't you check the hypervisor signature here?
> /*
> + * Intel & AMD have reserved the cpuid levels 0x40000000 - 0x400000FF for
> + * software use. Hypervisors can use these levels to provide an interface
> + * to pass information from the hypervisor to the guest. This is similar
> + * to how we extract information about a physical cpu by using cpuid.
> + */
> +
> +/*
> + * This CPUID leaf returns the information about the hypervisor.
> + * EAX : maximum input value for CPUID supported by the hypervisor.
> + * EBX, ECX, EDX : Hypervisor vendor ID signature. E.g. VMwareVMware.
> + */
> +#define HYPERVISOR_INFO_LEAF 0x40000000
> +/*
> + * This leaf gets timing information from the hypervisor.info.
> + * EAX: (Virtual) TSC frequency in kHz.
> + * EBX: (Virtual) Bus (local apic timer) frequency in kHz.
> + * ECX, EDX: RESERVED
> + */
> +#define HYPERVISOR_TIMING_LEAF 0x40000010
Likewise, I think this should be HYPERVISOR_VMWARE_TIMING_LEAF.
Or we need some way to
(a) standardize hypervisor cpuid leafes (or parts of it) and
(b) handle sparsely filled cpuid info.
Right now both kvm and xen use the first one or two leafes (after info),
but in incompatible ways, so for these the signature *must* be checked
before using the info found there.
0x40000010 doesn't clash with anything as far as I know, so we could
attempt to make that standard across hypervisors.
cheers,
Gerd
Besides that, nobody stops a hypervisor from offering valid and useful DMI and
ACPI tables in the first place. I don't see a need to tunnel those through
CPUID.
Gruss
Bernd
I was talking about checking every 256 leaves from 0x40000000, but I
can't remember where I heard it. The only relevant reference I can
find is the MS hypervisor interface spec's description of leaf
0x40000001: "Hypervisor vendor-neutral interface identification. This
determines the semantics of the leaves from 0x40000002 through
0x400000FF"
Cheers,
Tim.
--
Tim Deegan <Tim.D...@citrix.com>
Principal Software Engineer, Citrix Systems (R&D) Ltd.
[Company #02300071, SL9 0DZ, UK.]
--
Gleb.
This was discussed at the virtualization mini summit (can't remember who
brought it up).
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
--
Also, we have reserved the MSRs from 0x40000000 - 0x400000FF for software use. We could use such MSRs, but what are the benefits of using those when ACPI already defined the detection mechanism?
.
Jun Nakajima | Intel Open Source Technology Center
On Mon, 2008-09-29 at 01:24 -0700, Gerd Hoffmann wrote:
> Hi,
>
> > +unsigned long hypervisor_tsc_freq(void)
> > +{
> > + unsigned long tsc_khz;
> > + unsigned int max_cpuid_leaf;
> > +
> > + if (cpu_has_hypervisor) {
> > + max_cpuid_leaf = cpuid_eax(HYPERVISOR_INFO_LEAF);
> > + if (max_cpuid_leaf >= HYPERVISOR_TIMING_LEAF) {
> > + tsc_khz = cpuid_eax(HYPERVISOR_TIMING_LEAF);
> > + printk(KERN_INFO
> > + "TSC frequency read from hypervisor\n");
> > + return tsc_khz;
> > + }
> > + }
> > + return 0;
> > +}
>
> Shouldn't you check the hypervisor signature here?
Nope the whole idea of not checking the hypervisor signature is that we
should keep this interface generic.
In the current code, before poking the TIMING_LEAF, we do check if the
maximum supported cpuid level is greater than that leaf. If it is we go
ahead and probe that cpuid leaf.
Also, one thing to remember is that a hypervisor can decide not to
implement this level and just return "0"; the kernel can then just ignore
that value. That's what we do currently in native_calibrate_tsc.
>
> > /*
> > + * Intel & AMD have reserved the cpuid levels 0x40000000 - 0x400000FF for
> > + * software use. Hypervisors can use these levels to provide an interface
> > + * to pass information from the hypervisor to the guest. This is similar
> > + * to how we extract information about a physical cpu by using cpuid.
> > + */
> > +
> > +/*
> > + * This CPUID leaf returns the information about the hypervisor.
> > + * EAX : maximum input value for CPUID supported by the hypervisor.
> > + * EBX, ECX, EDX : Hypervisor vendor ID signature. E.g. VMwareVMware.
> > + */
> > +#define HYPERVISOR_INFO_LEAF 0x40000000
> > +/*
> > + * This leaf gets timing information from the hypervisor.info.
> > + * EAX: (Virtual) TSC frequency in kHz.
> > + * EBX: (Virtual) Bus (local apic timer) frequency in kHz.
> > + * ECX, EDX: RESERVED
> > + */
> > +#define HYPERVISOR_TIMING_LEAF 0x40000010
>
> Likewise, I think this should be HYPERVISOR_VMWARE_TIMING_LEAF.
Nope, lets do the "Or" part :)
>
> Or we need some way to
> (a) standardize hypervisor cpuid leafes (or parts of it) and
> (b) handle sparsely filled cpuid info.
>
Exactly.
Returning zero for the fields which are not supported by the hypervisor,
lets us do that.
So for instance right now, VMware has defined 40000010 leaf, if either
kvm/xen think it could be useful they could just define that leaf to
return nonzero value and the kernel will start using it for them.
Likewise, if in future either kvm/xen come up with a need to define a
new CPUID leaf they can define the semantics for that leaf, and the
corresponding kernel side stuff. If VMware, think that this new leaf is
useful, we can then support that leaf in our hypervisor or return zero
otherwise.
> Right now both kvm and xen use the first one or two leafes (after info),
> but in incompatible ways, so for these the signature *must* be checked
> before using the info found there.
Hmm, that's unfortunate, but we can have exceptions for these one-off
cases, and AFAIK these are only checked in the kvm/xen code path and not
in any generic code as of now, right?
btw, i could only find the semantics for 0x40000001 leaf in KVM's header
file but don't see Xen using that leaf, can you please point me which
leafs are you referring to here.
>
> 0x40000010 doesn't clash with anything as far I know, so we could
> attempt to make that standard accross hypervisors.
Yep.
Thanks,
Alok
Unfortunately, given current evidence this is entirely unrealistic.
> So for instance right now, VMware has defined 40000010 leaf, if either
> kvm/xen think it could be useful they could just define that leaf to
> return nonzero value and the kernel will start using it for them.
> Likewise, if in future either kvm/xen come up with a need to define a
> new CPUID leaf they can define the semantics for that leaf, and the
> corresponding kernel side stuff. If VMware, think that this new leaf is
> useful, we can then support that leaf in our hypervisor or return zero
> otherwise.
This is only true if you can also expect M$ and other hypervisor vendors
to stick to it. So far, hypervisor vendors have hardly shown any
inclination toward standardization.
Hence I really don't think it is sane.
-hpa
Nice idea. The problem with that approach is that we don't have
full control here. It probably isn't that hard to have vmware, xen
and kvm agree here, given vmware proposes this and for xen+kvm one can
send patches. But even that you can't take for granted, see the
discussion of the "tsc-may-change-on-migration" problem.
The real big problem are other closed-source hypervisors (VirtualPC /
Hyper-V / Parallels / ...). How can we be sure they don't define that
leaf to something different?
> Also one thing to remember is, that a hypervisor can decide to not
> implement this level and just return "0" the kernel can then just ignore
> that value. That's what we do currently in native_calibrate_tsc.
The fundamental issue outlined above aside: Even the "ignore 0" part
isn't in the patch right now.
>> Right now both kvm and xen use the first one or two leafes (after info),
>> but in incompatible ways, so for these the signature *must* be checked
>> before using the info found there.
>
> Hmm that's unfortunate, but we can have exceptions for these one of
> cases and AFAIK these are only checked in the kvm/xen code path and not
> in any generic code as of now, right ?
Yes.
> btw, i could only find the semantics for 0x40000001 leaf in KVM's header
> file but don't see Xen using that leaf, can you please point me which
> leafs are you referring to here.
pv drivers in hvm guests use that (and query very xen-specific stuff
which wouldn't make much sense in other hypervisors). It isn't in the
kernel source tree, look here instead:
Yep.
And please note that this does allow either Xen/KVM to propose a new
leaf and the rest of the hypervisor players can decide to export that
leaf or return a zero value.
> But even that you can't take for granted, see the
> discussion of the "tsc-may-change-on-migration" problem.
I may have been unclear in my first attempt to this question, let me try
again.
If the frequency of tsc changes during migration, it should be the task
of hypervisor to handle it. There could be multiple ways to solve that
problem, either the hypervisor emulates the old frequency (by whatever
way) or there are cpufreq drivers in the guest which detect changes in
frequency, and ask the hypervisor for the new frequency. The interface
still allows you to query the cpuid leaf and get the new frequency.
right ?
>
> The real big problem are other closed-source hypervisors (VirtualPC /
> Hyper-V / Parallels / ...). How can we be sure they don't define that
> leaf to something different?
How does that matter, if we are able to standardize all this then,
hypervisors which want to run a Linux guest should effectively play by
the standards over here or else they would never work properly on Linux.
What we are trying to do here is try to standardize things for Linux so
that the Linux kernel implementation for Virtualization is that much
more easier. If this goes well other closed source hypervisors too can
effectively follow these standards.
If the other closed source hypervisors define their own cpuid leafs they
will still have to make Linux kernel side changes to make use of these
new leafs. Which allows them to add to these pool of cpuid interface's
too.
>
> > Also one thing to remember is, that a hypervisor can decide to not
> > implement this level and just return "0" the kernel can then just ignore
> > that value. That's what we do currently in native_calibrate_tsc.
>
> The fudamental issue outlined above aside: Even the "ignore 0" part
> isn't in the patch right now.
Hmm, I am confused, from the patch i posted above, in
native_calibrate_tsc
+ tsc_khz = hypervisor_tsc_freq();
+ if (tsc_khz)
+ return tsc_khz;
We do ignore zero values over here.
>
> >> Right now both kvm and xen use the first one or two leafes (after info),
> >> but in incompatible ways, so for these the signature *must* be checked
> >> before using the info found there.
> >
> > Hmm that's unfortunate, but we can have exceptions for these one of
> > cases and AFAIK these are only checked in the kvm/xen code path and not
> > in any generic code as of now, right ?
>
> Yes.
>
> > btw, i could only find the semantics for 0x40000001 leaf in KVM's header
> > file but don't see Xen using that leaf, can you please point me which
> > leafs are you referring to here.
>
> pv drivers in hvm guests use that (and query very xen-specific stuff
> which wouldn't make much sense in other hypervisors). It isn't in the
> kernel source tree, look here instead:
>
> http://xenbits.xensource.com/xen-3.3-testing.hg?file/19201eebab16/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c
Thanks for the pointer. This would fall in the exception too.
Thanks,
Alok
I don't think this is a realistic point of view, especially given who
we're dealing with in the closed-source world.
Without any kind of tagging of the information, it's realistically futile.
Intel/AMD haven't helped any by reserving a space which is entirely too
small.
-hpa
For example, we can set the following ranges so that each VMM vendor can define and implement features avoiding conflicts:
vmware to define 0x4000001X
xen to define 0x4000002X
kvm to define 0x4000003X
...
The point here is that all the features are generic because we don't make them exclusive. To that end, we don't check the signature. Whatever common features can be found in the above, and each hypervisor can choose to implement what was defined by other hypervisors.
Detection of the feature 0x400000XY is done by:
1. Get EAX from Leaf 0x40000000, Hypervisor CPUID information. EAX returns the maximum input value for hypervisor CPUID info.
If EAX < 0x400000XY, then the feature is not available.
2. Get EAX from the target Leaf 0x400000XY by doing cpuid_eax(0x400000XY).
If (EAX == 0), the feature is not implemented.
If the hypervisor does not implement a particular feature specified by 0x400000XN and 0x400000XN < 0x400000XY, the hypervisor needs to return 0 in EAX with cpuid_eax(0x400000XN) (XN > 0).
.
Jun Nakajima | Intel Open Source Technology Center
Unless there is a central authority assigning these, "we" can do all we
want, enough people will not pay attention.
Basically, there needs to be a standards document that describes the
architecture, *and* needs to either have universal buy-in with all the
vendors or imposed by an authority with enough clout to do so (Intel might.)
-hpa
Yes. It would be nice to have "Hypervisor vendor id" somewhere in
/sys or /proc. It seems that userspace is already hungry for that
information:
http://article.gmane.org/gmane.linux.utilities.util-linux-ng/1788
Karel
--
Karel Zak <kz...@redhat.com>
I think using fixed offsets is unwise, since there's already contention
for the same leaves. Making sure that each block of leaves (where a
block is 16, 256 or some other number of leaves) is self-describing via
ABI signatures is the only sane way to go. There's still the issue of
assigning ABI signatures to vendors, but that's 1) less of an issue, and
2) can be self-assigned with very low likelihood of collision. That way
a guest can scan that region of leaf space for ABI signatures it
understands, and can pick and choose among what it finds (but not mix and
match - that sounds like a recipe for disaster).
If we use such a scheme, we can 1) avoid any existing users of that
space, 2) cleanly delimit a hypervisor-agnostic ABI portion of the leaf
space, and 3) allow hypervisors to implement multiple ABIs at once.
J
If you can't mix and match, there is no point, since very likely all
hypervisors will have at least some unique information.
> If we use such a scheme, we can 1) avoid any existing users of that
> space, 2) cleanly delimit a hypervisor-agnostic ABI portion of the leaf
> space, and 3) allow hypervisors to implement multiple ABIs at once.
Yes, see my previous "half-baked" sketch.
-hpa
This small print is part of the guest/host ABI though, so hypervisors
must agree here too, be it "tsc is constant" or "re-read tsc freq on
$event" or whatever else. Otherwise it isn't a generic interface.
>> The real big problem are other closed-source hypervisors (VirtualPC /
>> Hyper-V / Parallels / ...). How can we be sure they don't define that
>> leaf to something different?
>
> How does that matter, if we are able to standardize all this then,
> hypervisors which want to run a Linux guest should effectively play by
> the standards over here or else they would never work properly on Linux.
Although we are working on world domination I think we are not close
enough yet that this is a realistic point of view.
> Hmm, I am confused, from the patch i posted above, in
> native_calibrate_tsc
>
> + tsc_khz = hypervisor_tsc_freq();
> + if (tsc_khz)
> + return tsc_khz;
>
> We do ignore zero values over here.
Oh, ok.
I expected the check explicitly coded within the hypervisor_tsc_freq()
function. This deserves at least a comment saying that this side effect
is actually intentional.
> I think using fixed offsets is unwise, since there's already contention
> for the same leaves. Making sure that each block of leaves (where a
> block is 16, 256 or some other number of leaves) is self-describing via
> ABI signatures is the only sane way to go. There's still the issue of
Aren't we overthinking / overdesigning this a bit? It's not rocket
science. We'd like to have a leaf set aside for TSC frequency, and
maybe another leaf in the future. We think other vendors might find a
static clock frequency leaf to be useful, so if that happens to be the
case, feel free to re-use the leaf.
We don't expect to see lots of proliferation of CPU leaves at all, in
fact, we'd be flummoxed to propose more than one right now. So
basically a nicely written comment section explaining how the SW CPUID
registers are laid out is probably sufficient. Other vendors can add
to it as they see fit, and Linux itself can be the central standard
body. After all, it's what we all work on, and it makes sense for
everyone here, even MS, to have the software leaves defined in a public
work.
The whole thing is software defined so it's not a big deal if one or all
parties eventually don't play well with others, grow up to become
bullies with ADD, or simply autistic children who ignore the whole
thing. You can always make detection vendor dependent when that
happens.
Right now there's nothing shockingly vendor dependent, just a whole lot
of complicated proposals about how to define what the bits are going to
define and not enough bits of information to actually express. It seems
perfectly okay for now to have new leaf proposals defined by fiat for
now.
As long as there is a vendor-ID leaf, nobody is blocking any forward
progress by adding a new non-conflicting leaf. We can always add the
meta-leafs required for decoding if something tangible materializes, but
for now the TSC leaf seems pretty useful and I would probably want to
proclaim it by fatwa, if I had such a power.
Zach
Sure, some leaves have been defined already, but it's not too late to
try to go the generic route for the remaining leaves.
The longer we wait to come to a consensus on generic leaves, the more
contention we will have. So let's try to address it now.
> Making sure that each block of leaves (where a
> block is 16, 256 or some other number of leaves) is self-describing via
> ABI signatures is the only sane way to go. There's still the issue of
> assigning ABI signatures to vendors, but that's 1) less of an issue, and
> 2) can be self-assigned with very low likelihood of collision. That way
> a guest can scan that region of leaf space for ABI signatures it
> understand, and can pick and choose among what it finds (but not mix and
> match - that sounds like a course for disaster).
>
> If we use such a scheme, we can 1) avoid any existing users of that
> space, 2) cleanly delimit a hypervisor-agnostic ABI portion of the leaf
> space, and 3) allow hypervisors to implement multiple ABIs at once.
I don't agree that this solution makes any difference; OTOH, it just
complicates the situation.
Here is why....
1. This solution really bloats the kernel, below are few observations
i) What happens if two ABI signature blocks define a cpuid leaf which
have similar semantics, how does the kernel handle this ?
ii) Also, think about a case where a hypervisor supports 2 such ABI
blocks - which have cpuid leafs with similar semantics - and decides to
implement one of those and not the other leaf. How does the kernel
handle this ?
I do understand that there are ways to handle this in the kernel but at
the cost of just making the kernel more and more complicated for such
corner cases.
2. Also as a side note, what you are proposing just breaks down the
problem into sub-problems, i.e. we still need to make sure that the
individual cpuid leafs in the "block of leaves" are still generic. Also
we may still have situations where hypervisor "A" wants to define a
subset of cpuid block X that hypervisor "B" had defined.
3. All in all, the end result of doing this would be that each
hypervisor defines its own signature block forgetting about what the
other hypervisor is doing. This is hardly generalizing this space.
Seriously, if we want to generalize this in the current situation, how
about doing this ?
1. Have 0x40000000 as the hypervisor signature leaf.
2. Have 0x40000001-0x4000000F as the hypervisor specific leafs.
3. 0x40000010 - 0x400000FF as the generic space.
Use "non-zero means defined" semantics for this generic space.
Thanks,
Alok
Hi Gerd,
I really fail to see your point here. Maybe you can point out what I am
missing.
Think about the current situation: whenever there is a migration to such a
tsc-is-different system, how does the guest come to know about the
frequency change? Either through a $event, or, if it reboots, it runs the
calibration algorithm.
How does asking the hypervisor for tsc, instead of calibrating it, break
the semantics for this migration?
What special things does Xen do at migration, which would be affected by
this interface ?
> > Hmm, I am confused, from the patch i posted above, in
> > native_calibrate_tsc
> >
> > + tsc_khz = hypervisor_tsc_freq();
> > + if (tsc_khz)
> > + return tsc_khz;
> >
> > We do ignore zero values over here.
>
> Oh, ok.
>
> I expected the check explicitly coded within the hypervisor_tsc_freq()
> function. This deserves at least a comment saying that this side effect
> is actually intentional.
Yep i will document this in my next post.
Thanks,
Alok
We don't have a quorum to create a consensus, since we only have a
subset of the relevant parties present.
We can get away with that if we agree on a standard that includes
positive definition.
> 1. This solution really bloats the kernel, below are few observations
You have no basis for that assertion.
> i) What happens if two ABI signature blocks define a cpuid leaf which
> have similar semantics, how does the kernel handle this ?
We already have this situation in a number of places. The answer is
generally that there is one form that the kernel prefer over another (in
CPUID space, leaf 80000006 over leaf 2, for example) because it is
better designed/more reliable/more complete.
> ii) Also, think about a case where a hypervisor supports 2 such ABI
> blocks - which have cpuid leafs with similar semantics - and decides to
> implement one of those and not the other leaf. How does the kernel
> handle this ?
The standard way to handle that in CPUID space is to leave the
unimplemented leaf as zero.
> I do understand that there are ways to handle this in the kernel but at
> the cost of just making the kernel more and more complicated for such
> corner cases.
The complexity is relatively minor.
> 2. Also as a side note, what you are proposing just breaks down the
> problem in to sub-problems, i.e we still need to make sure that the
> individual cpuid leafs in the "block of leaves" are still generic. Also
> we may still have situations where hypervisor "A" wants to define a
> subset of cpuid block X, hypervisor B had defined.
It gives individual sub-APIs positive identification. This is similar
to PCI capabilities, for example.
> 3. All in all, the end result of doing this would be that each
> hypervisor defines its own signature block forgetting about what the
> other hypervisor is doing. This is hardly generalizing this space.
>
> Seriously, if we want to generalize this in the current situation, how
> about doing this ?
>
> 1. Have 0x40000000 as the hypervisor signature leaf.
> 2. Have 0x40000001-0x4000000F as the hypervisor specific leafs.
> 3. 0x40000010 - 0x400000FF as the generic space.
> Use the non-zero meaning defined semantics for this generic space.
As I said, I don't think we have any kind of quorum to declare such a
"standard", and we'll see violations with failures as a result.
-hpa
No, I don't think we are. Under the circumstances I do not think
anything other than positive identification is acceptable.
If anything, the whole concept of reusing interfaces is what
> We don't expect to see lots of proliferation of CPU leaves at all, in
> fact, we'd be flummoxed to propose more than one right now. So
> basically a nicely written comment section explaining how the SW CPUID
> registers are layed out is probably sufficient. Other vendors can add
> to it as they see fit, and Linux itself can be the central standard
> body. After all, it's what we all work on, and it makes sense for
> everyone here, even MS, to have the software leaves defined in a public
> work.
NIH is a huge factor, and MS is worse than most.
> The whole thing is software defined so it's not a big deal if one or all
> parties eventually don't play well with others, grow up to become
> bullies with ADD, or simply autistic children who ignore the whole
> thing. You can always make detection vendor dependent when that
> happens.
>
> Right now there's nothing shockingly vendor dependent, just a whole lot
> of complicated proposals about how to define what the bits are going to
> define and not enough bits of information to actually express. It seems
> perfectly okay for now to have new leaf proposals defined by fiat for
> now.
>
> As long as there is a vendor-ID leaf, nobody is blocking any forward
> progress by adding a new non-conflicting leaf. We can always add the
> meta-leafs required for decoding if something tangible materializes, but
> for now the TSC leaf seems pretty useful and I would probably want to
> proclaim it by fatwa, if I had such a power.
If someone had the power to proclaim it by fatwa we wouldn't have much
to worry about. Intel might have the power, but we as a group in this
thread definitely do not.
However, it is clear the virtualization industry doesn't have their act
together to the point where one can rely on anything but positive
identification, unlike in the hardware space, where we can rely on
implicit identification, because people aren't stepping on each other's
toes.
-hpa
At least Parallels and Virtual PC/Virtual Server.
OK, if so we should write up a formal proposal.
That's part of the problem -- it's not, and it can't be. It's about the
hypervisor providing a specific service to its guest, and although we
can explain how Linux would use this, there is going to be an
expectation -- especially for Microsoft and Apple -- that other OSes
would use the same interfaces (to the extent they care, obviously.)
Well, that should be clearly defined, that is my point. When asking the
hypervisor for the tsc instead of running a calibration loop, then we
have a small bit of paravirtualization: The guest is aware that it runs
on a hypervisor and just asks it directly. So while we are at it we can
also define a way to communicate tsc freq changes between host and
guest, so the cost of trap'n'emulate tsc reads can be avoided. Or we
define "tsc is constant" and leave it to the hypervisor to make sure it
actually appears being constant to the guest, even in case it changes on
the host. But it must be defined one way or another, so the guest knows
whether it should expect the tsc frequency to change or not. And in case
we allow tsc changes, we also need a way to signal that to the guest.
Is the tsc cpu leaf interface set in stone already (aka implemented in
vmware versions released to public)?
> What special things does Xen do at migration, which would be affected by
> this interface ?
paravirtualized xen guests have a paravirtual clock. That is a struct
containing three pieces of information: system time, tsc counter for the
last system time update, tsc frequency. The guest gets the current time
by reading the system time and adding a delta calculated from current
tsc, tsc of last systime update and tsc frequency. Handling tsc
frequency changes is obviously trivial here, just update the field on
the next systime update ;)
Features that the guest needs to enable very early on, before ACPI is
up. I don't think there are many of those.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
--
> Well, that should be clearly defined, that is my point. When asking the
> hypervisor for the tsc instead of running a calibration loop, then we
> have a small bit of paravirtualization: The guest is aware that it runs
> on a hypervisor and just asks it directly. So while we are at it we can
> also define a way to communicate tsc freq changes between host and
> guest, so the cost of trap'n'emulate tsc reads can be avoided. Or we
> define "tsc is constant" and leave it to the hypervisor to make sure it
For our purposes, we define TSC is constant.
> actually appears being constant to the guest, even in case it changes on
> the host. But it must be defined one way or another, so the guest knows
> whenever it should expect the tsc frequency change or not. And in case
> we allow tsc changes, we also need a way to signal that to the guest.
Non-constant TSCs probably won't want to use CPUID based retrieval, due
to the extra trap it would require to read TSC frequency; it can't be
done at every TSC read (or else, virtualizing TSC frequency has the same
cost and you haven't won anything by making it dynamic). It's also not
clean to issue interrupts to the guest telling it TSC frequency has
changed because the guest may not notice the interrupt before making
computations using the old value, and multiple rapid changes would
require multiple interrupt injections for each affected guest.
> Is the tsc cpu leaf interface set in stone already (aka implemented in
> vmware versions released to public)?
Not to my knowledge.
Zach
On Tue, 2008-09-30 at 01:11 -0700, Gerd Hoffmann wrote:
> Alok Kataria wrote:
> > Hi Gerd,
> >
> > I really fail to see your point here. Maybe you can point out what am i
> > missing.
> > Think about the current situation, whenever there is migration to such a
> > tsc-is-different system , how does the guest come to know about the
> > frequency change, either through a $event or if it reboots it runs the
> > calibration algorithm.
>
> Well, that should be clearly defined, that is my point. When asking the
> hypervisor for the tsc instead of running a calibration loop, then we
> have a small bit of paravirtualization: The guest is aware that it runs
> on a hypervisor and just asks it directly. So while we are at it we can
> also define a way to communicate tsc freq changes between host and
> guest, so the cost of trap'n'emulate tsc reads can be avoided. Or we
> define "tsc is constant" and leave it to the hypervisor to make sure it
> actually appears being constant to the guest, even in case it changes on
> the host. But it must be defined one way or another, so the guest knows
> whenever it should expect the tsc frequency change or not.
Hi Gerd,
As Zach explained, we support the view that the tsc is constant. This Timing
CPUID leaf should be seen just as a way to get the current TSC frequency from
the hypervisor. Also, one thing to note is that this interface allows
us to reinitialize the TSC frequency if the need is felt.
Coming back to the migration problem, as you too acknowledge, migration
to a host with a different frequency should be seen as a different
problem. I would be interested in learning about any proposal that you
may have thought about to handle this.
>
> Is the tsc cpu leaf interface set in stone already (aka implemented in
> vmware versions released to public)?
Yep, this interface is already implemented in the VMware workstation 6.5
product.
>
> > What special things does Xen do at migration, which would be affected by
> > this interface ?
>
> paravirtualized xen guests have a paravirtual clock. That is a struct
> containing three pieces of information: system time, tsc counter for the
> last system time update, tsc frequency. The guest gets the current time
> by reading the system time and adding a delta calculated from current
> tsc, tsc of last systime update and tsc frequency. Handling tsc
> frequency changes is obviously trivial here, just update the field on
> the next systime update ;)
Oh nice, that is convenient.
Thanks,
Alok
Ok, so the guest doesn't have to worry about possible tsc changes when
using that interface. Should go into the comment documenting the leaf.
> Coming back to the migration problem, as you too acknowledge, migration
> to a host with a different frequency should be seen as a different
> problem. I would be interested in learning about any proposal that you
> may have thought about to handle this.
xen paravirtualized is explained below.
xen full virtualized: dunno.
kvm: provide something else for timekeeping to avoid the tsc trouble
altogether if possible. hpet, pm_timer, paravirtualized clocksource.
Obviously can't work for all guests though. paravirtual clocksource
works like the xen one.
cheers,
Gerd
I believe VMware doesn't actually change cpu frequency dynamically. But
what about hypervisors that do? and what about large machines, which do
not actually have a constant tsc?
You are defining something as constant which in fact is not constant.
--
error compiling committee.c: too many arguments to function