Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[PATCH] enable x2APIC without interrupt remapping under KVM

672 views
Skip to first unread message

Gleb Natapov

unread,
Jun 28, 2009, 9:00:22 AM6/28/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d1430ef..7438c5c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -260,7 +260,7 @@ config SMP

config X86_X2APIC
bool "Support x2apic"
- depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
+ depends on X86_LOCAL_APIC && X86_64
---help---
This enables x2apic support on CPUs that have this feature.

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042..48a63b6 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,6 +49,7 @@
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/kvm_para.h>

unsigned int num_processors;

@@ -1363,11 +1364,10 @@ void enable_x2apic(void)
}
#endif /* CONFIG_X86_X2APIC */

-void __init enable_IR_x2apic(void)
+int __init enable_IR(void)
{
#ifdef CONFIG_INTR_REMAP
int ret;
- unsigned long flags;
struct IO_APIC_route_entry **ioapic_entries = NULL;

ret = dmar_table_init();
@@ -1381,11 +1381,10 @@ void __init enable_IR_x2apic(void)
goto ir_failed;
}

-
if (!x2apic_preenabled && skip_ioapic_setup) {
pr_info("Skipped enabling intr-remap because of skipping "
"io-apic setup\n");
- return;
+ goto ir_failed;
}

ioapic_entries = alloc_ioapic_entries();
@@ -1400,22 +1399,14 @@ void __init enable_IR_x2apic(void)
goto end;
}

- local_irq_save(flags);
mask_IO_APIC_setup(ioapic_entries);
- mask_8259A();

ret = enable_intr_remapping(x2apic_supported());
if (ret)
goto end_restore;
-
pr_info("Enabled Interrupt-remapping\n");

- if (x2apic_supported() && !x2apic_mode) {
- x2apic_mode = 1;
- enable_x2apic();
- pr_info("Enabled x2apic\n");
- }
-
+ return 1;
end_restore:
if (ret)
/*
@@ -1423,30 +1414,50 @@ end_restore:
*/
restore_IO_APIC_setup(ioapic_entries);

- unmask_8259A();
- local_irq_restore(flags);
-
end:
if (ioapic_entries)
free_ioapic_entries(ioapic_entries);

- if (!ret)
- return;
-
ir_failed:
- if (x2apic_preenabled)
+#endif
+ return 0;
+}
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ mask_8259A();
+
+ /* IR is required if x2apic is enabled by BIOS even when running in kvm
+ * since this indicates present of APIC ID > 255 */
+ if (!enable_IR() && (x2apic_preenabled || !kvm_para_available()))
+ goto nox2apic;
+
+ if (x2apic_supported() && !x2apic_mode) {
+ x2apic_mode = 1;
+ enable_x2apic();
+ pr_info("Enabled x2apic\n");
+ }
+
+ unmask_8259A();
+ local_irq_restore(flags);
+ return;
+
+nox2apic:
+ unmask_8259A();
+ local_irq_restore(flags);
+
+ if (x2apic_preenabled) {
+#ifdef CONFIG_INTR_REMAP
panic("x2apic enabled by bios. But IR enabling failed");
- else if (cpu_has_x2apic)
- pr_info("Not enabling x2apic,Intr-remapping\n");
#else
- if (!cpu_has_x2apic)
- return;
-
- if (x2apic_preenabled)
panic("x2apic enabled prior OS handover,"
" enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
#endif
-
+ } else if (cpu_has_x2apic)
+ pr_info("Not enabling x2apic,Intr-remapping\n");
return;
}

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Sheng Yang

unread,
Jun 29, 2009, 4:00:23 AM6/29/09
to
On Sunday 28 June 2009 20:51:14 Gleb Natapov wrote:
> KVM would like to provide x2APIC interface to a guest without emulating
> interrupt remapping device. The reason KVM prefers guest to use x2APIC
> is that x2APIC interface is better virtualizable and provides better
> performance than mmio xAPIC interface:
>
> - msr exits are faster than mmio (no page table walk, emulation)
> - no need to read back ICR to look at the busy bit
> - one 64 bit ICR write instead of two 32 bit writes
> - shared code with the Hyper-V paravirt interface
>
> Included patch changes x2APIC enabling logic to enable it even if IR
> initialization failed, but kernel runs under KVM and no apic id is
> greater than 255 (if there is one spec requires BIOS to move to x2apic
> mode before starting an OS).

[Resend, sorry for html noise...]

Add Suresh here. And some comments and unclear points below.

> Signed-off-by: Gleb Natapov <gl...@redhat.com>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index d1430ef..7438c5c 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -260,7 +260,7 @@ config SMP
>
> config X86_X2APIC
> bool "Support x2apic"
> - depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
> + depends on X86_LOCAL_APIC && X86_64
> ---help---
> This enables x2apic support on CPUs that have this feature.

We may need to note x2apic won't work without interrupt remapping or as a
guest in KVM here?

Unrelated...

Not quite understand the comment here. Could you explain why "since this
indicates present of APIC ID > 255?" In another word, why enable_IR() needed
for KVM even it would be fail at dmar_table_init()?

Another question is, we supposed KVM would use physical mode of x2apic, for
IOAPIC and MSI won't support APIC ID > 255. But seems not, or I miss
something?

--
regards
Yang, Sheng

> To unsubscribe from this list: send the line "unsubscribe kvm" in

Gleb Natapov

unread,
Jun 29, 2009, 4:20:28 AM6/29/09
to
On Mon, Jun 29, 2009 at 03:49:45PM +0800, Sheng Yang wrote:
> > Signed-off-by: Gleb Natapov <gl...@redhat.com>
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index d1430ef..7438c5c 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -260,7 +260,7 @@ config SMP
> >
> > config X86_X2APIC
> > bool "Support x2apic"
> > - depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
> > + depends on X86_LOCAL_APIC && X86_64
> > ---help---
> > This enables x2apic support on CPUs that have this feature.
>
> We may need to note x2apic won't work without interrupt remapping or as a
> guest in KVM here?
Will add help text to say this.

> Unrelated...

> Not quite understand the comment here. Could you explain why "since this
> indicates present of APIC ID > 255?" In another word, why enable_IR() needed
> for KVM even it would be fail at dmar_table_init()?
>

The x2APIC doc says:

The default will be for the BIOS to pass the control to the OS with the
local x2APICs in xAPIC mode if all x2APIC IDs reported by CPUID.0BH:EDX
are less than 255, and in x2APIC mode if there are any logical processor
reporting its x2APIC ID at 255 or greater.

So x2apic_preenabled will be true if there is an APIC ID greater then 255
and in this case we don't want to allow x2APIC use without IR even in KVM
for the reason you mention: IOAPIC and MSI will not be able to address
all CPUs. The reason to try to enable IR first is so that when KVM will
want to support more then 255 CPUs it will be possible to implement IR
device and use existing kernel without additional changes.

> Another question is, we supposed KVM would use physical mode of x2apic, for
> IOAPIC and MSI won't support APIC ID > 255. But seems not, or I miss
> something?
>

Good point. The patch should be fixed to require physical mode for x2apic.

Gleb Natapov

unread,
Jun 29, 2009, 6:10:12 AM6/29/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
---

v2:
Add help message to Kconfig
Force physical mode if IR is not available.

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d1430ef..3e5b6ea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -260,12 +260,15 @@ config SMP



config X86_X2APIC
bool "Support x2apic"
- depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
+ depends on X86_LOCAL_APIC && X86_64
---help---
This enables x2apic support on CPUs that have this feature.

This allows 32-bit apic IDs (so it can support very large systems),
- and accesses the local apic via MSRs not via mmio.
+ and accesses the local apic via MSRs not via mmio. CONFIG_INTR_REMAP
+ is required for x2apic to take affect when running on physical
+ machine. If you intend to run the kernel as KVM guest x2apic can
+ be used without interrupt remapping.

If you don't know what to do here, say N.

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042..19d97ae 100644

@@ -1400,9 +1399,7 @@ void __init enable_IR_x2apic(void)


goto end;
}

- local_irq_save(flags);
mask_IO_APIC_setup(ioapic_entries);
- mask_8259A();

ret = enable_intr_remapping(x2apic_supported());
if (ret)

@@ -1410,12 +1407,7 @@ void __init enable_IR_x2apic(void)



pr_info("Enabled Interrupt-remapping\n");

- if (x2apic_supported() && !x2apic_mode) {
- x2apic_mode = 1;
- enable_x2apic();
- pr_info("Enabled x2apic\n");
- }
-
+ return 1;
end_restore:
if (ret)
/*

@@ -1423,30 +1415,57 @@ end_restore:


*/
restore_IO_APIC_setup(ioapic_entries);

- unmask_8259A();
- local_irq_restore(flags);
-
end:
if (ioapic_entries)
free_ioapic_entries(ioapic_entries);

- if (!ret)
- return;
-
ir_failed:
- if (x2apic_preenabled)
+#endif
+ return 0;
+}
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ mask_8259A();
+

+ if (!enable_IR()) {


+ /* IR is required if x2apic is enabled by BIOS even

+ * when running in kvm since this indicates present
+ * of APIC ID > 255 */
+ if (x2apic_preenabled || !kvm_para_available())
+ goto nox2apic;
+ else
+ /* without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode */
+ x2apic_phys = 1;
+ }
+

Ingo Molnar

unread,
Jun 29, 2009, 6:30:30 AM6/29/09
to

Have you tried to build this with X86_X2APIC set but INTR_REMAP
disabled?

Ingo

Gleb Natapov

unread,
Jun 29, 2009, 6:40:14 AM6/29/09
to
Yes. I notice that X86_X2APIC depends on INTR_REMAP when I did it.

--
Gleb.

Suresh Siddha

unread,
Jun 29, 2009, 7:00:19 AM6/29/09
to
On Mon, 2009-06-29 at 03:08 -0700, Gleb Natapov wrote:
>
> - local_irq_save(flags);
> mask_IO_APIC_setup(ioapic_entries);
> - mask_8259A();

Is there a reason why the 8259 mask/unmask operations are separated from
io-apic mask/unmask operations.

Can we keep it together so that it will be easy to read and understand
that we first do the interrupt subsystem mask, try enabling IR and
x2apic and unmask the interrupt subsystem.

Otherwise I am ok with this change.

thanks,
suresh

Gleb Natapov

unread,
Jun 29, 2009, 7:30:12 AM6/29/09
to
On Mon, Jun 29, 2009 at 03:46:56AM -0700, Suresh Siddha wrote:
> On Mon, 2009-06-29 at 03:08 -0700, Gleb Natapov wrote:
> >
> > - local_irq_save(flags);
> > mask_IO_APIC_setup(ioapic_entries);
> > - mask_8259A();
>
> Is there a reason why the 8259 mask/unmask operations are separated from
> io-apic mask/unmask operations.
>
I left interrupt masking in enable_IR_x2apic() because interrupt should
be masked during transition to x2apic mode, so it can't be moved to
enable_IR(). I moved io-apic into enable_IR() because the state of
io-apic depend on whether IR was enabled or not, so I left it close to
IR enabling logic. Also io-apic masking can fail, but we still want to
enable x2apic on KVM if possible, and moving io-apic masking to
enable_IR_x2apic() will complicate the logic.

> Can we keep it together so that it will be easy to read and understand
> that we first do the interrupt subsystem mask, try enabling IR and
> x2apic and unmask the interrupt subsystem.
>

For io-apic this is not 100% symmetric. We mask io-apic, enable IR and
if this succeeds we do not unmask io-apic.

> Otherwise I am ok with this change.
>
> thanks,
> suresh

--
Gleb.

Suresh Siddha

unread,
Jun 29, 2009, 8:00:18 AM6/29/09
to
On Mon, 2009-06-29 at 04:25 -0700, Gleb Natapov wrote:
> I left interrupt masking in enable_IR_x2apic() because interrupt should
> be masked during transition to x2apic mode, so it can't be moved to
> enable_IR(). I moved io-apic into enable_IR() because the state of
> io-apic depend on whether IR was enabled or not, so I left it close to
> IR enabling logic.

This can be based on the outcome of enable_IR().

> Also io-apic masking can fail, but we still want to
> enable x2apic on KVM if possible, and moving io-apic masking to
> enable_IR_x2apic() will complicate the logic.

io-apic masking can fail because of memory allocation failures, right?
If we want a simple solution, we can skip both x2apic and IR. This is
not going to be common scenario anyhow. If we really hit this, we will
worry about more things than x2apic :)

>
> > Can we keep it together so that it will be easy to read and understand
> > that we first do the interrupt subsystem mask, try enabling IR and
> > x2apic and unmask the interrupt subsystem.
> >
> For io-apic this is not 100% symmetric. We mask io-apic, enable IR and
> if this succeeds we do not unmask io-apic.

Correct. Can we do the same, otherwise we might have a situation (under
kvm atleast) where io-apic will be active while we will be changing the
cpu mode. Just trying to be defensive as these might lead to hard to
debug corner case conditions.

thanks,
suresh

Gleb Natapov

unread,
Jun 29, 2009, 9:40:05 AM6/29/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
---

v2:
Add help message to Kconfig
Force physical mode if IR is not available.

v3:
disable io-apic in outer function. Enable x2apic mode with masked
io-apic

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d1430ef..3e5b6ea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -260,12 +260,15 @@ config SMP

config X86_X2APIC
bool "Support x2apic"
- depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
+ depends on X86_LOCAL_APIC && X86_64

---help---
This enables x2apic support on CPUs that have this feature.

This allows 32-bit apic IDs (so it can support very large systems),
- and accesses the local apic via MSRs not via mmio.
+ and accesses the local apic via MSRs not via mmio. CONFIG_INTR_REMAP
+ is required for x2apic to take affect when running on physical
+ machine. If you intend to run the kernel as KVM guest x2apic can
+ be used without interrupt remapping.

If you don't know what to do here, say N.

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c

index 8c7c042..f5f02c3 100644


--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,6 +49,7 @@
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/kvm_para.h>

unsigned int num_processors;

@@ -1363,52 +1364,76 @@ void enable_x2apic(void)


}
#endif /* CONFIG_X86_X2APIC */

-void __init enable_IR_x2apic(void)
+int __init enable_IR(void)
{
#ifdef CONFIG_INTR_REMAP
int ret;
- unsigned long flags;

- struct IO_APIC_route_entry **ioapic_entries = NULL;

ret = dmar_table_init();
if (ret) {
pr_debug("dmar_table_init() failed with %d:\n", ret);
- goto ir_failed;
+ return 0;
}

if (!intr_remapping_supported()) {
pr_debug("intr-remapping not supported\n");
- goto ir_failed;
+ return 0;


}

-
if (!x2apic_preenabled && skip_ioapic_setup) {
pr_info("Skipped enabling intr-remap because of skipping "
"io-apic setup\n");
- return;

+ return 0;
}

+ if (enable_intr_remapping(x2apic_supported()))
+ return 0;
+
+ pr_info("Enabled Interrupt-remapping\n");
+
+ return 1;
+


+#endif
+ return 0;
+}
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;

+ struct IO_APIC_route_entry **ioapic_entries = NULL;
+ int ret, x2apic_enabled = 0;
+
ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
- pr_info("Allocate ioapic_entries failed: %d\n", ret);
- goto end;
+ pr_info("Allocate ioapic_entries failed\n");
+ return;
}

ret = save_IO_APIC_setup(ioapic_entries);
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
- goto end;
+ free_ioapic_entries(ioapic_entries);
+ return;
}

local_irq_save(flags);
- mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
+ mask_IO_APIC_setup(ioapic_entries);

- ret = enable_intr_remapping(x2apic_supported());
- if (ret)
- goto end_restore;
+ ret = enable_IR();
+ if (!ret) {


+ /* IR is required if x2apic is enabled by BIOS even
+ * when running in kvm since this indicates present
+ * of APIC ID > 255 */
+ if (x2apic_preenabled || !kvm_para_available())
+ goto nox2apic;
+ else
+ /* without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode */
+ x2apic_phys = 1;
+ }

- pr_info("Enabled Interrupt-remapping\n");
+ x2apic_enabled = 1;

if (x2apic_supported() && !x2apic_mode) {
x2apic_mode = 1;
@@ -1416,41 +1441,28 @@ void __init enable_IR_x2apic(void)
pr_info("Enabled x2apic\n");
}

-end_restore:
- if (ret)
- /*
- * IR enabling failed
- */
+nox2apic:
+ if (!ret) /* IR enabling failed */
restore_IO_APIC_setup(ioapic_entries);
-
unmask_8259A();
local_irq_restore(flags);

-end:
- if (ioapic_entries)
- free_ioapic_entries(ioapic_entries);
+ free_ioapic_entries(ioapic_entries);

- if (!ret)
+ if (x2apic_enabled)
return;

-ir_failed:
- if (x2apic_preenabled)


+ if (x2apic_preenabled) {
+#ifdef CONFIG_INTR_REMAP
panic("x2apic enabled by bios. But IR enabling failed");
- else if (cpu_has_x2apic)
- pr_info("Not enabling x2apic,Intr-remapping\n");
#else
- if (!cpu_has_x2apic)
- return;
-
- if (x2apic_preenabled)
panic("x2apic enabled prior OS handover,"

- " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
+ " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
#endif
-
- return;


+ } else if (cpu_has_x2apic)
+ pr_info("Not enabling x2apic,Intr-remapping\n");
}

-
#ifdef CONFIG_X86_64
/*
* Detect and enable local APICs on non-SMP boards.
--
Gleb.

Suresh Siddha

unread,
Jun 29, 2009, 11:10:08 AM6/29/09
to
On Mon, 2009-06-29 at 06:29 -0700, Gleb Natapov wrote:
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index d1430ef..3e5b6ea 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -260,12 +260,15 @@ config SMP
>
> config X86_X2APIC
> bool "Support x2apic"
> - depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
> + depends on X86_LOCAL_APIC && X86_64

Thinking more, probably we shouldn't remove this dependency. This might
encourage people (knowingly or unknowingly) to enable x2apic without
interrupt-remapping. Can we remove this? KVM mode will still work even
if we fail to enable interrupt-remapping. So this shouldn't be an issue,
correct?

Sorry I should have commented about this before.

>
> ioapic_entries = alloc_ioapic_entries();
> if (!ioapic_entries) {
> - pr_info("Allocate ioapic_entries failed: %d\n", ret);
> - goto end;
> + pr_info("Allocate ioapic_entries failed\n");
> + return;


We should go to ir_failed ..

> }
>
> ret = save_IO_APIC_setup(ioapic_entries);
> if (ret) {
> pr_info("Saving IO-APIC state failed: %d\n", ret);
> - goto end;
> + free_ioapic_entries(ioapic_entries);
> + return;

same here.

In few hours I will be on 24 hour flight. So please bear with me for
delayed responses.

thanks,
suresh

Gleb Natapov

unread,
Jun 29, 2009, 11:20:07 AM6/29/09
to
On Mon, Jun 29, 2009 at 07:58:39AM -0700, Suresh Siddha wrote:
> On Mon, 2009-06-29 at 06:29 -0700, Gleb Natapov wrote:
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index d1430ef..3e5b6ea 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -260,12 +260,15 @@ config SMP
> >
> > config X86_X2APIC
> > bool "Support x2apic"
> > - depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
> > + depends on X86_LOCAL_APIC && X86_64
>
> Thinking more, probably we shouldn't remove this dependency. This might
> encourage people (knowingly or unknowingly) to enable x2apic without
> interrupt-remapping. Can we remove this? KVM mode will still work even
> if we fail to enable interrupt-remapping. So this shouldn't be an issue,
> correct?
>
Yes, KVM will still work. I don't have strong fillings one way or the
other, but why mandate an option that is no longer mandatory. What
others think?

> Sorry I should have commented about this before.
>
> >
> > ioapic_entries = alloc_ioapic_entries();
> > if (!ioapic_entries) {
> > - pr_info("Allocate ioapic_entries failed: %d\n", ret);
> > - goto end;
> > + pr_info("Allocate ioapic_entries failed\n");
> > + return;
>
>
> We should go to ir_failed ..

Why? There is not more ir_failed.

>
> > }
> >
> > ret = save_IO_APIC_setup(ioapic_entries);
> > if (ret) {
> > pr_info("Saving IO-APIC state failed: %d\n", ret);
> > - goto end;
> > + free_ioapic_entries(ioapic_entries);
> > + return;
>
> same here.
>

Why? What ir_failed should do? Call free_ioapic_entries(ioapic_entries)
and exit?

--
Gleb.

Suresh Siddha

unread,
Jun 29, 2009, 11:20:08 AM6/29/09
to
On Mon, 2009-06-29 at 08:10 -0700, Gleb Natapov wrote:
> On Mon, Jun 29, 2009 at 07:58:39AM -0700, Suresh Siddha wrote:
> > Thinking more, probably we shouldn't remove this dependency. This might
> > encourage people (knowingly or unknowingly) to enable x2apic without
> > interrupt-remapping. Can we remove this? KVM mode will still work even
> > if we fail to enable interrupt-remapping. So this shouldn't be an issue,
> > correct?
> >
> Yes, KVM will still work. I don't have strong fillings one way or the
> other, but why mandate an option that is no longer mandatory. What
> others think?

Only under the presence of KVM we are breaking this dependency. And
typically the same kernel runs natively and under the presence of kvm
correct. So thats why we shouldn't break this dependency.


> > > ioapic_entries = alloc_ioapic_entries();
> > > if (!ioapic_entries) {
> > > - pr_info("Allocate ioapic_entries failed: %d\n", ret);
> > > - goto end;
> > > + pr_info("Allocate ioapic_entries failed\n");
> > > + return;
> >
> >
> > We should go to ir_failed ..
> Why? There is not more ir_failed.

oops. Not literally. We should goto the point where we should report the
x2apic or ir enabling failed and check for x2apic pre-enabled etc at the
end.

Gleb Natapov

unread,
Jun 29, 2009, 11:50:07 AM6/29/09
to
On Mon, Jun 29, 2009 at 08:15:05AM -0700, Suresh Siddha wrote:
> On Mon, 2009-06-29 at 08:10 -0700, Gleb Natapov wrote:
> > On Mon, Jun 29, 2009 at 07:58:39AM -0700, Suresh Siddha wrote:
> > > Thinking more, probably we shouldn't remove this dependency. This might
> > > encourage people (knowingly or unknowingly) to enable x2apic without
> > > interrupt-remapping. Can we remove this? KVM mode will still work even
> > > if we fail to enable interrupt-remapping. So this shouldn't be an issue,
> > > correct?
> > >
> > Yes, KVM will still work. I don't have strong fillings one way or the
> > other, but why mandate an option that is no longer mandatory. What
> > others think?
>
> Only under the presence of KVM we are breaking this dependency. And
> typically the same kernel runs natively and under the presence of kvm
> correct. So thats why we shouldn't break this dependency.
>
OK, I'll drop Kconfig part in the next version. Unless someone will
object till tomorrow.

>
> > > > ioapic_entries = alloc_ioapic_entries();
> > > > if (!ioapic_entries) {
> > > > - pr_info("Allocate ioapic_entries failed: %d\n", ret);
> > > > - goto end;
> > > > + pr_info("Allocate ioapic_entries failed\n");
> > > > + return;
> > >
> > >
> > > We should go to ir_failed ..
> > Why? There is not more ir_failed.
>
> oops. Not literally. We should goto the point where we should report the
> x2apic or ir enabling failed and check for x2apic pre-enabled etc at the
> end.
>

Ah, OK.

--
Gleb.

Gleb Natapov

unread,
Jun 30, 2009, 2:50:09 AM6/30/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042..351d55a 100644


--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,6 +49,7 @@
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/kvm_para.h>

unsigned int num_processors;

@@ -1363,52 +1364,75 @@ void enable_x2apic(void)

ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
- pr_info("Allocate ioapic_entries failed: %d\n", ret);
- goto end;
+ pr_info("Allocate ioapic_entries failed\n");

+ goto out;


}

ret = save_IO_APIC_setup(ioapic_entries);
if (ret) {

pr_info("Saving IO-APIC state failed: %d\n", ret);
- goto end;
+ goto out;


}

local_irq_save(flags);
- mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
+ mask_IO_APIC_setup(ioapic_entries);

- ret = enable_intr_remapping(x2apic_supported());
- if (ret)
- goto end_restore;
+ ret = enable_IR();
+ if (!ret) {
+ /* IR is required if x2apic is enabled by BIOS even
+ * when running in kvm since this indicates present
+ * of APIC ID > 255 */
+ if (x2apic_preenabled || !kvm_para_available())
+ goto nox2apic;
+ else
+ /* without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode */
+ x2apic_phys = 1;
+ }

- pr_info("Enabled Interrupt-remapping\n");
+ x2apic_enabled = 1;

if (x2apic_supported() && !x2apic_mode) {
x2apic_mode = 1;

@@ -1416,41 +1440,30 @@ void __init enable_IR_x2apic(void)


pr_info("Enabled x2apic\n");
}

-end_restore:
- if (ret)
- /*
- * IR enabling failed
- */
+nox2apic:
+ if (!ret) /* IR enabling failed */
restore_IO_APIC_setup(ioapic_entries);
-
unmask_8259A();
local_irq_restore(flags);

-end:

+out:
if (ioapic_entries)

Yinghai Lu

unread,
Jun 30, 2009, 3:20:07 AM6/30/09
to
> + � � � }

how about kexec second kernel in KVM ?

x2apic_preenabled will be set in second kernel.

YH

Gleb Natapov

unread,
Jun 30, 2009, 4:00:13 AM6/30/09
to
On Tue, Jun 30, 2009 at 12:18:19AM -0700, Yinghai Lu wrote:
> On Mon, Jun 29, 2009 at 11:45 PM, Gleb Natapov<gl...@redhat.com> wrote:
> > KVM would like to provide x2APIC interface to a guest without emulating
> > interrupt remapping device. The reason KVM prefers guest to use x2APIC
> > is that x2APIC interface is better virtualizable and provides better
> > performance than mmio xAPIC interface:
> >
> > - msr exits are faster than mmio (no page table walk, emulation)
> > - no need to read back ICR to look at the busy bit
> > - one 64 bit ICR write instead of two 32 bit writes
> > - shared code with the Hyper-V paravirt interface
> >
> > Included patch changes x2APIC enabling logic to enable it even if IR
> > initialization failed, but kernel runs under KVM and no apic id is
> > greater than 255 (if there is one spec requires BIOS to move to x2apic
> > mode before starting an OS).
> >
> > Signed-off-by: Gleb Natapov <gl...@redhat.com>
> > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> > index 8c7c042..351d55a 100644
> > --- a/arch/x86/kernel/apic/apic.c
> > +++ b/arch/x86/kernel/apic/apic.c
> > @@ -49,6 +49,7 @@
> > О©╫#include <asm/mtrr.h>
> > О©╫#include <asm/smp.h>
> > О©╫#include <asm/mce.h>
> > +#include <asm/kvm_para.h>
> >
> > О©╫unsigned int num_processors;

> >
> > @@ -1363,52 +1364,75 @@ void enable_x2apic(void)
> > О©╫}
> > О©╫#endif /* CONFIG_X86_X2APIC */

> >
> > -void __init enable_IR_x2apic(void)
> > +int __init enable_IR(void)
> > О©╫{
> > О©╫#ifdef CONFIG_INTR_REMAP
> > О©╫ О©╫ О©╫ О©╫int ret;
> > - О©╫ О©╫ О©╫ unsigned long flags;
> > - О©╫ О©╫ О©╫ struct IO_APIC_route_entry **ioapic_entries = NULL;
> >
> > О©╫ О©╫ О©╫ О©╫ret = dmar_table_init();
> > О©╫ О©╫ О©╫ О©╫if (ret) {
> > О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫pr_debug("dmar_table_init() failed with %d:\n", ret);
> > - О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ goto ir_failed;
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ return 0;
> > О©╫ О©╫ О©╫ О©╫}
> >
> > О©╫ О©╫ О©╫ О©╫if (!intr_remapping_supported()) {
> > О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫pr_debug("intr-remapping not supported\n");
> > - О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ goto ir_failed;
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ return 0;
> > О©╫ О©╫ О©╫ О©╫}
> >
> > -
> > О©╫ О©╫ О©╫ О©╫if (!x2apic_preenabled && skip_ioapic_setup) {
> > О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫pr_info("Skipped enabling intr-remap because of skipping "
> > О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫"io-apic setup\n");
> > - О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ return;
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ return 0;
> > О©╫ О©╫ О©╫ О©╫}
> >
> > + О©╫ О©╫ О©╫ if (enable_intr_remapping(x2apic_supported()))
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ return 0;
> > +
> > + О©╫ О©╫ О©╫ pr_info("Enabled Interrupt-remapping\n");
> > +
> > + О©╫ О©╫ О©╫ return 1;
> > +
> > +#endif
> > + О©╫ О©╫ О©╫ return 0;

> > +}
> > +
> > +void __init enable_IR_x2apic(void)
> > +{
> > + О©╫ О©╫ О©╫ unsigned long flags;
> > + О©╫ О©╫ О©╫ struct IO_APIC_route_entry **ioapic_entries = NULL;
> > + О©╫ О©╫ О©╫ int ret, x2apic_enabled = 0;
> > +
> > О©╫ О©╫ О©╫ О©╫ioapic_entries = alloc_ioapic_entries();
> > О©╫ О©╫ О©╫ О©╫if (!ioapic_entries) {
> > - О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ pr_info("Allocate ioapic_entries failed: %d\n", ret);
> > - О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ goto end;
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫pr_info("Allocate ioapic_entries failed\n");
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫goto out;
> > О©╫ О©╫ О©╫ О©╫}
> >
> > О©╫ О©╫ О©╫ О©╫ret = save_IO_APIC_setup(ioapic_entries);
> > О©╫ О©╫ О©╫ О©╫if (ret) {
> > О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫pr_info("Saving IO-APIC state failed: %d\n", ret);
> > - О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ goto end;
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ goto out;
> > О©╫ О©╫ О©╫ О©╫}
> >
> > О©╫ О©╫ О©╫ О©╫local_irq_save(flags);
> > - О©╫ О©╫ О©╫ mask_IO_APIC_setup(ioapic_entries);
> > О©╫ О©╫ О©╫ О©╫mask_8259A();
> > + О©╫ О©╫ О©╫ mask_IO_APIC_setup(ioapic_entries);
> >
> > - О©╫ О©╫ О©╫ ret = enable_intr_remapping(x2apic_supported());
> > - О©╫ О©╫ О©╫ if (ret)
> > - О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ goto end_restore;
> > + О©╫ О©╫ О©╫ ret = enable_IR();
> > + О©╫ О©╫ О©╫ if (!ret) {
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ /* IR is required if x2apic is enabled by BIOS even
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫* when running in kvm since this indicates present
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫* of APIC ID > 255 */
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ if (x2apic_preenabled || !kvm_para_available())
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ goto nox2apic;
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ else
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ /* without IR all CPUs can be addressed by IOAPIC/MSI
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫* only in physical mode */
> > + О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ О©╫ x2apic_phys = 1;
> > + О©╫ О©╫ О©╫ }

>
> how about kexec second kernel in KVM ?
>
> x2apic_preenabled will be set in second kernel.
>
Yes, bummer. But the similar problem exist now and without KVM. After
running x2apic enabled kernel you can't kexec kernel without x2apic
support. Shouldn't apic be reset to its initial state on reboot?

--
Gleb.

Eric W. Biederman

unread,
Jun 30, 2009, 5:30:16 AM6/30/09
to
Gleb Natapov <gl...@redhat.com> writes:

> KVM would like to provide x2APIC interface to a guest without emulating
> interrupt remapping device. The reason KVM prefers guest to use x2APIC
> is that x2APIC interface is better virtualizable and provides better
> performance than mmio xAPIC interface:
>
> - msr exits are faster than mmio (no page table walk, emulation)
> - no need to read back ICR to look at the busy bit
> - one 64 bit ICR write instead of two 32 bit writes
> - shared code with the Hyper-V paravirt interface
>
> Included patch changes x2APIC enabling logic to enable it even if IR
> initialization failed, but kernel runs under KVM and no apic id is
> greater than 255 (if there is one spec requires BIOS to move to x2apic
> mode before starting an OS).


How common is hotplug hardware in kvm? In particular hotplug cpus?

To support that seriously you need interrupt remapping.

Eric

Gleb Natapov

unread,
Jun 30, 2009, 5:30:18 AM6/30/09
to
On Tue, Jun 30, 2009 at 02:24:05AM -0700, Eric W. Biederman wrote:
> Gleb Natapov <gl...@redhat.com> writes:
>
> > KVM would like to provide x2APIC interface to a guest without emulating
> > interrupt remapping device. The reason KVM prefers guest to use x2APIC
> > is that x2APIC interface is better virtualizable and provides better
> > performance than mmio xAPIC interface:
> >
> > - msr exits are faster than mmio (no page table walk, emulation)
> > - no need to read back ICR to look at the busy bit
> > - one 64 bit ICR write instead of two 32 bit writes
> > - shared code with the Hyper-V paravirt interface
> >
> > Included patch changes x2APIC enabling logic to enable it even if IR
> > initialization failed, but kernel runs under KVM and no apic id is
> > greater than 255 (if there is one spec requires BIOS to move to x2apic
> > mode before starting an OS).
>
>
> How common is hotplug hardware in kvm? In particular hotplug cpus?
>
It works for Linux guests.

> To support that seriously you need interrupt remapping.
>

Can you explain why?

--
Gleb.

Gleb Natapov

unread,
Jun 30, 2009, 12:00:12 PM6/30/09
to
On Tue, Jun 30, 2009 at 12:18:19AM -0700, Yinghai Lu wrote:
> how about kexec second kernel in KVM ?
>
> x2apic_preenabled will be set in second kernel.
>
By the way anybody knows why kexec does not use BIOS reset code
(cmos 0xf offset) to jump into new kernel after hard reset?

--
Gleb.

Eric W. Biederman

unread,
Jun 30, 2009, 12:50:14 PM6/30/09
to
Gleb Natapov <gl...@redhat.com> writes:

> On Tue, Jun 30, 2009 at 02:24:05AM -0700, Eric W. Biederman wrote:
>> Gleb Natapov <gl...@redhat.com> writes:
>>
>> > KVM would like to provide x2APIC interface to a guest without emulating
>> > interrupt remapping device. The reason KVM prefers guest to use x2APIC
>> > is that x2APIC interface is better virtualizable and provides better
>> > performance than mmio xAPIC interface:
>> >
>> > - msr exits are faster than mmio (no page table walk, emulation)
>> > - no need to read back ICR to look at the busy bit
>> > - one 64 bit ICR write instead of two 32 bit writes
>> > - shared code with the Hyper-V paravirt interface
>> >
>> > Included patch changes x2APIC enabling logic to enable it even if IR
>> > initialization failed, but kernel runs under KVM and no apic id is
>> > greater than 255 (if there is one spec requires BIOS to move to x2apic
>> > mode before starting an OS).
>>
>>
>> How common is hotplug hardware in kvm? In particular hotplug cpus?
>>
> It works for Linux guests.

>
>> To support that seriously you need interrupt remapping.
>>
> Can you explain why?

Because ioapics don't fully function according to spec,
and the interrupt code on the hotplug path is a horrible
terrible broken hack for ioapics.

It is better than nothing but it certainly is not something
I would expect to work all of the time.

Interrupt remapping is the one case where we have hardware
that works according to spec and that works reasonably well.

Eric

Eric W. Biederman

unread,
Jun 30, 2009, 1:10:14 PM6/30/09
to
Gleb Natapov <gl...@redhat.com> writes:

> On Tue, Jun 30, 2009 at 12:18:19AM -0700, Yinghai Lu wrote:
>> how about kexec second kernel in KVM ?
>>
>> x2apic_preenabled will be set in second kernel.
>>
> By the way anybody knows why kexec does not use BIOS reset code
> (cmos 0xf offset) to jump into new kernel after hard reset?

After a hard reset. That simply isn't possible. A hard
reset clears everything even memory.

You might be able to get a full cpu reset but not a reset of the I/O
devices.

The premise of kexec is that we are doing things on our own, and don't
get a 3rd piece of software involved that has not been heavily tested
on the path we want to use. Occassionally it is a pain to do everything
ourselves but at least when we do and we test it we know it is going
to work.

Cpu designers lately seem fond of adding features that require all
kind of coordination to turn on and off. We handle the hardware
virtualization mode features now, and if x2apic has similar problems
being turned on and off I am certain we can handle that case in a
similar fashion.

When we can my preference is to keep code like that out of the
kexec on panic path if we can figure out how to write the software
to do something reasonable.

Once we figure out how to work without putting the interrupt controllers
in legacy mode to handle the timer interrupts I expect all kinds of
things will become simpler.

Eric

Avi Kivity

unread,
Jun 30, 2009, 1:20:06 PM6/30/09
to
On 06/30/2009 07:44 PM, Eric W. Biederman wrote:
>>> To support that seriously you need interrupt remapping.
>>>
>>>
>> Can you explain why?
>>
>
> Because ioapics don't fully function according to spec,
> and the interrupt code on the hotplug path is a horrible
> terrible broken hack for ioapics.
>
> It is better than nothing but it certainly is not something
> I would expect to work all of the time.
>

Can you elaborate? For kvm guests, the hardware is reasonably will
implemented and if not we will fix it. We need not cripple a feature
just because some hardware is broken.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

Avi Kivity

unread,
Jun 30, 2009, 1:20:08 PM6/30/09
to
On 06/30/2009 08:00 PM, Eric W. Biederman wrote:
> Gleb Natapov<gl...@redhat.com> writes:
>
>
>> On Tue, Jun 30, 2009 at 12:18:19AM -0700, Yinghai Lu wrote:
>>
>>> how about kexec second kernel in KVM ?
>>>
>>> x2apic_preenabled will be set in second kernel.
>>>
>>>
>> By the way anybody knows why kexec does not use BIOS reset code
>> (cmos 0xf offset) to jump into new kernel after hard reset?
>>
>
> After a hard reset. That simply isn't possible. A hard
> reset clears everything even memory.
>
>

So the jump-to-vector reset code cannot work on real hardware? It works
in qemu, but of course that's easy.

> You might be able to get a full cpu reset but not a reset of the I/O
> devices.
>
> The premise of kexec is that we are doing things on our own, and don't
> get a 3rd piece of software involved that has not been heavily tested
> on the path we want to use. Occassionally it is a pain to do everything
> ourselves but at least when we do and we test it we know it is going
> to work.
>
> Cpu designers lately seem fond of adding features that require all
> kind of coordination to turn on and off. We handle the hardware
> virtualization mode features now, and if x2apic has similar problems
> being turned on and off I am certain we can handle that case in a
> similar fashion.
>
> When we can my preference is to keep code like that out of the
> kexec on panic path if we can figure out how to write the software
> to do something reasonable.
>
> Once we figure out how to work without putting the interrupt controllers
> in legacy mode to handle the timer interrupts I expect all kinds of
> things will become simpler.
>

I think it makes sense to add full reset support as a non-default
option. Leaving hardware running and dmaing left and right has its risks.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--

Eric W. Biederman

unread,
Jun 30, 2009, 1:20:12 PM6/30/09
to
Gleb Natapov <gl...@redhat.com> writes:

> On Tue, Jun 30, 2009 at 12:18:19AM -0700, Yinghai Lu wrote:
>> how about kexec second kernel in KVM ?
>>
>> x2apic_preenabled will be set in second kernel.
>>
> By the way anybody knows why kexec does not use BIOS reset code
> (cmos 0xf offset) to jump into new kernel after hard reset?

What I said before plus the fact that kexec grew up on systems
that don't have a BIOS. So the option of using that piece of
was not there.

Eric

Yinghai Lu

unread,
Jun 30, 2009, 2:50:10 PM6/30/09
to
2009/6/30 Gleb Natapov <gl...@redhat.com>:

>>
>> how about kexec second kernel in KVM ?
>>
>> x2apic_preenabled will be set in second kernel.
>>
> Yes, bummer. But the similar problem exist now and without KVM. After
> running x2apic enabled kernel you can't kexec kernel without x2apic
> support.

if the first kernel enable x2apic and intr_remapping, and second
kernel has x2apic and intr_remap support.
we can kexec the second kernel.

restoring to original state is not possible for kdump case.

maybe with minor change to your patch, you could kexec the in kexec
second kernel with x2apic support.

YH

Gleb Natapov

unread,
Jun 30, 2009, 3:00:15 PM6/30/09
to
On Tue, Jun 30, 2009 at 11:43:41AM -0700, Yinghai Lu wrote:
> 2009/6/30 Gleb Natapov <gl...@redhat.com>:
>
> >>
> >> how about kexec second kernel in KVM ?
> >>
> >> x2apic_preenabled will be set in second kernel.
> >>
> > Yes, bummer. But the similar problem exist now and without KVM. After
> > running x2apic enabled kernel you can't kexec kernel without x2apic
> > support.
>
> if the first kernel enable x2apic and intr_remapping, and second
> kernel has x2apic and intr_remap support.
> we can kexec the second kernel.
>
And if it hasn't we cannot, that is what I am trying to point out.

> restoring to original state is not possible for kdump case.
>

Shouldn't it be enough to run UP kernel for kdump? So kdump can reset BSP
apic to xAPIC mode before starting a kernel with maxcpus=1.

> maybe with minor change to your patch, you could kexec the in kexec
> second kernel with x2apic support.
>

My patch can easily avoid the issue by checking max_physical_apicid > 255
instead of x2apic_preenabled.

--
Gleb.

Gleb Natapov

unread,
Jun 30, 2009, 3:10:11 PM6/30/09
to
On Tue, Jun 30, 2009 at 09:44:54AM -0700, Eric W. Biederman wrote:
> Gleb Natapov <gl...@redhat.com> writes:
>
> > On Tue, Jun 30, 2009 at 02:24:05AM -0700, Eric W. Biederman wrote:
> >> Gleb Natapov <gl...@redhat.com> writes:
> >>
> >> > KVM would like to provide x2APIC interface to a guest without emulating
> >> > interrupt remapping device. The reason KVM prefers guest to use x2APIC
> >> > is that x2APIC interface is better virtualizable and provides better
> >> > performance than mmio xAPIC interface:
> >> >
> >> > - msr exits are faster than mmio (no page table walk, emulation)
> >> > - no need to read back ICR to look at the busy bit
> >> > - one 64 bit ICR write instead of two 32 bit writes
> >> > - shared code with the Hyper-V paravirt interface
> >> >
> >> > Included patch changes x2APIC enabling logic to enable it even if IR
> >> > initialization failed, but kernel runs under KVM and no apic id is
> >> > greater than 255 (if there is one spec requires BIOS to move to x2apic
> >> > mode before starting an OS).
> >>
> >>
> >> How common is hotplug hardware in kvm? In particular hotplug cpus?
> >>
> > It works for Linux guests.
>
> >
> >> To support that seriously you need interrupt remapping.
> >>
> > Can you explain why?
>
> Because ioapics don't fully function according to spec,
> and the interrupt code on the hotplug path is a horrible
> terrible broken hack for ioapics.
>
Considering that interrupt remapping is fairly new feature
are you saying that hotplug (pci and cpu) on x86 is a horrible
hack on Linux?

> It is better than nothing but it certainly is not something
> I would expect to work all of the time.
>

Because of horrible code or non-complaint ioapic implementation out
there? If later this is not a big issue for KVM.

> Interrupt remapping is the one case where we have hardware
> that works according to spec and that works reasonably well.
>

I am sure when there was only one ioapic implementation in existence
it worked according to a spec (and if not spec was changed :)) Give
interrupt remapping some time and than we will see how above statement
holds.

--
Gleb.

Eric W. Biederman

unread,
Jun 30, 2009, 3:10:11 PM6/30/09
to
Avi Kivity <a...@redhat.com> writes:

> On 06/30/2009 07:44 PM, Eric W. Biederman wrote:
>>>> To support that seriously you need interrupt remapping.
>>>>
>>>>
>>> Can you explain why?
>>>
>>
>> Because ioapics don't fully function according to spec,
>> and the interrupt code on the hotplug path is a horrible
>> terrible broken hack for ioapics.
>>
>> It is better than nothing but it certainly is not something
>> I would expect to work all of the time.
>>
>
> Can you elaborate? For kvm guests, the hardware is reasonably will implemented
> and if not we will fix it. We need not cripple a feature just because some
> hardware is broken.

The short version is I don't know what work arounds we will ultimately
decide to deploy to work with real hardware.

I have been seriously contemplating causing a cpu hot-unplug request
to fail if we are in ioapic mode and we have irqs routed to the cpu
that is being unplugged.

Even with perfectly working hardware it is not possible in the general
case to migrate an ioapic irq from one cpu to another outside of an
interrupt handler without without risking dropping an interrupt.
There is no general way to know you have seen the last interrupt
floating around your system. PCI ordering rules don't help because
the ioapics can potentially take an out of band channel.

The interrupt remapping hardware is pci and is only present on systems
with in-band interrupts, so we can flush in-flight interrupts using
pci reads. We can safely and do reprogram the hardware to point at
different cpus at arbitrary times.

So if you possibly can please emulate hardware that is known to work
reliably for cpu hotplug.

Eric

Avi Kivity

unread,
Jun 30, 2009, 3:20:10 PM6/30/09
to
On 06/30/2009 10:08 PM, Eric W. Biederman wrote:
>> Can you elaborate? For kvm guests, the hardware is reasonably will implemented
>> and if not we will fix it. We need not cripple a feature just because some
>> hardware is broken.
>>
>
> The short version is I don't know what work arounds we will ultimately
> decide to deploy to work with real hardware.
>
> I have been seriously contemplating causing a cpu hot-unplug request
> to fail if we are in ioapic mode and we have irqs routed to the cpu
> that is being unplugged.
>

Well, obviously we need to disassociate any irqs from such a cpu. Could
be done from the kernel or only enforced by the kernel.

> Even with perfectly working hardware it is not possible in the general
> case to migrate an ioapic irq from one cpu to another outside of an
> interrupt handler without without risking dropping an interrupt.
>

Can't you generate a spurious interrupt immediately after the
migration? An extra interrupt shouldn't hurt.

> There is no general way to know you have seen the last interrupt
> floating around your system. PCI ordering rules don't help because
> the ioapics can potentially take an out of band channel.
>

Can you describe the problem scenario? an ioapic->lapic message
delivered to a dead cpu?


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--

Gleb Natapov

unread,
Jun 30, 2009, 3:20:10 PM6/30/09
to
On Tue, Jun 30, 2009 at 10:00:46AM -0700, Eric W. Biederman wrote:
> Gleb Natapov <gl...@redhat.com> writes:
>
> > On Tue, Jun 30, 2009 at 12:18:19AM -0700, Yinghai Lu wrote:
> >> how about kexec second kernel in KVM ?
> >>
> >> x2apic_preenabled will be set in second kernel.
> >>
> > By the way anybody knows why kexec does not use BIOS reset code
> > (cmos 0xf offset) to jump into new kernel after hard reset?
>
> After a hard reset. That simply isn't possible. A hard
> reset clears everything even memory.
>
I mean "soft reset". At least on 286 in order to move from protected
mode to real mode "soft reset" was issued via kbd controller or via
triple fault. BIOS reset code was used to avoid POST. Obviously back
than this kind of reset haven't cleared memory. I'll try to check what
happens on a modern HW.

BTW Linux sets BIOS reset code to "return by long jump" in
smpboot_setup_warm_reset() for some reason.

> You might be able to get a full cpu reset but not a reset of the I/O
> devices.
>
> The premise of kexec is that we are doing things on our own, and don't
> get a 3rd piece of software involved that has not been heavily tested
> on the path we want to use. Occassionally it is a pain to do everything
> ourselves but at least when we do and we test it we know it is going
> to work.
>
> Cpu designers lately seem fond of adding features that require all
> kind of coordination to turn on and off. We handle the hardware
> virtualization mode features now, and if x2apic has similar problems
> being turned on and off I am certain we can handle that case in a
> similar fashion.
>
> When we can my preference is to keep code like that out of the
> kexec on panic path if we can figure out how to write the software
> to do something reasonable.
>
> Once we figure out how to work without putting the interrupt controllers
> in legacy mode to handle the timer interrupts I expect all kinds of
> things will become simpler.
>

If the "soft reset" method can work it worth to implemented. I am not
suggesting we should make it default (just line --real-mode is not default).

--
Gleb.

Eric W. Biederman

unread,
Jun 30, 2009, 3:30:18 PM6/30/09
to
Avi Kivity <a...@redhat.com> writes:

> So the jump-to-vector reset code cannot work on real hardware? It works in
> qemu, but of course that's easy.

In general yes. Nothing regularly uses that path so there is little
point adding a new users if we can avoid it.

Heck there are machines with unreliable cmos.

>> You might be able to get a full cpu reset but not a reset of the I/O
>> devices.
>>
>> The premise of kexec is that we are doing things on our own, and don't
>> get a 3rd piece of software involved that has not been heavily tested
>> on the path we want to use. Occassionally it is a pain to do everything
>> ourselves but at least when we do and we test it we know it is going
>> to work.
>>
>> Cpu designers lately seem fond of adding features that require all
>> kind of coordination to turn on and off. We handle the hardware
>> virtualization mode features now, and if x2apic has similar problems
>> being turned on and off I am certain we can handle that case in a
>> similar fashion.
>>
>> When we can my preference is to keep code like that out of the
>> kexec on panic path if we can figure out how to write the software
>> to do something reasonable.
>>
>> Once we figure out how to work without putting the interrupt controllers
>> in legacy mode to handle the timer interrupts I expect all kinds of
>> things will become simpler.
>>
>
> I think it makes sense to add full reset support as a non-default option.
> Leaving hardware running and dmaing left and right has its risks.

It is easy enough to do that in user space in /sbin/kexec if that proves
useful for some reason.

As for the leaving hardware running and dmaing left and right. You
are talking about the kexec on panic code path. On a normal reboot
we do our darndest to cleanly shut everything down properly.

We avoid the dmaing left and right problem by reserving a chunk of
memory at boot time and we probably should be reserving a chunk
of the iommus as well for that purpose.

Hardware resets in general don't preserve the contents of memory.

In general the kexec on panic path is about teaching drivers to
initialize in the worst possible scenario and using those drivers
that can to get information out of the system.

At the same time 99% of the code that runs is in user space so
it should be easy to jump through whatever hoops make sense in
the user space component. While keeping the kexec on panic code
path as slim as possible.

Eric

Eric W. Biederman

unread,
Jun 30, 2009, 3:40:08 PM6/30/09
to
Avi Kivity <a...@redhat.com> writes:

> On 06/30/2009 10:08 PM, Eric W. Biederman wrote:
>>> Can you elaborate? For kvm guests, the hardware is reasonably will implemented
>>> and if not we will fix it. We need not cripple a feature just because some
>>> hardware is broken.
>>>
>>
>> The short version is I don't know what work arounds we will ultimately
>> decide to deploy to work with real hardware.
>>
>> I have been seriously contemplating causing a cpu hot-unplug request
>> to fail if we are in ioapic mode and we have irqs routed to the cpu
>> that is being unplugged.
>>
>
> Well, obviously we need to disassociate any irqs from such a cpu. Could be done
> from the kernel or only enforced by the kernel.

Using the normal irq migration path we can move irqs off of a cpu reliably
there just aren't any progress guarantees.

>> Even with perfectly working hardware it is not possible in the general
>> case to migrate an ioapic irq from one cpu to another outside of an
>> interrupt handler without without risking dropping an interrupt.
>>
>
> Can't you generate a spurious interrupt immediately after the migration? An
> extra interrupt shouldn't hurt.

Nope. The ioapics can't be told to send an interrupt.

>> There is no general way to know you have seen the last interrupt
>> floating around your system. PCI ordering rules don't help because
>> the ioapics can potentially take an out of band channel.
>>
>
> Can you describe the problem scenario? an ioapic->lapic message delivered to a
> dead cpu?

Dropped irqs.. Driver hangs because it is waiting for an irq. Hardware
hangs because it is waiting for the cpu to process the irq.

Potentially we get a level triggered irq that is never acked by
the cpu that won't arm until the cpu send an ack, and we can't
send an ack from another cpu.

Eric

Eric W. Biederman

unread,
Jun 30, 2009, 3:50:14 PM6/30/09
to
Gleb Natapov <gl...@redhat.com> writes:

> Considering that interrupt remapping is fairly new feature
> are you saying that hotplug (pci and cpu) on x86 is a horrible
> hack on Linux?

Just the current cpu hotplug path.

>> It is better than nothing but it certainly is not something
>> I would expect to work all of the time.
>>
> Because of horrible code or non-complaint ioapic implementation out
> there? If later this is not a big issue for KVM.

Both. And even in spec ioapics can't be made to work 100% reliably.

>> Interrupt remapping is the one case where we have hardware
>> that works according to spec and that works reasonably well.
>>
> I am sure when there was only one ioapic implementation in existence
> it worked according to a spec (and if not spec was changed :)) Give
> interrupt remapping some time and than we will see how above statement
> holds.

What we need for cpu hotplug/unplug is on the tested path now.
That is a significant difference.

Eric

Avi Kivity

unread,
Jul 1, 2009, 4:30:14 AM7/1/09
to
On 06/30/2009 10:36 PM, Eric W. Biederman wrote:
>>> The short version is I don't know what work arounds we will ultimately
>>> decide to deploy to work with real hardware.
>>>
>>> I have been seriously contemplating causing a cpu hot-unplug request
>>> to fail if we are in ioapic mode and we have irqs routed to the cpu
>>> that is being unplugged.
>>>
>>>
>> Well, obviously we need to disassociate any irqs from such a cpu. Could be done
>> from the kernel or only enforced by the kernel.
>>
>
> Using the normal irq migration path we can move irqs off of a cpu reliably
> there just aren't any progress guarantees.
>

Program the ioapic to the new cpu. Wait a few milliseconds. If it
takes more than that to get an interrupt from the ioapic to the local
apic, the machine has much bigger problems.

>>> Even with perfectly working hardware it is not possible in the general
>>> case to migrate an ioapic irq from one cpu to another outside of an
>>> interrupt handler without without risking dropping an interrupt.
>>>
>>>
>> Can't you generate a spurious interrupt immediately after the migration? An
>> extra interrupt shouldn't hurt.
>>
>
> Nope. The ioapics can't be told to send an interrupt.
>

You can program the local apic ICR to generate an interrupt with the
same vector.

>>> There is no general way to know you have seen the last interrupt
>>> floating around your system. PCI ordering rules don't help because
>>> the ioapics can potentially take an out of band channel.
>>>
>>>
>> Can you describe the problem scenario? an ioapic->lapic message delivered to a
>> dead cpu?
>>
>
> Dropped irqs.. Driver hangs because it is waiting for an irq. Hardware
> hangs because it is waiting for the cpu to process the irq.
>
> Potentially we get a level triggered irq that is never acked by
> the cpu that won't arm until the cpu send an ack, and we can't
> send an ack from another cpu.
>
>

I think a spurious interrupt generated through the local apic solves
that problem. For level-triggered interrupts, mask them before
offlining the cpu.

--
error compiling committee.c: too many arguments to function

Eric W. Biederman

unread,
Jul 1, 2009, 9:40:12 AM7/1/09
to
Avi Kivity <a...@redhat.com> writes:

> On 06/30/2009 10:36 PM, Eric W. Biederman wrote:
>>>> The short version is I don't know what work arounds we will ultimately
>>>> decide to deploy to work with real hardware.
>>>>
>>>> I have been seriously contemplating causing a cpu hot-unplug request
>>>> to fail if we are in ioapic mode and we have irqs routed to the cpu
>>>> that is being unplugged.
>>>>
>>>>
>>> Well, obviously we need to disassociate any irqs from such a cpu. Could be done
>>> from the kernel or only enforced by the kernel.
>>>
>>
>> Using the normal irq migration path we can move irqs off of a cpu reliably
>> there just aren't any progress guarantees.
>>
>
> Program the ioapic to the new cpu. Wait a few milliseconds. If it takes more
> than that to get an interrupt from the ioapic to the local apic, the machine has
> much bigger problems.

In general you can not reprogram an ioapic safely unless the interrupt
is blocked at the source. Which is why you either need the originating
device disabled or you have to do it in interrupt context.

I forget all of the details. I just know in real hardware I experimented
with it a lot, and wound up hanging the ioapic state machine of both
intel and amd ioapics.

Migrating ioapic irqs in interrupt context sucks. It just happens to
be what works reliably.

I do think the wait an eternity in computer time a short while in human
time is a valid technique when you can do nothing better. If flushing the
interrupt was my only problem that would solve it.

>>>> Even with perfectly working hardware it is not possible in the general
>>>> case to migrate an ioapic irq from one cpu to another outside of an
>>>> interrupt handler without without risking dropping an interrupt.
>>>>
>>>>
>>> Can't you generate a spurious interrupt immediately after the migration? An
>>> extra interrupt shouldn't hurt.
>>>
>>
>> Nope. The ioapics can't be told to send an interrupt.
>>
>
> You can program the local apic ICR to generate an interrupt with the same
> vector.

But you can not program the apic ICR to generate a level triggered
interrupt with the same vector. So you don't get the broadcast
behavior when you ack the apic.

>>>> There is no general way to know you have seen the last interrupt
>>>> floating around your system. PCI ordering rules don't help because
>>>> the ioapics can potentially take an out of band channel.
>>>>
>>>>
>>> Can you describe the problem scenario? an ioapic->lapic message delivered to a
>>> dead cpu?
>>>
>>
>> Dropped irqs.. Driver hangs because it is waiting for an irq. Hardware
>> hangs because it is waiting for the cpu to process the irq.
>>
>> Potentially we get a level triggered irq that is never acked by
>> the cpu that won't arm until the cpu send an ack, and we can't
>> send an ack from another cpu.
>>
>>
>
> I think a spurious interrupt generated through the local apic solves that
> problem. For level-triggered interrupts, mask them before offlining the cpu.

So now we have a masked unacked irq. It doesn't help in the slightest
that the cpu migration code puts irq migration last and request that we
do it all with interrupts disabled.

You might be right that by application of extreme ingenuity and completely
in spec ioapics there is a path that makes this all work. Currently I
don't have fully in spec ioapcis, and I don't have anyone interested enough
in cpu hotplug to be willing to rip things apart until interrupt migration
is a reasonable deal on x86. Instead all I see are patches that mitigate
the worst of the brokenness.

At the same time with the interrupt remapping hardware because it doesn't
need the irq disabled at the source when we reprogram it I can make
everything stable much more easily.

Eric

Gleb Natapov

unread,
Jul 1, 2009, 9:40:13 AM7/1/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
---

This version does not rely on x2apic_preenabled to determine max APIC id
since this logic will break after kexec. Use max_physical_apicid instead.
Also allow to override apic ops after call to enable_IR_x2apic() since
we may decide to use physical mode instead of cluster.

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042..db0c07f 100644


--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,6 +49,7 @@

#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/kvm_para.h>

unsigned int num_processors;

@@ -1363,52 +1364,74 @@ void enable_x2apic(void)


}
#endif /* CONFIG_X86_X2APIC */

-void __init enable_IR_x2apic(void)
+int __init enable_IR(void)

{


#ifdef CONFIG_INTR_REMAP
int ret;
- unsigned long flags;

- struct IO_APIC_route_entry **ioapic_entries = NULL;



ret = dmar_table_init();
if (ret) {

pr_debug("dmar_table_init() failed with %d:\n", ret);

- goto ir_failed;


+ return 0;
}

if (!intr_remapping_supported()) {

pr_debug("intr-remapping not supported\n");

- goto ir_failed;


+ return 0;
}

-
if (!x2apic_preenabled && skip_ioapic_setup) {

pr_info("Skipped enabling intr-remap because of skipping "

"io-apic setup\n");
- return;


+ return 0;
}

+ if (enable_intr_remapping(x2apic_supported()))
+ return 0;
+

+ pr_info("Enabled Interrupt-remapping\n");
+
+ return 1;
+
+#endif
+ return 0;


+}
+
+void __init enable_IR_x2apic(void)
+{

+ unsigned long flags;
+ struct IO_APIC_route_entry **ioapic_entries = NULL;
+ int ret, x2apic_enabled = 0;
+


ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {

- pr_info("Allocate ioapic_entries failed: %d\n", ret);
- goto end;
+ pr_info("Allocate ioapic_entries failed\n");
+ goto out;


}

ret = save_IO_APIC_setup(ioapic_entries);
if (ret) {

pr_info("Saving IO-APIC state failed: %d\n", ret);

- goto end;


+ goto out;
}

local_irq_save(flags);
- mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
+ mask_IO_APIC_setup(ioapic_entries);

- ret = enable_intr_remapping(x2apic_supported());
- if (ret)
- goto end_restore;
+ ret = enable_IR();
+ if (!ret) {

+ /* IR is required if x2apic is enabled by BIOS even
+ * when running in kvm since this indicates present
+ * of APIC ID > 255 */
+ if (max_physical_apicid > 255 || !kvm_para_available())
+ goto nox2apic;


+ /* without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode */
+ x2apic_phys = 1;
+ }

- pr_info("Enabled Interrupt-remapping\n");
+ x2apic_enabled = 1;

if (x2apic_supported() && !x2apic_mode) {
x2apic_mode = 1;

@@ -1416,41 +1439,30 @@ void __init enable_IR_x2apic(void)

diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index bc3e880..f3b1037 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,11 +50,11 @@ static struct apic *apic_probe[] __initdata = {
void __init default_setup_apic_routing(void)
{
#ifdef CONFIG_X86_X2APIC
- if (x2apic_mode && (apic != &apic_x2apic_phys &&
+ if (x2apic_mode
#ifdef CONFIG_X86_UV
- apic != &apic_x2apic_uv_x &&
+ && apic != &apic_x2apic_uv_x
#endif
- apic != &apic_x2apic_cluster)) {
+ ) {
if (x2apic_phys)
apic = &apic_x2apic_phys;
else
--
Gleb.

Suresh Siddha

unread,
Jul 1, 2009, 5:10:08 PM7/1/09
to
On Wed, 2009-07-01 at 06:30 -0700, Gleb Natapov wrote:
> KVM would like to provide x2APIC interface to a guest without emulating
> interrupt remapping device. The reason KVM prefers guest to use x2APIC
> is that x2APIC interface is better virtualizable and provides better
> performance than mmio xAPIC interface:
>
> - msr exits are faster than mmio (no page table walk, emulation)
> - no need to read back ICR to look at the busy bit
> - one 64 bit ICR write instead of two 32 bit writes
> - shared code with the Hyper-V paravirt interface
>
> Included patch changes x2APIC enabling logic to enable it even if IR
> initialization failed, but kernel runs under KVM and no apic id is
> greater than 255 (if there is one spec requires BIOS to move to x2apic
> mode before starting an OS).
>
> Signed-off-by: Gleb Natapov <gl...@redhat.com>

Acked-by: Suresh Siddha <suresh....@intel.com>

Suresh Siddha

unread,
Jul 1, 2009, 7:00:15 PM7/1/09
to
On Tue, 2009-06-30 at 12:36 -0700, Eric W. Biederman wrote:
> Dropped irqs.. Driver hangs because it is waiting for an irq. Hardware
> hangs because it is waiting for the cpu to process the irq.
>
> Potentially we get a level triggered irq that is never acked by
> the cpu that won't arm until the cpu send an ack, and we can't
> send an ack from another cpu.

Eric,

Among number of experiments you have tried in the past to fix this, have
you tried the experiment of explicitly clearing the remoteIRR by
changing the trigger mode to edge and then back to level.

Is there a problem with this?

We can send a spurious IPI (after the RTE migration) with the new vector
to another cpu and handler which services the level interrupt will check
if we saw the edge mode for a level interrupt and then the handler can
explicitly restore the level trigger and reset the remote IRR by mask
+edge and unmask+level.

We might have to work with some rough edges but do you recollect any
major issue with this approach..

thanks,
suresh

Eric W. Biederman

unread,
Jul 1, 2009, 8:20:04 PM7/1/09
to
Suresh Siddha <suresh....@intel.com> writes:

> On Tue, 2009-06-30 at 12:36 -0700, Eric W. Biederman wrote:
>> Dropped irqs.. Driver hangs because it is waiting for an irq. Hardware
>> hangs because it is waiting for the cpu to process the irq.
>>
>> Potentially we get a level triggered irq that is never acked by
>> the cpu that won't arm until the cpu send an ack, and we can't
>> send an ack from another cpu.
>
> Eric,
>
> Among number of experiments you have tried in the past to fix this, have
> you tried the experiment of explicitly clearing the remoteIRR by
> changing the trigger mode to edge and then back to level.
>
> Is there a problem with this?

The problem I had wasn't remoteIRR getting stuck, but the symptoms
were largely the same. I did try changing the trigger mode to edge
and back and that did not unstick the ioapic in all cases.

> We can send a spurious IPI (after the RTE migration) with the new vector
> to another cpu and handler which services the level interrupt will check
> if we saw the edge mode for a level interrupt and then the handler can
> explicitly restore the level trigger and reset the remote IRR by mask
> +edge and unmask+level.
>
> We might have to work with some rough edges but do you recollect any
> major issue with this approach..

This is coming up enough recently I expect it is time to cook up
a patch that does the ioapic migration in process context plus
some user space code that stress tests things. Just so people
can repeat my experiments and see what I am trying to avoid.

Eric

Ingo Molnar

unread,
Jul 3, 2009, 4:30:20 AM7/3/09
to

* Suresh Siddha <suresh....@intel.com> wrote:

> On Wed, 2009-07-01 at 06:30 -0700, Gleb Natapov wrote:
> > KVM would like to provide x2APIC interface to a guest without emulating
> > interrupt remapping device. The reason KVM prefers guest to use x2APIC
> > is that x2APIC interface is better virtualizable and provides better
> > performance than mmio xAPIC interface:
> >
> > - msr exits are faster than mmio (no page table walk, emulation)
> > - no need to read back ICR to look at the busy bit
> > - one 64 bit ICR write instead of two 32 bit writes
> > - shared code with the Hyper-V paravirt interface
> >
> > Included patch changes x2APIC enabling logic to enable it even if IR
> > initialization failed, but kernel runs under KVM and no apic id is
> > greater than 255 (if there is one spec requires BIOS to move to x2apic
> > mode before starting an OS).
> >
> > Signed-off-by: Gleb Natapov <gl...@redhat.com>
>
> Acked-by: Suresh Siddha <suresh....@intel.com>

Now, since this affects core x86 APIC code non-trivially so should
submitted to and go via the x86 tree. (Can prepare a special branch
with just this change if KVM tree wants/needs to pull it before
v2.6.32.)

Ingo

Eric W. Biederman

unread,
Jul 4, 2009, 5:40:08 AM7/4/09
to
Ingo Molnar <mi...@elte.hu> writes:

> * Suresh Siddha <suresh....@intel.com> wrote:
>
>> On Wed, 2009-07-01 at 06:30 -0700, Gleb Natapov wrote:
>> > KVM would like to provide x2APIC interface to a guest without emulating
>> > interrupt remapping device. The reason KVM prefers guest to use x2APIC
>> > is that x2APIC interface is better virtualizable and provides better
>> > performance than mmio xAPIC interface:
>> >
>> > - msr exits are faster than mmio (no page table walk, emulation)
>> > - no need to read back ICR to look at the busy bit
>> > - one 64 bit ICR write instead of two 32 bit writes
>> > - shared code with the Hyper-V paravirt interface
>> >
>> > Included patch changes x2APIC enabling logic to enable it even if IR
>> > initialization failed, but kernel runs under KVM and no apic id is
>> > greater than 255 (if there is one spec requires BIOS to move to x2apic
>> > mode before starting an OS).
>> >
>> > Signed-off-by: Gleb Natapov <gl...@redhat.com>
>>
>> Acked-by: Suresh Siddha <suresh....@intel.com>
>
> Now, since this affects core x86 APIC code non-trivially so should
> submitted to and go via the x86 tree. (Can prepare a special branch
> with just this change if KVM tree wants/needs to pull it before
> v2.6.32.)

Please don't separate the x2apic code from the dmar code for this
reason.

Supporting hotplug cpus with ioapics is torture.

Eric

Gleb Natapov

unread,
Jul 4, 2009, 6:00:26 AM7/4/09
to
What is the connection between this patch and cpu hotplug?

--
Gleb.

Eric W. Biederman

unread,
Jul 4, 2009, 10:40:10 AM7/4/09
to
Gleb Natapov <gl...@redhat.com> writes:

When I asked if cpu hotplug was a supported and more or
less common feature of kvm I was told it was.

Good cpu hotplug today means supporting interrupt remapping.
(The code you are disabling for kvm).

Therefore I don't see the point of supporting one without the other.
Especially if we don't have a case where on real hardware we need to
split the support.

Eric

Avi Kivity

unread,
Jul 4, 2009, 11:30:12 AM7/4/09
to
On 07/03/2009 11:29 AM, Ingo Molnar wrote:
> Now, since this affects core x86 APIC code non-trivially so should
> submitted to and go via the x86 tree. (Can prepare a special branch
> with just this change if KVM tree wants/needs to pull it before
> v2.6.32.)
>

The kvm tree won't depend on this change. I'm happy with it going into
2.6.32 via tip.git.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

Gleb Natapov

unread,
Jul 4, 2009, 12:00:20 PM7/4/09
to
It is supported in a sense that cpus can be created dynamically
and DSDT has objects to inform OS that a cpu state changed. I don't
know how common it is. I surely don't use it daily. But 99% of the real
work is done by a guest.

> Good cpu hotplug today means supporting interrupt remapping.
> (The code you are disabling for kvm).

From you previous mails I understand that the problem is no with cpu
hotplug, but with cpu offlining and this is the guest feature that
cannot be controlled by KVM in any way. If you think that Linux should
not support cpu offlining with ioapics fine, send a patch to disable it
and cpu hot-unplug will not be supported by KVM with Linux guests, but
note that KVM ioapic has no problems that you've mentioned and HW
makers that provide x86 hardware may be using more sophisticated
ioapics that you've tested (I don't know never saw such hardware).

>
> Therefore I don't see the point of supporting one without the other.

x2apic provide us with other benefits as commit message explains, and
doesn't add any problems that we don't have now already.

> Especially if we don't have a case where on real hardware we need to
> split the support.
>

--
Gleb.

Eric W. Biederman

unread,
Jul 4, 2009, 8:30:11 PM7/4/09
to
Gleb Natapov <gl...@redhat.com> writes:

>> Therefore I don't see the point of supporting one without the other.
> x2apic provide us with other benefits as commit message explains, and
> doesn't add any problems that we don't have now already.

If this code has a legitimate place on real hardware I am all for it.

If this is just a hack to make virtualization faster I don't like the
extra code paths in the middle core architecture code. That will
be a support burden for the foreseeable future. More code to
test etc.

Quickly skimming the patch it just appears to stir a mess.
Plus it adds weird paravirtualization checks, ???

If we are going to have a special code path for virtual hardware
can we do it right and have something nice to use that makes life
simpler? For what we want to do with ioapics they suck and are
really not suitable. The only thing that recommends them is that
they are standard. But you are deviating from the standard so
what is the point.

Eric

Avi Kivity

unread,
Jul 5, 2009, 1:30:12 AM7/5/09
to
On 07/05/2009 03:22 AM, Eric W. Biederman wrote:
> Gleb Natapov<gl...@redhat.com> writes:
>
>
>>> Therefore I don't see the point of supporting one without the other.
>>>
>> x2apic provide us with other benefits as commit message explains, and
>> doesn't add any problems that we don't have now already.
>>
>
> If this code has a legitimate place on real hardware I am all for it.
>

As I understood it, x2apic without interrupt remapping will work but is
not a validated configuration. Interrupt remapping is only necessary if
you have > 255 hardware threads + ioapics. The features are logically
separate and are only tied together by the vendor's validation practices.


> If this is just a hack to make virtualization faster I don't like the
> extra code paths in the middle core architecture code. That will
> be a support burden for the foreseeable future. More code to
> test etc.
>

There aren't any extra code paths. The patch separates a long function
into two smaller ones that each do one thing, and adds a check for kvm.

Maybe it should be split into two to makes that clear. The first patch
simplifies the code, the second adds a kvm check.

> Quickly skimming the patch it just appears to stir a mess.
> Plus it adds weird paravirtualization checks, ???
>

It adds exactly one "weird paravirtualization check ???", then one
described in the patch description.

> If we are going to have a special code path for virtual hardware
> can we do it right and have something nice to use that makes life
> simpler?

You mean, instead of adding one check in an initialization code path,
create a new irqchip, a way of describing the topology to the guest,
support code in kvm (as host)?

> For what we want to do with ioapics they suck and are
> really not suitable. The only thing that recommends them is that
> they are standard. But you are deviating from the standard so
> what is the point.
>

All of the code continues to work.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

--

Gleb Natapov

unread,
Jul 5, 2009, 10:40:14 AM7/5/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
Acked-by: Suresh Siddha <suresh....@intel.com>

---

This is the same as v5 only rebased on x86 tree (a74d2cea).

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0a1c283..9e0b37a 100644


--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,6 +49,7 @@
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/kvm_para.h>

unsigned int num_processors;

@@ -1361,52 +1362,74 @@ void enable_x2apic(void)

@@ -1414,41 +1437,30 @@ void __init enable_IR_x2apic(void)

Suresh Siddha

unread,
Jul 7, 2009, 12:20:19 PM7/7/09
to
On Wed, 2009-07-01 at 17:17 -0700, Eric W. Biederman wrote:

> Suresh Siddha <suresh....@intel.com> writes:
> > Among number of experiments you have tried in the past to fix this, have
> > you tried the experiment of explicitly clearing the remoteIRR by
> > changing the trigger mode to edge and then back to level.
> >
> > Is there a problem with this?
>
> The problem I had wasn't remoteIRR getting stuck, but the symptoms
> were largely the same. I did try changing the trigger mode to edge
> and back and that did not unstick the ioapic in all cases.
>

I did some experiments locally here yesterday and on my old ICH5 based
system, I couldn't reset the remoteIRR by changing the trigger mode to


edge and then back to level.

However what worked was an explicit eoi to the io-apic using the
respective vector.

I guess we need to try both the things based on perhaps io-apic version
etc.

But what I am nervous about is, did you try both these things aswell and
still saw stuck interrupts?

I will cleanup my code and post it, so that we can try on different
systems. If this still doesn't work on certain HW platforms, atleast our
experiments of what works and what doesn't work and on what platforms
will be documented on the web.

thanks,
suresh

Ingo Molnar

unread,
Jul 10, 2009, 10:00:20 AM7/10/09
to

* Gleb Natapov <gl...@redhat.com> wrote:

> KVM would like to provide x2APIC interface to a guest without emulating
> interrupt remapping device. The reason KVM prefers guest to use x2APIC
> is that x2APIC interface is better virtualizable and provides better
> performance than mmio xAPIC interface:
>
> - msr exits are faster than mmio (no page table walk, emulation)
> - no need to read back ICR to look at the busy bit
> - one 64 bit ICR write instead of two 32 bit writes
> - shared code with the Hyper-V paravirt interface
>
> Included patch changes x2APIC enabling logic to enable it even if IR
> initialization failed, but kernel runs under KVM and no apic id is
> greater than 255 (if there is one spec requires BIOS to move to x2apic
> mode before starting an OS).
>
> Signed-off-by: Gleb Natapov <gl...@redhat.com>
> Acked-by: Suresh Siddha <suresh....@intel.com>
> ---
>
> This is the same as v5 only rebased on x86 tree (a74d2cea).

Looks good.

Small detail: please run the patch through scripts/checkpatch.pl and
fix the complaints it has - all 3 details it points out seem like
valid complaints to me (at a quick glance) that should be fixed.

Thanks,

Ingo

Eric W. Biederman

unread,
Jul 10, 2009, 4:40:10 PM7/10/09
to
Suresh Siddha <suresh....@intel.com> writes:

> On Wed, 2009-07-01 at 17:17 -0700, Eric W. Biederman wrote:
>> Suresh Siddha <suresh....@intel.com> writes:
>> > Among number of experiments you have tried in the past to fix this, have
>> > you tried the experiment of explicitly clearing the remoteIRR by
>> > changing the trigger mode to edge and then back to level.
>> >
>> > Is there a problem with this?
>>
>> The problem I had wasn't remoteIRR getting stuck, but the symptoms
>> were largely the same. I did try changing the trigger mode to edge
>> and back and that did not unstick the ioapic in all cases.
>>
>
> I did some experiments locally here yesterday and on my old ICH5 based
> system, I couldn't reset the remoteIRR by changing the trigger mode to
> edge and then back to level.
>
> However what worked was an explicit eoi to the io-apic using the
> respective vector.
>
> I guess we need to try both the things based on perhaps io-apic version
> etc.

In part the deep problems I ran into was something other than RemoteIRR.
If it was something as simple as the ioapic being in a documented state
I would have kept looking. At the very least the RemoteIRR bit was not
set in some of the cases I encountered.

> But what I am nervous about is, did you try both these things aswell and
> still saw stuck interrupts?

Yes. The work arounds were in the code so I tried them.

> I will cleanup my code and post it, so that we can try on different
> systems. If this still doesn't work on certain HW platforms, atleast our
> experiments of what works and what doesn't work and on what platforms
> will be documented on the web.

Sounds reasonable. My apologies for the long delayed reply. I want
to work on this but I have higher priorities.

Eric

Gleb Natapov

unread,
Jul 12, 2009, 8:10:10 AM7/12/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
Acked-by: Suresh Siddha <suresh....@intel.com>
---

This is based on x86 tree commit 5b9eea3e7 with checkpatch complains
fixed.

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0a1c283..6e67916 100644

Ingo Molnar

unread,
Jul 18, 2009, 10:10:12 AM7/18/09
to

i've only got some small stylistic gripes:

> +void __init enable_IR_x2apic(void)
> +{
> + unsigned long flags;
> + struct IO_APIC_route_entry **ioapic_entries = NULL;
> + int ret, x2apic_enabled = 0;
> +
> ioapic_entries = alloc_ioapic_entries();
> if (!ioapic_entries) {
> - pr_info("Allocate ioapic_entries failed: %d\n", ret);
> - goto end;
> + pr_info("Allocate ioapic_entries failed\n");
> + goto out;

That's probably a fatal condition so perhaps a WARN() or pr_err()
would be more appropriate.

> + ret = enable_IR();
> + if (!ret) {
> + /* IR is required if x2apic is enabled by BIOS even
> + * when running in kvm since this indicates present
> + * of APIC ID > 255 */

please use the customary (multi-line) comment style:

/*
* Comment .....
* ...... goes here.
*/

specified in Documentation/CodingStyle.

> + if (max_physical_apicid > 255 || !kvm_para_available())
> + goto nox2apic;
> + /* without IR all CPUs can be addressed by IOAPIC/MSI
> + * only in physical mode */

ditto.

can this #ifdef be avoided?

> -
> - return;
> + } else if (cpu_has_x2apic)
> + pr_info("Not enabling x2apic,Intr-remapping\n");

please put a space after commas.

Thanks,

Ingo

Gleb Natapov

unread,
Jul 19, 2009, 7:50:09 AM7/19/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
Acked-by: Suresh Siddha <suresh....@intel.com>
---

This is based on x86 tree commit d7ccd190e5107 with Ingo comments
addressed (comment formatting, print error on alloc failure).

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0a1c283..a010af4 100644


--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,6 +49,7 @@
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/kvm_para.h>

unsigned int num_processors;

@@ -1361,52 +1362,76 @@ void enable_x2apic(void)

+ pr_info("Enabled Interrupt-remapping\n");
+


+ return 1;
+
+#endif
+ return 0;
+}
+

+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+ struct IO_APIC_route_entry **ioapic_entries = NULL;
+ int ret, x2apic_enabled = 0;
+
ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
- pr_info("Allocate ioapic_entries failed: %d\n", ret);
- goto end;

+ pr_err("Allocate ioapic_entries failed\n");
+ goto out;
}


ret = save_IO_APIC_setup(ioapic_entries);
if (ret) {

pr_info("Saving IO-APIC state failed: %d\n", ret);
- goto end;


+ goto out;
}

local_irq_save(flags);
- mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
+ mask_IO_APIC_setup(ioapic_entries);

- ret = enable_intr_remapping(x2apic_supported());
- if (ret)
- goto end_restore;

+ ret = enable_IR();
+ if (!ret) {

+ /* IR is required if there is APIC ID > 255 even when running
+ * under KVM
+ */


+ if (max_physical_apicid > 255 || !kvm_para_available())
+ goto nox2apic;
+ /*

+ * without IR all CPUs can be addressed by IOAPIC/MSI


+ * only in physical mode

+ */


+ x2apic_phys = 1;
+ }

- pr_info("Enabled Interrupt-remapping\n");
+ x2apic_enabled = 1;

if (x2apic_supported() && !x2apic_mode) {
x2apic_mode = 1;

@@ -1414,41 +1439,27 @@ void __init enable_IR_x2apic(void)


pr_info("Enabled x2apic\n");
}

-end_restore:
- if (ret)
- /*
- * IR enabling failed
- */
+nox2apic:
+ if (!ret) /* IR enabling failed */
restore_IO_APIC_setup(ioapic_entries);
-
unmask_8259A();
local_irq_restore(flags);

-end:
+out:
if (ioapic_entries)
free_ioapic_entries(ioapic_entries);

- if (!ret)
+ if (x2apic_enabled)
return;

-ir_failed:
- if (x2apic_preenabled)

- panic("x2apic enabled by bios. But IR enabling failed");


- else if (cpu_has_x2apic)
- pr_info("Not enabling x2apic,Intr-remapping\n");

-#else


- if (!cpu_has_x2apic)
- return;
-
- if (x2apic_preenabled)

- panic("x2apic enabled prior OS handover,"


- " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");

-#endif
-
- return;
+ if (x2apic_preenabled) {
+ panic("x2apic enabled by bios. But IR enabling failed."
+ " Check that CONFIG_X86_X2APIC and CONFIG_INTR_REMAP are"
+ " enabled.");
+ } else if (cpu_has_x2apic)
+ pr_info("Not enabling x2apic, Intr-remapping\n");


}

-
#ifdef CONFIG_X86_64
/*
* Detect and enable local APICs on non-SMP boards.
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index bc3e880..f3b1037 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,11 +50,11 @@ static struct apic *apic_probe[] __initdata = {
void __init default_setup_apic_routing(void)
{
#ifdef CONFIG_X86_X2APIC
- if (x2apic_mode && (apic != &apic_x2apic_phys &&
+ if (x2apic_mode
#ifdef CONFIG_X86_UV
- apic != &apic_x2apic_uv_x &&
+ && apic != &apic_x2apic_uv_x
#endif
- apic != &apic_x2apic_cluster)) {
+ ) {
if (x2apic_phys)
apic = &apic_x2apic_phys;
else
--
Gleb.

Ingo Molnar

unread,
Jul 19, 2009, 12:30:15 PM7/19/09
to

* Gleb Natapov <gl...@redhat.com> wrote:

> + if (!ret) {
> + /* IR is required if there is APIC ID > 255 even when running
> + * under KVM
> + */

Eeek.

> + if (x2apic_preenabled) {
> + panic("x2apic enabled by bios. But IR enabling failed."
> + " Check that CONFIG_X86_X2APIC and CONFIG_INTR_REMAP are"
> + " enabled.");

We dont want to break such messages mid-line, it makes it hard to
git-grep for them. Also, the '. But' looks weird.

Ingo

Gleb Natapov

unread,
Jul 19, 2009, 12:50:08 PM7/19/09
to
On Sun, Jul 19, 2009 at 06:27:35PM +0200, Ingo Molnar wrote:
>
> * Gleb Natapov <gl...@redhat.com> wrote:
>
> > + if (!ret) {
> > + /* IR is required if there is APIC ID > 255 even when running
> > + * under KVM
> > + */
>
> Eeek.
>
> > + if (x2apic_preenabled) {
> > + panic("x2apic enabled by bios. But IR enabling failed."
> > + " Check that CONFIG_X86_X2APIC and CONFIG_INTR_REMAP are"
> > + " enabled.");
>
> We dont want to break such messages mid-line, it makes it hard to
> git-grep for them. Also, the '. But' looks weird.

So what should I do? Make really long line?

--
Gleb.

Ingo Molnar

unread,
Jul 19, 2009, 2:00:18 PM7/19/09
to

* Gleb Natapov <gl...@redhat.com> wrote:

> On Sun, Jul 19, 2009 at 06:27:35PM +0200, Ingo Molnar wrote:
> >
> > * Gleb Natapov <gl...@redhat.com> wrote:
> >
> > > + if (!ret) {
> > > + /* IR is required if there is APIC ID > 255 even when running
> > > + * under KVM
> > > + */
> >
> > Eeek.
> >
> > > + if (x2apic_preenabled) {
> > > + panic("x2apic enabled by bios. But IR enabling failed."
> > > + " Check that CONFIG_X86_X2APIC and CONFIG_INTR_REMAP are"
> > > + " enabled.");
> >
> > We dont want to break such messages mid-line, it makes it hard to
> > git-grep for them. Also, the '. But' looks weird.
>
> So what should I do? Make really long line?

i'd suggest short ones generally, like:

"x2apic: enabled by BIOS but kernel init failed."

also, referring to whether the config are enabled is a bit lame - we
are in the kernel so we should know whether they are enabled.

Ingo

Gleb Natapov

unread,
Jul 20, 2009, 8:30:17 AM7/20/09
to
KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
Acked-by: Suresh Siddha <suresh....@intel.com>
---

This is based on x86 tree commit ff78d570fae with Ingo comments
addressed (comment formatting, print error on alloc failure, shorter
error messages).

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0a1c283..a79e8a9 100644

+ if (!ret) {
+ /* IR is required if there is APIC ID > 255 even when running
+ * under KVM
+ */

+ if (max_physical_apicid > 255 || !kvm_para_available())
+ goto nox2apic;
+ /*
+ * without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode
+ */
+ x2apic_phys = 1;
+ }

- pr_info("Enabled Interrupt-remapping\n");
+ x2apic_enabled = 1;

if (x2apic_supported() && !x2apic_mode) {
x2apic_mode = 1;

@@ -1414,41 +1439,25 @@ void __init enable_IR_x2apic(void)


pr_info("Enabled x2apic\n");
}

-end_restore:
- if (ret)
- /*
- * IR enabling failed
- */
+nox2apic:
+ if (!ret) /* IR enabling failed */
restore_IO_APIC_setup(ioapic_entries);
-
unmask_8259A();
local_irq_restore(flags);

-end:
+out:
if (ioapic_entries)
free_ioapic_entries(ioapic_entries);

- if (!ret)
+ if (x2apic_enabled)
return;

-ir_failed:

if (x2apic_preenabled)
- panic("x2apic enabled by bios. But IR enabling failed");
+ panic("x2apic: enabled by BIOS but kernel init failed.");


else if (cpu_has_x2apic)
- pr_info("Not enabling x2apic,Intr-remapping\n");
-#else
- if (!cpu_has_x2apic)
- return;
-
- if (x2apic_preenabled)
- panic("x2apic enabled prior OS handover,"
- " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
-#endif
-
- return;

+ pr_info("Not enabling x2apic, Intr-remapping init failed.\n");


}

-
#ifdef CONFIG_X86_64
/*
* Detect and enable local APICs on non-SMP boards.
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index bc3e880..f3b1037 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,11 +50,11 @@ static struct apic *apic_probe[] __initdata = {
void __init default_setup_apic_routing(void)
{
#ifdef CONFIG_X86_X2APIC
- if (x2apic_mode && (apic != &apic_x2apic_phys &&
+ if (x2apic_mode
#ifdef CONFIG_X86_UV
- apic != &apic_x2apic_uv_x &&
+ && apic != &apic_x2apic_uv_x
#endif
- apic != &apic_x2apic_cluster)) {
+ ) {
if (x2apic_phys)
apic = &apic_x2apic_phys;
else
--
Gleb.

tip-bot for Gleb Natapov

unread,
Aug 4, 2009, 12:41:29 PM8/4/09
to
Commit-ID: ee6a28296de295d62a3d8fad8bab58a8a48d006f
Gitweb: http://git.kernel.org/tip/ee6a28296de295d62a3d8fad8bab58a8a48d006f
Author: Gleb Natapov <gl...@redhat.com>
AuthorDate: Mon, 20 Jul 2009 15:24:17 +0300
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Tue, 4 Aug 2009 14:44:10 +0200

x86/apic: Enable x2APIC without interrupt remapping under KVM

KVM would like to provide x2APIC interface to a guest without emulating
interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

Signed-off-by: Gleb Natapov <gl...@redhat.com>
Acked-by: Suresh Siddha <suresh....@intel.com>

Cc: Sheng Yang <sh...@linux.intel.com>
Cc: "a...@redhat.com" <a...@redhat.com>
LKML-Reference: <2009072012...@redhat.com>
Signed-off-by: Ingo Molnar <mi...@elte.hu>


---
arch/x86/kernel/apic/apic.c | 83 +++++++++++++++++++++-----------------
arch/x86/kernel/apic/probe_64.c | 6 +-
2 files changed, 49 insertions(+), 40 deletions(-)

Ingo Molnar

unread,
Aug 4, 2009, 12:41:34 PM8/4/09
to

* tip-bot for Gleb Natapov <gl...@redhat.com> wrote:

> Commit-ID: ee6a28296de295d62a3d8fad8bab58a8a48d006f
> Gitweb: http://git.kernel.org/tip/ee6a28296de295d62a3d8fad8bab58a8a48d006f
> Author: Gleb Natapov <gl...@redhat.com>
> AuthorDate: Mon, 20 Jul 2009 15:24:17 +0300
> Committer: Ingo Molnar <mi...@elte.hu>
> CommitDate: Tue, 4 Aug 2009 14:44:10 +0200
>
> x86/apic: Enable x2APIC without interrupt remapping under KVM

-tip testing found this build failure:

arch/x86/kernel/apic/apic.c: In function ‘enable_IR_x2apic’:
arch/x86/kernel/apic/apic.c:1431: error: ‘x2apic_phys’ undeclared (first use in this function)
arch/x86/kernel/apic/apic.c:1431: error: (Each undeclared identifier is reported only once
arch/x86/kernel/apic/apic.c:1431: error: for each function it appears in.)

with the attached config. (Please send a delta fix)

Ingo

config-Tue_Aug__4_14_43_56_CEST_2009.bad

Gleb Natapov

unread,
Aug 4, 2009, 1:00:25 PM8/4/09
to
On Tue, Aug 04, 2009 at 03:37:15PM +0200, Ingo Molnar wrote:
>
> * tip-bot for Gleb Natapov <gl...@redhat.com> wrote:
>
> > Commit-ID: ee6a28296de295d62a3d8fad8bab58a8a48d006f
> > Gitweb: http://git.kernel.org/tip/ee6a28296de295d62a3d8fad8bab58a8a48d006f
> > Author: Gleb Natapov <gl...@redhat.com>
> > AuthorDate: Mon, 20 Jul 2009 15:24:17 +0300
> > Committer: Ingo Molnar <mi...@elte.hu>
> > CommitDate: Tue, 4 Aug 2009 14:44:10 +0200
> >
> > x86/apic: Enable x2APIC without interrupt remapping under KVM
>
> -tip testing found this build failure:
>
> arch/x86/kernel/apic/apic.c: In function ן¿½enable_IR_x2apicן¿½:
> arch/x86/kernel/apic/apic.c:1431: error: ן¿½x2apic_physן¿½ undeclared (first use in this function)

> arch/x86/kernel/apic/apic.c:1431: error: (Each undeclared identifier is reported only once
> arch/x86/kernel/apic/apic.c:1431: error: for each function it appears in.)
>
> with the attached config. (Please send a delta fix)
>
> Ingo

Signed-off-by: Gleb Natapov <gl...@redhat.com>
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bb7d479..ebdbf54 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -183,6 +183,10 @@ static inline int x2apic_enabled(void)
}

#define x2apic_supported() (cpu_has_x2apic)
+static inline void x2apic_force_phys(void)
+{


+ x2apic_phys = 1;
+}

#else
static inline void check_x2apic(void)
{
@@ -194,6 +198,9 @@ static inline int x2apic_enabled(void)
{
return 0;
}
+static inline void x2apic_force_phys()
+{
+}

#define x2apic_preenabled 0
#define x2apic_supported() 0
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 1221f68..de039fc 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1428,7 +1428,7 @@ void __init enable_IR_x2apic(void)


* without IR all CPUs can be addressed by IOAPIC/MSI

* only in physical mode

*/
- x2apic_phys = 1;
+ x2apic_force_phys();
}

x2apic_enabled = 1;
--
Gleb.

tip-bot for Gleb Natapov

unread,
Aug 5, 2009, 7:00:13 AM8/5/09
to
Commit-ID: da5b16c8d0208eee0d0c9f427279363599ef7266
Gitweb: http://git.kernel.org/tip/da5b16c8d0208eee0d0c9f427279363599ef7266

Author: Gleb Natapov <gl...@redhat.com>
AuthorDate: Mon, 20 Jul 2009 15:24:17 +0300
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 5 Aug 2009 10:31:04 +0200

x86/apic: Enable x2APIC without interrupt remapping under KVM

KVM would like to provide x2APIC interface to a guest without emulating


interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

-v2: fix build

Signed-off-by: Gleb Natapov <gl...@redhat.com>
Acked-by: Suresh Siddha <suresh....@intel.com>

Cc: Sheng Yang <sh...@linux.intel.com>
Cc: "a...@redhat.com" <a...@redhat.com>
LKML-Reference: <2009072012...@redhat.com>
Signed-off-by: Ingo Molnar <mi...@elte.hu>


---
arch/x86/include/asm/apic.h | 7 +++


arch/x86/kernel/apic/apic.c | 83 +++++++++++++++++++++-----------------
arch/x86/kernel/apic/probe_64.c | 6 +-

3 files changed, 56 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bb7d479..ebdbf54 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -183,6 +183,10 @@ static inline int x2apic_enabled(void)
}

#define x2apic_supported() (cpu_has_x2apic)
+static inline void x2apic_force_phys(void)
+{

+ x2apic_phys = 1;
+}

#else
static inline void check_x2apic(void)
{
@@ -194,6 +198,9 @@ static inline int x2apic_enabled(void)
{
return 0;
}
+static inline void x2apic_force_phys()
+{
+}

#define x2apic_preenabled 0
#define x2apic_supported() 0

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0b021c5..de039fc 100644

+ x2apic_force_phys();

Ingo Molnar

unread,
Aug 5, 2009, 8:20:17 AM8/5/09
to

* Gleb Natapov <gl...@redhat.com> wrote:

> +static inline void x2apic_force_phys()
> +{
> +}

Sigh. Have you tested this patche and have you looked at the
resultig compiler output?

Ingo

Gleb Natapov

unread,
Aug 5, 2009, 8:30:16 AM8/5/09
to
On Wed, Aug 05, 2009 at 03:19:23PM +0300, Gleb Natapov wrote:

> On Wed, Aug 05, 2009 at 02:16:21PM +0200, Ingo Molnar wrote:
> >
> > * Gleb Natapov <gl...@redhat.com> wrote:
> >
> > > +static inline void x2apic_force_phys()
> > > +{
> > > +}
> >
> > Sigh. Have you tested this patche and have you looked at the
> > resultig compiler output?
> >
> I haven't looked and compiler output. What is wrong with it?
> Is defining this function as #define would be batter?
>
Actually I made it a define initially and tested with it. Later I change
it to be inline and apparently tested only with x2apic enabled. I see the
problem now. Do you want it to be define or properly defined empty inline
function like any other function in this code?

--
Gleb.

Gleb Natapov

unread,
Aug 5, 2009, 8:30:16 AM8/5/09
to
On Wed, Aug 05, 2009 at 02:16:21PM +0200, Ingo Molnar wrote:
>
> * Gleb Natapov <gl...@redhat.com> wrote:
>
> > +static inline void x2apic_force_phys()
> > +{
> > +}
>
> Sigh. Have you tested this patche and have you looked at the
> resultig compiler output?
>
I haven't looked and compiler output. What is wrong with it?
Is defining this function as #define would be batter?

--
Gleb.

tip-bot for Gleb Natapov

unread,
Aug 5, 2009, 8:40:09 AM8/5/09
to
Commit-ID: ce69a784504222c3ab6f1b3c357d09ec5772127a
Gitweb: http://git.kernel.org/tip/ce69a784504222c3ab6f1b3c357d09ec5772127a

Author: Gleb Natapov <gl...@redhat.com>
AuthorDate: Mon, 20 Jul 2009 15:24:17 +0300
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 5 Aug 2009 14:28:50 +0200

x86/apic: Enable x2APIC without interrupt remapping under KVM

KVM would like to provide x2APIC interface to a guest without emulating


interrupt remapping device. The reason KVM prefers guest to use x2APIC
is that x2APIC interface is better virtualizable and provides better
performance than mmio xAPIC interface:

- msr exits are faster than mmio (no page table walk, emulation)
- no need to read back ICR to look at the busy bit
- one 64 bit ICR write instead of two 32 bit writes
- shared code with the Hyper-V paravirt interface

Included patch changes x2APIC enabling logic to enable it even if IR
initialization failed, but kernel runs under KVM and no apic id is
greater than 255 (if there is one spec requires BIOS to move to x2apic
mode before starting an OS).

-v2: fix build
-v3: fix bug causing compiler warning

Signed-off-by: Gleb Natapov <gl...@redhat.com>
Acked-by: Suresh Siddha <suresh....@intel.com>

Cc: Sheng Yang <sh...@linux.intel.com>
Cc: "a...@redhat.com" <a...@redhat.com>
LKML-Reference: <2009072012...@redhat.com>
Signed-off-by: Ingo Molnar <mi...@elte.hu>


---
arch/x86/include/asm/apic.h | 7 +++
arch/x86/kernel/apic/apic.c | 83 +++++++++++++++++++++-----------------
arch/x86/kernel/apic/probe_64.c | 6 +-
3 files changed, 56 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bb7d479..586b7ad 100644


--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -183,6 +183,10 @@ static inline int x2apic_enabled(void)
}

#define x2apic_supported() (cpu_has_x2apic)
+static inline void x2apic_force_phys(void)
+{

+ x2apic_phys = 1;
+}

#else
static inline void check_x2apic(void)
{
@@ -194,6 +198,9 @@ static inline int x2apic_enabled(void)
{
return 0;
}

+static inline void x2apic_force_phys(void)


+{
+}

#define x2apic_preenabled 0
#define x2apic_supported() 0

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0b021c5..de039fc 100644

+ x2apic_force_phys();

0 new messages