When the SMP kernel decides to crash_kexec() the local APICs may have
pending interrupts in their vector tables.
The setup routine for the local APIC has a deficient mechanism for
clearing these interrupts, it only handles interrupts that has already
been dispatched to the local core for servicing (the ISR register)
safely, it doesn't consider lower prioritized queued interrupts stored
in the IRR register.
If you have more than one pending interrupt within the same 32 bit word
in the LAPIC vector table registers you may find yourself entering the
IO APIC setup with pending interrupts left in the LAPIC. This is a
situation for wich the IO APIC setup is not prepared. Depending of
what/which interrupt vector/vectors are stuck in the APIC tables your
system may show various degrees of malfunctioning.
That was the reason why the check_timer() failed in our system, the
timer interrupts was blocked by pending interrupts from the old kernel
when routed trough the IO APIC.
Additional comment from Jiri Bohac:
==============
If this should go into stable release,
I'd add some kind of limit on the number of iterations, just to be safe from
hard to debug lock-ups:
+if (loops++ > MAX_LOOPS) {
+ printk("LAPIC pending clean-up")
+ break;
+}
while (queued);
with MAX_LOOPS something like 1E9 this would leave plenty of time for the
pending IRQs to be cleared and would and still cause at most a second of delay
if the loop were to lock-up for whatever reason.
==============
From tr...@suse.de:
Merged Jiri suggestion into the patch.
Also made the max_loops depend on cpu_khz. Not sure how long an apic_read
takes, as it is on the CPU it may only be one cycle and we now wait 1 sec
in WARN_ON(..) case?
CC: jbo...@novell.com
CC: "Yinghai Lu" <yin...@kernel.org>
CC: ak...@linux-foundation.org
CC: mi...@elte.hu
CC: "Kerstin Jonsson" <kerstin...@ericsson.com>
Signed-off-by: Thomas Renninger <tr...@suse.de>
---
arch/x86/kernel/apic/apic.c | 34 +++++++++++++++++++++++++---------
1 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3987e44..912dd59 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/kvm_para.h>
+#include <asm/tsc.h>
unsigned int num_processors;
@@ -1151,8 +1152,8 @@ static void __cpuinit lapic_setup_esr(void)
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned int value;
- int i, j;
+ unsigned int value, queued;
+ int i, j, acked = 0, max_loops = cpu_khz * 1000;
if (disable_apic) {
arch_disable_smp_support();
@@ -1204,13 +1205,28 @@ void __cpuinit setup_local_APIC(void)
* the interrupt. Hence a vector might get locked. It was noticed
* for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
*/
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
- }
- }
+ do {
+ queued = 0;
+ for (i = APIC_ISR_NR - 1; i >= 0; i--)
+ queued |= apic_read(APIC_IRR + i*0x10);
+
+ for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+ value = apic_read(APIC_ISR + i*0x10);
+ for (j = 31; j >= 0; j--) {
+ if (value & (1<<j)) {
+ ack_APIC_irq();
+ acked++;
+ }
+ }
+ }
+ if (acked > 256) {
+ printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+ acked);
+ break;
+ }
+ max_loops--;
+ } while (queued && max_loops > 0);
+ WARN_ON(!max_loops);
/*
* Now that we are all set up, enable the APIC
--
1.6.3
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Thanks,
Thomas
An apic_read() can take a couple of microseconds when running
virtualized, so this loop may run for hours. On the other hand,
virtualized hardware is unlikely to misbehave.
Still I recommend using a clocksource (tsc would do) and not a loop count.
--
error compiling committee.c: too many arguments to function
It's possible (cpuid hypervisor bit), but I don't think it's a good
idea. Splitting up code paths doubles the chance of bugs. Much better
to find something that works both ways.
--
error compiling committee.c: too many arguments to function
--
From tr...@suse.de:
V2: Use tsc if avail to bail out after 1 sec due to possible virtual apic_read
calls which may take rather long (suggested by: Avi Kivity <a...@redhat.com>)
If no tsc is available bail out quickly after cpu_khz, if we broke out too
early and still have irqs pending (which should never happen?) we still
get a WARN_ON...
CC: jbo...@novell.com
CC: "Yinghai Lu" <yin...@kernel.org>
CC: ak...@linux-foundation.org
CC: mi...@elte.hu
CC: "Kerstin Jonsson" <kerstin...@ericsson.com>
CC: "Avi Kivity" <a...@redhat.com>
Signed-off-by: Thomas Renninger <tr...@suse.de>
---
arch/x86/kernel/apic/apic.c | 42 +++++++++++++++++++++++++++++++++---------
1 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3987e44..93cdb2a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/kvm_para.h>
+#include <asm/tsc.h>
unsigned int num_processors;
@@ -1151,8 +1152,12 @@ static void __cpuinit lapic_setup_esr(void)
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned int value;
- int i, j;
+ unsigned int value, queued;
+ int i, j, acked = 0;
+ unsigned long long tsc = 0, ntsc, max_loops = cpu_khz;
+
+ if (cpu_has_tsc)
+ rdtscll(ntsc);
if (disable_apic) {
arch_disable_smp_support();
@@ -1204,13 +1209,32 @@ void __cpuinit setup_local_APIC(void)
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ } while (queued && max_loops > 0);
+ WARN_ON(!max_loops);
/*
* Now that we are all set up, enable the APIC
--
1.6.3
--
Thanks,
Thomas
...
> + if (cpu_has_tsc) {
> + rdtscll(ntsc);
> + max_loops = (cpu_khz<< 10) - (ntsc - tsc);
>
Since max_loops is unsigned, this will always be positive.
> + } else
> + max_loops--;
> + } while (queued&& max_loops> 0);
> + WARN_ON(!max_loops);
>
So the loop never terminates unless queued becomes true.
--
error compiling committee.c: too many arguments to function
--
Perhaps rdtscll(tsc)?
Where is tsc modified? It remains tsc = 0 all the time?
Or I miss the snippet where it is set?
> + } else
> + max_loops--;
> + } while (queued && max_loops > 0);
> + WARN_ON(!max_loops);
>
> /*
> * Now that we are all set up, enable the APIC
> --
> 1.6.3
>
-- Cyrill
V3: - Fixed indentation -> checkpatch clean
- max_loops must be signed
CC: jbo...@novell.com
CC: "Yinghai Lu" <yin...@kernel.org>
CC: ak...@linux-foundation.org
CC: mi...@elte.hu
CC: "Kerstin Jonsson" <kerstin...@ericsson.com>
CC: "Avi Kivity" <a...@redhat.com>
Signed-off-by: Thomas Renninger <tr...@suse.de>
---
arch/x86/kernel/apic/apic.c | 41 +++++++++++++++++++++++++++++++++--------
1 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3987e44..badd661 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/kvm_para.h>
+#include <asm/tsc.h>
unsigned int num_processors;
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned int value;
- int i, j;
+ unsigned int value, queued;
+ int i, j, acked = 0;
+ unsigned long long tsc = 0, ntsc;
+ long long max_loops = cpu_khz;
+
+ if (cpu_has_tsc)
+ rdtscll(ntsc);
if (disable_apic) {
arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
* the interrupt. Hence a vector might get locked. It was noticed
* for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
*/
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
+ do {
+ queued = 0;
+ for (i = APIC_ISR_NR - 1; i >= 0; i--)
+ queued |= apic_read(APIC_IRR + i*0x10);
+
+ for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+ value = apic_read(APIC_ISR + i*0x10);
+ for (j = 31; j >= 0; j--) {
+ if (value & (1<<j)) {
+ ack_APIC_irq();
+ acked++;
+ }
+ }
}
- }
+ if (acked > 256) {
+ printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+ acked);
+ break;
+ }
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ } while (queued && max_loops > 0);
+ WARN_ON(!max_loops);
/*
* Now that we are all set up, enable the APIC
--
1.6.3
--
V4: - Fix typo, mixed up tsc and ntsc in first rdtscll() call
CC: jbo...@novell.com
CC: "Yinghai Lu" <yin...@kernel.org>
CC: ak...@linux-foundation.org
CC: mi...@elte.hu
CC: "Kerstin Jonsson" <kerstin...@ericsson.com>
CC: "Avi Kivity" <a...@redhat.com>
Signed-off-by: Thomas Renninger <tr...@suse.de>
---
arch/x86/kernel/apic/apic.c | 41 +++++++++++++++++++++++++++++++++--------
1 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3987e44..414a5df 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/kvm_para.h>
+#include <asm/tsc.h>
unsigned int num_processors;
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned int value;
- int i, j;
+ unsigned int value, queued;
+ int i, j, acked = 0;
+ unsigned long long tsc = 0, ntsc;
+ long long max_loops = cpu_khz;
+
+ if (cpu_has_tsc)
+ rdtscll(tsc);
if (disable_apic) {
arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
* the interrupt. Hence a vector might get locked. It was noticed
* for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
*/
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
+ do {
+ queued = 0;
+ for (i = APIC_ISR_NR - 1; i >= 0; i--)
+ queued |= apic_read(APIC_IRR + i*0x10);
+
+ for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+ value = apic_read(APIC_ISR + i*0x10);
+ for (j = 31; j >= 0; j--) {
+ if (value & (1<<j)) {
+ ack_APIC_irq();
+ acked++;
+ }
+ }
}
- }
+ if (acked > 256) {
+ printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+ acked);
+ break;
+ }
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ } while (queued && max_loops > 0);
+ WARN_ON(!max_loops);
/*
* Now that we are all set up, enable the APIC
--
1.6.3
--
> > + if (cpu_has_tsc) {
> > + rdtscll(ntsc);
> > + max_loops = (cpu_khz << 10) - (ntsc - tsc);
>
> Where is tsc modified? It remains tsc = 0 all the time?
> Or I miss the snippet where it is set?
Yes, you are right..., next version, thanks a lot for looking at this,
Thomas
I have verified the patch on target HW:
model name : Dual Core AMD Opteron(tm) Processor 165
cpu MHz : 1800.056
model name : Intel(R) Xeon(R) CPU L5408 @ 2.13GHz
cpu MHz : 2127.988
and in kvm:
(QEMU PC emulator version 0.10.6 (qemu-kvm-78.0.10.6-0.3.1))
hosted by a:
model name : Intel(R) Xeon(R) CPU E5405 @ 2.00GHz
cpu MHz : 1994.988
It still flushes multiple pending interrupts in the APIC tables -
i.e. my crash kernel boots up OK even when subjected to "ISR mayhem"
prior to crash.
If I force it to stay in the flush loop, it times out in approx. 1.02s in
all different target environments, close enough I'd say.
I do, however, have tsc support in all of them, had I not I'd probably
found it a bit tedious to wait for the kvm loop (if against all odds it
would get stuck) due to longer loop-time in kvm it would take ~100s to
perform (max_loops=cpu_khz) rounds. But then again, my host machine is
old, with better virtualization support in more modern machines and it
is an unlikely case, etc. I guess it won't really be a problem.
WARN_ON(!max_loops); max_loops< 0 will probably be the most common error exit in a system that has tsc...
V5: Adjust WARN_ON() condition to also catch error in cpu_has_tsc case
CC: jbo...@novell.com
CC: "Yinghai Lu" <yin...@kernel.org>
CC: ak...@linux-foundation.org
CC: mi...@elte.hu
CC: "Kerstin Jonsson" <kerstin...@ericsson.com>
CC: "Avi Kivity" <a...@redhat.com>
Signed-off-by: Thomas Renninger <tr...@suse.de>
---
arch/x86/kernel/apic/apic.c | 41 +++++++++++++++++++++++++++++++++--------
1 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 00187f1..cfcc87f 100644
- if (value & (1<<j))
- ack_APIC_irq();
+ do {
+ queued = 0;
+ for (i = APIC_ISR_NR - 1; i >= 0; i--)
+ queued |= apic_read(APIC_IRR + i*0x10);
+
+ for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+ value = apic_read(APIC_ISR + i*0x10);
+ for (j = 31; j >= 0; j--) {
+ if (value & (1<<j)) {
+ ack_APIC_irq();
+ acked++;
+ }
+ }
}
- }
+ if (acked > 256) {
+ printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+ acked);
+ break;
+ }
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ } while (queued && max_loops > 0);
+ WARN_ON(max_loops <= 0);
/*
* Now that we are all set up, enable the APIC
--
1.6.3
I have been seeing this and for some reason I thought I was dying
in calibrate_delay_loop(). But this is much later and much easier
to deal with. Since we make it to smp_init() there isn't any
good excuse for us to fail to come up.
I'm curious how much testing have you been able to do on this piece
of code?
Eric
> Andrew thanks for finding this. I have a test case for this that
> reproduces about every other time, and I will plug this patch in and
> see it helps. I'm not wild about how the max_loops variable is
> reused both as a timer and as a countdown timer, but the basic
> principle feels solid.
>
> I have been seeing this and for some reason I thought I was dying
> in calibrate_delay_loop(). But this is much later and much easier
> to deal with. Since we make it to smp_init() there isn't any
> good excuse for us to fail to come up.
>
> I'm curious how much testing have you been able to do on this piece
> of code?
This code definitely makes things better in my test case.
I had the patience to wait for 12 iterations and I was
expecting 6 failures and I saw none.
I have reservations about the timeout, but the rest of the patch
is definitely doing the right thing, and something is a lot better
than nothing.
Tested-by: "Eric W. Biederman" <ebie...@xmission.com>
Tested-by: "Kerstin Jonsson" <kerstin...@ericsson.com>
>>> - if (value& (1<<j))
>>> - ack_APIC_irq();
>>> + do {
>>> + queued = 0;
>>> + for (i = APIC_ISR_NR - 1; i>= 0; i--)
>>> + queued |= apic_read(APIC_IRR + i*0x10);
>>> +
>>> + for (i = APIC_ISR_NR - 1; i>= 0; i--) {
>>> + value = apic_read(APIC_ISR + i*0x10);
>>> + for (j = 31; j>= 0; j--) {
>>> + if (value& (1<<j)) {
>>> + ack_APIC_irq();
>>> + acked++;
>>> + }
>>> + }
>>> }
>>> - }
>>> + if (acked> 256) {
>>> + printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
>>> + acked);
>>> + break;
>>> + }
>>> + if (cpu_has_tsc) {
>>> + rdtscll(ntsc);
>>> + max_loops = (cpu_khz<< 10) - (ntsc - tsc);
>>> + } else
>>> + max_loops--;
>>> + } while (queued&& max_loops> 0);
Alright. In general I am in favor of loop limits when dealing with hardware
peculiarities. We should have all of the ioapics disabled from sending anything
at this point so we should be guaranteed not to loop, but occasionally things
don't happen as they should, so it seems like a healthy precaution.
Can't MRST install a quick ballpark value into cpu_khz?
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
> On 06/16/2010 02:11 PM, Pan, Jacob jun wrote:
>>
>> W.R.T. the loop limits, is it possible to use a default max_loops value in
>> case when cpu_khz is not set? The reason is that on Moorestown platform
>> we need to do an early APIC setup before tsc_init(), so cpu_khz is 0 at the
>> time we setup local APIC. The result is that we hit WARN_ON(max_loops<= 0)
>> on Moorestown for early APIC setup.
So you get a nasty warning but the system still boots?
>> The early APIC setup is needed because Moorestown does not have a PIT and the
>> system timer interrupts are routed via IOAPIC.
>>
>
> Can't MRST install a quick ballpark value into cpu_khz?
Looking at the code the initialization order in init/main.c is:
early_irq_init()
init_IRQ()
init_timers()
time_init()
tsc_init()
local_irq_enable()
So arguably if we simply switched those lines around we could make
this work, as you must be initializing the tsc with interrupts
disabled.
That said given the current ordering it appears that using the tsc
in setup_local_APIC is just broken because if it is properly called
from init_IRQ() the code doesn't work properly.
The question is what do we consider more important the current code
initialization ordering, or virtual processors having such an expensive
apic_read that we need a 1 second timeout.
I think the virtual processor concern is silly. Most of the time
this loop should execute exactly once after having confirmed there
is nothing to do. On a bad day this loop should execute at most twice.
If the vitalization is too expensive that this loop cannot execute
twice, bailing out early is a correctness concern.
I think we should set max_loops to 5. Leave the WARN_ON, and call it
good.
Does the following patch work cleanly on Moorestown?
Eric
From fc298618e87233de748c7a289f41bcc68ed8fc64 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman <ebie...@xmission.com>
Date: Wed, 16 Jun 2010 18:42:39 -0700
Subject: [PATCH] x86/apic: Don't use the tsc in setup_local_APIC
If everything is initialized in the order of init/main.c we should have:
init_IRQ()
setup_local_APIC()
time_init()
tsc_init()
local_irq_enable()
Which means the only reason the current use of the tsc in setup_local_APIC
works is because we are calling setup_local_APIC late on most x86
platforms.
The justification given for using a 1 second timeout on this loop
is because virtualized platforms have a very expensive apic_read().
Typically this loop should execute exactly once after confirming there
is nothing to do. On a bad day this loop should execute twice
after clearing the ISR and the IRR unacked irqs. If it is too
expensive to execute this loop twice on a virtualized platform
that is not our problem.
Let's set max_loops to 5. Which is more than enough and that
removes the need for messing with the tsc.
Signed-off-by: Eric W. Biederman <ebie...@xmission.com>
---
arch/x86/kernel/apic/apic.c | 13 ++-----------
1 files changed, 2 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c02cc69..d1a2b19 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,7 +51,6 @@
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/kvm_para.h>
-#include <asm/tsc.h>
unsigned int num_processors;
@@ -1154,11 +1153,7 @@ void __cpuinit setup_local_APIC(void)
{
unsigned int value, queued;
int i, j, acked = 0;
- unsigned long long tsc = 0, ntsc;
- long long max_loops = cpu_khz;
-
- if (cpu_has_tsc)
- rdtscll(tsc);
+ int max_loops = 5;
if (disable_apic) {
arch_disable_smp_support();
@@ -1229,11 +1224,7 @@ void __cpuinit setup_local_APIC(void)
acked);
break;
}
- if (cpu_has_tsc) {
- rdtscll(ntsc);
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
- } else
- max_loops--;
+ max_loops--;
} while (queued && max_loops > 0);
WARN_ON(max_loops <= 0);
--
1.6.5.2.143.g8cc62
Seems like a reasonable thing to do to me.