Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[PATCH 2/9] x86/fpu: Hard-disable lazy fpu mode

23 views
Skip to first unread message

ri...@redhat.com

unread,
Oct 4, 2016, 8:40:05 PM10/4/16
to
From: Andy Lutomirski <lu...@kernel.org>

Since commit 58122bf1d856 ("x86/fpu: Default eagerfpu=on on all
CPUs") in Linux 4.6, eager FPU mode has been the default on all x86
systems, and no one has reported any regressions.

This patch removes the ability to enable lazy mode: use_eager_fpu()
becomes "return true" and all of the FPU mode selection machinery is
removed.

Signed-off-by: Rik van Riel <ri...@redhat.com>
Signed-off-by: Andy Lutomirski <lu...@kernel.org>
---
arch/x86/include/asm/cpufeatures.h | 2 +-
arch/x86/include/asm/fpu/internal.h | 2 +-
arch/x86/kernel/fpu/init.c | 91 ++-----------------------------------
3 files changed, 5 insertions(+), 90 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1188bc849ee3..b212b862314a 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -104,7 +104,7 @@
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
-#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
+/* free, was #define X86_FEATURE_EAGER_FPU ( 3*32+29) * "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */

/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 2737366ea583..8852e3afa1ad 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -62,7 +62,7 @@ extern u64 fpu__get_supported_xfeatures_mask(void);
*/
static __always_inline __pure bool use_eager_fpu(void)
{
- return static_cpu_has(X86_FEATURE_EAGER_FPU);
+ return true;
}

static __always_inline __pure bool use_xsaveopt(void)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 2f2b8c7ccb85..1a09d133c801 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -15,10 +15,7 @@
*/
static void fpu__init_cpu_ctx_switch(void)
{
- if (!boot_cpu_has(X86_FEATURE_EAGER_FPU))
- stts();
- else
- clts();
+ clts();
}

/*
@@ -233,82 +230,16 @@ static void __init fpu__init_system_xstate_size_legacy(void)
}

/*
- * FPU context switching strategies:
- *
- * Against popular belief, we don't do lazy FPU saves, due to the
- * task migration complications it brings on SMP - we only do
- * lazy FPU restores.
- *
- * 'lazy' is the traditional strategy, which is based on setting
- * CR0::TS to 1 during context-switch (instead of doing a full
- * restore of the FPU state), which causes the first FPU instruction
- * after the context switch (whenever it is executed) to fault - at
- * which point we lazily restore the FPU state into FPU registers.
- *
- * Tasks are of course under no obligation to execute FPU instructions,
- * so it can easily happen that another context-switch occurs without
- * a single FPU instruction being executed. If we eventually switch
- * back to the original task (that still owns the FPU) then we have
- * not only saved the restores along the way, but we also have the
- * FPU ready to be used for the original task.
- *
- * 'lazy' is deprecated because it's almost never a performance win
- * and it's much more complicated than 'eager'.
- *
- * 'eager' switching is by default on all CPUs, there we switch the FPU
- * state during every context switch, regardless of whether the task
- * has used FPU instructions in that time slice or not. This is done
- * because modern FPU context saving instructions are able to optimize
- * state saving and restoration in hardware: they can detect both
- * unused and untouched FPU state and optimize accordingly.
- *
- * [ Note that even in 'lazy' mode we might optimize context switches
- * to use 'eager' restores, if we detect that a task is using the FPU
- * frequently. See the fpu->counter logic in fpu/internal.h for that. ]
- */
-static enum { ENABLE, DISABLE } eagerfpu = ENABLE;
-
-/*
* Find supported xfeatures based on cpu features and command-line input.
* This must be called after fpu__init_parse_early_param() is called and
* xfeatures_mask is enumerated.
*/
u64 __init fpu__get_supported_xfeatures_mask(void)
{
- /* Support all xfeatures known to us */
- if (eagerfpu != DISABLE)
- return XCNTXT_MASK;
-
- /* Warning of xfeatures being disabled for no eagerfpu mode */
- if (xfeatures_mask & XFEATURE_MASK_EAGER) {
- pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
- xfeatures_mask & XFEATURE_MASK_EAGER);
- }
-
- /* Return a mask that masks out all features requiring eagerfpu mode */
- return ~XFEATURE_MASK_EAGER;
-}
-
-/*
- * Disable features dependent on eagerfpu.
- */
-static void __init fpu__clear_eager_fpu_features(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_MPX);
+ return XCNTXT_MASK;
}

-/*
- * Pick the FPU context switching strategy:
- *
- * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
- * the following is true:
- *
- * (1) the cpu has xsaveopt, as it has the optimization and doing eager
- * FPU switching has a relatively low cost compared to a plain xsave;
- * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
- * switching. Should the kernel boot with noxsaveopt, we support MPX
- * with eager FPU switching at a higher cost.
- */
+/* Legacy code to initialize eager fpu mode. */
static void __init fpu__init_system_ctx_switch(void)
{
static bool on_boot_cpu __initdata = 1;
@@ -317,17 +248,6 @@ static void __init fpu__init_system_ctx_switch(void)
on_boot_cpu = 0;

WARN_ON_FPU(current->thread.fpu.fpstate_active);
-
- if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
- eagerfpu = ENABLE;
-
- if (xfeatures_mask & XFEATURE_MASK_EAGER)
- eagerfpu = ENABLE;
-
- if (eagerfpu == ENABLE)
- setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
-
- printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
}

/*
@@ -336,11 +256,6 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
- if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
- eagerfpu = DISABLE;
- fpu__clear_eager_fpu_features();
- }
-
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);

--
2.7.4

Paolo Bonzini

unread,
Oct 5, 2016, 3:20:06 AM10/5/16
to


On 05/10/2016 02:34, ri...@redhat.com wrote:
> From: Andy Lutomirski <lu...@kernel.org>
>
> Since commit 58122bf1d856 ("x86/fpu: Default eagerfpu=on on all
> CPUs") in Linux 4.6, eager FPU mode has been the default on all x86
> systems, and no one has reported any regressions.
>
> This patch removes the ability to enable lazy mode: use_eager_fpu()
> becomes "return true" and all of the FPU mode selection machinery is
> removed.

I haven't quite followed up on my promise to benchmark lazy vs. eager
FPU, but I probably should do that now...

I see two possible issues with this. First, AMD as far as I know does
not have XSAVEOPT. Second, when using virtualization, depending on how
you configure your cluster it's enough to have one pre-SandyBridge Intel
machine to force no XSAVE on all machines.

Thanks,

Paolo

Rik van Riel

unread,
Oct 5, 2016, 10:00:06 AM10/5/16
to
The "OPT" part of XSAVEOPT does not work across the
host/guest boundary, anyway.

One of the items used in the tuple that determines
whether the optimization can be used is whether
or not the system is in the VMX root, or in a guest.

In other words, across a VMEXIT / VMENTER boundary,
it does full saves & restores, if I am reading the
manual right.

--
All Rights Reversed.
signature.asc

Paolo Bonzini

unread,
Oct 5, 2016, 10:10:06 AM10/5/16
to


On 05/10/2016 15:57, Rik van Riel wrote:
> On Wed, 2016-10-05 at 09:14 +0200, Paolo Bonzini wrote:
>>
>> On 05/10/2016 02:34, ri...@redhat.com wrote:
>>>
>>> From: Andy Lutomirski <lu...@kernel.org>
>>>
>>> Since commit 58122bf1d856 ("x86/fpu: Default eagerfpu=on on all
>>> CPUs") in Linux 4.6, eager FPU mode has been the default on all x86
>>> systems, and no one has reported any regressions.
>>>
>>> This patch removes the ability to enable lazy mode: use_eager_fpu()
>>> becomes "return true" and all of the FPU mode selection machinery
>>> is
>>> removed.
>>
>> I haven't quite followed up on my promise to benchmark lazy vs. eager
>> FPU, but I probably should do that now...
>>
>> I see two possible issues with this. First, AMD as far as I know does
>> not have XSAVEOPT. Second, when using virtualization, depending on
>> how you configure your cluster it's enough to have one pre-SandyBridge
>> Intel machine to force no XSAVE on all machines.
>
> The "OPT" part of XSAVEOPT does not work across the
> host/guest boundary, anyway.

Yes, but it works for bare metal (and in fact eager FPU was keyed on
XSAVEOPT before 58122bf1d856, not XSAVE).

I'm not talking about KVM here; I am just saying that the lazy FPU code
might be used more than we'd like to, because of AMD machines and of
cases where XSAVE is hidden altogether from guests. Of course it is
quite unlikely that it be reported as a regression, since things just
work. But as far as I know 58122bf1d856 went in without any substantial
(or not-so-substantial) benchmarking.

Paolo

Andy Lutomirski

unread,
Oct 5, 2016, 12:10:05 PM10/5/16
to
I actually benchmarked the underlying instructions quite a bit on
Intel. (Not on AMD, but I doubt the results are very different.)
Writes to CR0.TS are *incredibly* slow, as are device-not-available
exceptions. Keep in mind that, while there's a (slow) CLTS
instruction, there is no corresponding STTS instruction, so we're left
with a fully serializing, slowly microcoded move to CR0. On SVM, I
think it's worse, because IIRC SVM doesn't have fancy execution
controls that let MOV to CR0 avoid exiting. We're talking a couple
hundred cycles best case for a TS set/clear pair, and thousands of
cycles if we actually take a fault.

In contrast, an unconditional XSAVE + XRSTOR was considerably faster.

This leads to the counterintuitive result that, if we switch from task
A to B and back and task A is heavily using the FPU, then it's faster
to unconditionally save and restore the full state both ways than it
is to set and clear TS so we can avoid it.

I would guess that the lazy mode hasn't been a win under most
workloads for many years. It's worse on 64-bit CPUs, since almost all
userspace uses XMM regs for memcpy. At least on 32-bit CPUs, SIMD
instructions weren't always available and userspace was conservative.

--Andy

Paolo Bonzini

unread,
Oct 5, 2016, 12:20:06 PM10/5/16
to


On 05/10/2016 17:59, Andy Lutomirski wrote:
> I actually benchmarked the underlying instructions quite a bit on
> Intel. (Not on AMD, but I doubt the results are very different.)
> Writes to CR0.TS are *incredibly* slow, as are device-not-available
> exceptions. Keep in mind that, while there's a (slow) CLTS
> instruction, there is no corresponding STTS instruction, so we're left
> with a fully serializing, slowly microcoded move to CR0. On SVM, I
> think it's worse, because IIRC SVM doesn't have fancy execution
> controls that let MOV to CR0 avoid exiting.

SVM lets you choose whether to trap on TS and MP; update_cr0_intercept
is where KVM does that (the "selective CR0 write" intercept is always
on, while the "CR0 write" intercept is toggled in that function).

> We're talking a couple
> hundred cycles best case for a TS set/clear pair, and thousands of
> cycles if we actually take a fault.
>
> In contrast, an unconditional XSAVE + XRSTOR was considerably faster.

Did you also do a comparison against FXSAVE/FXRSTOR (on either pre- or
post-SandyBridge processors)?

But yeah, it's possible that the lack of STTS screws the whole plan,
despite the fpu.preload optimization in switch_fpu_prepare.

Paolo

tip-bot for Andy Lutomirski

unread,
Oct 7, 2016, 5:50:06 AM10/7/16
to
Commit-ID: ca6938a1cd8a1c5e861a99b67f84ac166fc2b9e7
Gitweb: http://git.kernel.org/tip/ca6938a1cd8a1c5e861a99b67f84ac166fc2b9e7
Author: Andy Lutomirski <lu...@kernel.org>
AuthorDate: Tue, 4 Oct 2016 20:34:31 -0400
Committer: Ingo Molnar <mi...@kernel.org>
CommitDate: Fri, 7 Oct 2016 11:14:17 +0200

x86/fpu: Hard-disable lazy FPU mode

Since commit:

58122bf1d856 ("x86/fpu: Default eagerfpu=on on all CPUs")

... in Linux 4.6, eager FPU mode has been the default on all x86
systems, and no one has reported any regressions.

This patch removes the ability to enable lazy mode: use_eager_fpu()
becomes "return true" and all of the FPU mode selection machinery is
removed.

Signed-off-by: Andy Lutomirski <lu...@kernel.org>
Signed-off-by: Rik van Riel <ri...@redhat.com>
Cc: Borislav Petkov <b...@alien8.de>
Cc: Brian Gerst <brg...@gmail.com>
Cc: Dave Hansen <dave....@linux.intel.com>
Cc: Denys Vlasenko <dvla...@redhat.com>
Cc: Fenghua Yu <fengh...@intel.com>
Cc: H. Peter Anvin <h...@zytor.com>
Cc: Josh Poimboeuf <jpoi...@redhat.com>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Cc: Oleg Nesterov <ol...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Quentin Casasnovas <quentin.c...@oracle.com>
Cc: Thomas Gleixner <tg...@linutronix.de>
Cc: pbon...@redhat.com
Link: http://lkml.kernel.org/r/1475627678-20788-3-...@redhat.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
arch/x86/include/asm/cpufeatures.h | 2 +-
arch/x86/include/asm/fpu/internal.h | 2 +-
arch/x86/kernel/fpu/init.c | 91 ++-----------------------------------
3 files changed, 5 insertions(+), 90 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1188bc8..b212b86 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -104,7 +104,7 @@
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
-#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
+/* free, was #define X86_FEATURE_EAGER_FPU ( 3*32+29) * "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */

/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 2737366..8852e3a 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -62,7 +62,7 @@ extern u64 fpu__get_supported_xfeatures_mask(void);
*/
static __always_inline __pure bool use_eager_fpu(void)
{
- return static_cpu_has(X86_FEATURE_EAGER_FPU);
+ return true;
}

static __always_inline __pure bool use_xsaveopt(void)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 2f2b8c7..1a09d13 100644
0 new messages