
[RFC 1/5] x86: Clean up cr4 manipulation


Andy Lutomirski

Oct 14, 2014, 7:00:02 PM
CR4 manipulation was split, seemingly at random, between direct
(write_cr4) and using a helper (set/clear_in_cr4). Unfortunately,
the set_in_cr4 and clear_in_cr4 helpers also poke at the boot code,
which only a small subset of users actually wanted.

This patch replaces all cr4 access in functions that don't leave cr4
exactly the way they found it with new helpers cr4_set, cr4_clear,
and cr4_set_and_update_boot.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
arch/x86/include/asm/processor.h | 33 --------------------------------
arch/x86/include/asm/tlbflush.h | 37 ++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/virtext.h | 3 ++-
arch/x86/kernel/cpu/common.c | 10 +++++-----
arch/x86/kernel/cpu/mcheck/mce.c | 3 ++-
arch/x86/kernel/cpu/mcheck/p5.c | 3 ++-
arch/x86/kernel/cpu/mcheck/winchip.c | 3 ++-
arch/x86/kernel/cpu/perf_event.c | 7 ++++---
arch/x86/kernel/i387.c | 3 ++-
arch/x86/kernel/process.c | 5 +++--
arch/x86/kernel/xsave.c | 3 ++-
arch/x86/kvm/vmx.c | 4 ++--
arch/x86/mm/init.c | 4 ++--
arch/x86/xen/enlighten.c | 4 ++--
14 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index eb71ec794732..ddd8d13a010f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -578,39 +578,6 @@ static inline void load_sp0(struct tss_struct *tss,
#define set_iopl_mask native_set_iopl_mask
#endif /* CONFIG_PARAVIRT */

-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-extern u32 *trampoline_cr4_features;
-
-static inline void set_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features |= mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
-}
-
-static inline void clear_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features &= ~mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
-}
-
typedef struct {
unsigned long seg;
} mm_segment_t;
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 04905bfc508b..95b672f8b493 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,6 +15,43 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif

+/* Set in this cpu's CR4. */
+static inline void cr4_set(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = read_cr4();
+ cr4 |= mask;
+ write_cr4(cr4);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = read_cr4();
+ cr4 &= ~mask;
+ write_cr4(cr4);
+}
+
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. This should only be used
+ * during boot on the boot cpu.
+ */
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
+
+static inline void cr4_set_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set(mask);
+}
+
static inline void __native_flush_tlb(void)
{
native_write_cr3(native_read_cr3());
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 5da71c27cc59..7503a642294a 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@

#include <asm/vmx.h>
#include <asm/svm.h>
+#include <asm/tlbflush.h>

/*
* VMX functions:
@@ -40,7 +41,7 @@ static inline int cpu_has_vmx(void)
static inline void cpu_vmxoff(void)
{
asm volatile (ASM_VMX_VMXOFF : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear(X86_CR4_VMXE);
}

static inline int cpu_vmx_enabled(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e4ab2b42bd6f..7d8400a4b192 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -276,7 +276,7 @@ __setup("nosmep", setup_disable_smep);
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
- set_in_cr4(X86_CR4_SMEP);
+ cr4_set(X86_CR4_SMEP);
}

static __init int setup_disable_smap(char *arg)
@@ -296,9 +296,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)

if (cpu_has(c, X86_FEATURE_SMAP)) {
#ifdef CONFIG_X86_SMAP
- set_in_cr4(X86_CR4_SMAP);
+ cr4_set(X86_CR4_SMAP);
#else
- clear_in_cr4(X86_CR4_SMAP);
+ cr4_clear(X86_CR4_SMAP);
#endif
}
}
@@ -1307,7 +1307,7 @@ void cpu_init(void)

pr_debug("Initializing CPU#%d\n", cpu);

- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

/*
* Initialize the per-CPU GDT with the boot GDT,
@@ -1392,7 +1392,7 @@ void cpu_init(void)
printk(KERN_INFO "Initializing CPU#%d\n", cpu);

if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

load_current_idt();
switch_to_new_gdt(cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bd9ccda8087f..d1b4903d82bb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
#include <linux/export.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -1455,7 +1456,7 @@ static void __mcheck_cpu_init_generic(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | m_fl, &all_banks);

- set_in_cr4(X86_CR4_MCE);
+ cr4_set(X86_CR4_MCE);

rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..fed632fd79bb 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,7 @@
#include <linux/smp.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -59,7 +60,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
"Intel old style machine check architecture supported.\n");

/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
+ cr4_set(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..e4b151428915 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,6 +7,7 @@
#include <linux/types.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -31,7 +32,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);

- set_in_cr4(X86_CR4_MCE);
+ cr4_set(X86_CR4_MCE);

printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2879ecdaac43..2401593df4a3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,7 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
@@ -1326,7 +1327,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)

case CPU_STARTING:
if (x86_pmu.attr_rdpmc)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1832,9 +1833,9 @@ static void change_rdpmc(void *info)
bool enable = !!(unsigned long)info;

if (enable)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set(X86_CR4_PCE);
else
- clear_in_cr4(X86_CR4_PCE);
+ cr4_clear(X86_CR4_PCE);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a9a4229f6161..dc369b52e949 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,6 +13,7 @@
#include <asm/sigcontext.h>
#include <asm/processor.h>
#include <asm/math_emu.h>
+#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
@@ -180,7 +181,7 @@ void fpu_init(void)
if (cpu_has_xmm)
cr4_mask |= X86_CR4_OSXMMEXCPT;
if (cr4_mask)
- set_in_cr4(cr4_mask);
+ cr4_set(cr4_mask);

cr0 = read_cr0();
cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f804dc935d2a..d7e4c27b731f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
+#include <asm/tlbflush.h>

/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -139,7 +140,7 @@ void flush_thread(void)

static void hard_disable_TSC(void)
{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ cr4_set(X86_CR4_TSD);
}

void disable_TSC(void)
@@ -156,7 +157,7 @@ void disable_TSC(void)

static void hard_enable_TSC(void)
{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
+ cr4_clear(X86_CR4_TSD);
}

static void enable_TSC(void)
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 940b142cc11f..c4057fd56c38 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
+#include <asm/tlbflush.h>
#include <asm/xcr.h>

/*
@@ -452,7 +453,7 @@ static void prepare_fx_sw_frame(void)
*/
static inline void xstate_enable(void)
{
- set_in_cr4(X86_CR4_OSXSAVE);
+ cr4_set(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bfe11cf124a1..a48c26d01ab8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2737,7 +2737,7 @@ static int hardware_enable(void *garbage)
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+ cr4_set(X86_CR4_VMXE);

if (vmm_exclusive) {
kvm_cpu_vmxon(phys_addr);
@@ -2774,7 +2774,7 @@ static void hardware_disable(void *garbage)
vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear(X86_CR4_VMXE);
}

static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 66dba36f2343..f64386652bd5 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -144,11 +144,11 @@ static void __init probe_page_size_mask(void)

/* Enable PSE if available */
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_and_update_boot(X86_CR4_PSE);

/* Enable PGE if available */
if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c0cb11fb5008..9acf0af34ed1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1482,10 +1482,10 @@ static void xen_pvh_set_cr_flags(int cpu)
* set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
*/
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_and_update_boot(X86_CR4_PSE);

if (cpu_has_pge)
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_and_update_boot(X86_CR4_PGE);
}

/*
--
1.9.3


Andy Lutomirski

Oct 14, 2014, 7:00:02 PM
Hi Peter, etc,

This little series tightens up rdpmc permissions. With it applied,
rdpmc can only be used if a perf_event is actually mmapped. For now,
this is only really useful for seccomp.
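
To make the gating concrete: the usage being restricted is the self-monitoring
rdpmc pattern described in tools/perf/design.txt. The sketch below is an
illustration, not code from this series; error handling is omitted and the
event choice is arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static uint64_t read_pmc(uint32_t counter)
{
	uint32_t lo, hi;
	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct perf_event_attr attr;
	struct perf_event_mmap_page *pc;
	uint32_t seq, idx;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* Self-monitoring: count this task, on any CPU. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

	/* This mmap is what the series keys CR4.PCE on. */
	pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);

	do {
		seq = pc->lock;
		asm volatile("" ::: "memory");
		idx = pc->index;	/* 0 means rdpmc is not usable */
		count = pc->offset;
		if (idx)
			count += read_pmc(idx - 1);
		asm volatile("" ::: "memory");
	} while (pc->lock != seq);

	printf("cycles so far: %llu\n", (unsigned long long)count);
	return 0;
}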

At some point this could be further tightened up to only allow rdpmc
if an actual self-monitoring perf event that is compatible with
rdpmc is mapped. (Is the current code there even correct? What
happens if you set up a perf counter targeting a different pid and
then try to rdpmc it? Do you end up reading the right value or does
perf event context switching mess you up?)

This should add <50ns to context switches between rdpmc-capable and
rdpmc-non-capable mms. I suspect that this is well under 10%
overhead, given that perf already adds some context switch latency.

If needed, we could add a global switch that turns this behavior
off. We could also rethink the logic.

I think that patches 1-3 are a good idea regardless of any rdpmc changes.

(Please don't apply quite yet, because there'll be a merge conflict
with some changes that haven't landed yet. This is against 3.17.)

Andy Lutomirski (5):
x86: Clean up cr4 manipulation
x86: Store a per-cpu shadow copy of CR4
x86: Add a comment clarifying LDT context switching
perf: Add pmu callbacks to track event mapping and unmapping
x86,perf: Only allow rdpmc if a perf_event is mapped

arch/x86/include/asm/mmu.h | 2 +
arch/x86/include/asm/mmu_context.h | 30 +++++++++++++-
arch/x86/include/asm/processor.h | 33 ----------------
arch/x86/include/asm/tlbflush.h | 77 ++++++++++++++++++++++++++++++++----
arch/x86/include/asm/virtext.h | 3 +-
arch/x86/kernel/cpu/common.c | 17 +++++---
arch/x86/kernel/cpu/mcheck/mce.c | 3 +-
arch/x86/kernel/cpu/mcheck/p5.c | 3 +-
arch/x86/kernel/cpu/mcheck/winchip.c | 3 +-
arch/x86/kernel/cpu/perf_event.c | 61 +++++++++++++++++++++-------
arch/x86/kernel/head32.c | 1 +
arch/x86/kernel/head64.c | 2 +
arch/x86/kernel/i387.c | 3 +-
arch/x86/kernel/process.c | 5 ++-
arch/x86/kernel/xsave.c | 3 +-
arch/x86/kvm/vmx.c | 8 ++--
arch/x86/mm/init.c | 12 +++++-
arch/x86/mm/tlb.c | 3 --
arch/x86/xen/enlighten.c | 4 +-
include/linux/perf_event.h | 7 ++++
kernel/events/core.c | 9 +++++
21 files changed, 210 insertions(+), 79 deletions(-)

Andy Lutomirski

Oct 14, 2014, 7:00:03 PM
Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
include/linux/perf_event.h | 7 +++++++
kernel/events/core.c | 9 +++++++++
2 files changed, 16 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 707617a8c0f6..88069e319a3f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -206,6 +206,13 @@ struct pmu {
*/
int (*event_init) (struct perf_event *event);

+ /*
+ * Notification that the event was mapped or unmapped. Called
+ * in the context of the mapping task.
+ */
+ void (*event_mapped) (struct perf_event *event); /*optional*/
+ void (*event_unmapped) (struct perf_event *event); /*optional*/
+
#define PERF_EF_START 0x01 /* start the counter when adding */
#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 963bf139e2b2..dc73e9723748 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4008,6 +4008,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)

atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
}

/*
@@ -4027,6 +4030,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);

+ if (event->pmu->event_unmapped)
+ event->pmu->event_unmapped(event);
+
atomic_dec(&rb->mmap_count);

if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4228,6 +4234,9 @@ unlock:
vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;

+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
+
return ret;

Andy Lutomirski

Oct 14, 2014, 7:00:03 PM
Context switches and TLB flushes can change individual bits of CR4.
CR4 reads take several cycles, so store a shadow copy of CR4 in a
per-cpu variable.

To avoid wasting a cache line, I added the CR4 shadow to
cpu_tlbstate, which is already touched during context switches.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
arch/x86/include/asm/tlbflush.h | 52 ++++++++++++++++++++++++++++++-----------
arch/x86/kernel/cpu/common.c | 7 ++++++
arch/x86/kernel/head32.c | 1 +
arch/x86/kernel/head64.c | 2 ++
arch/x86/kvm/vmx.c | 4 ++--
arch/x86/mm/init.c | 8 +++++++
arch/x86/mm/tlb.c | 3 ---
7 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 95b672f8b493..a04cad4bcbc3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,14 +15,37 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif

+struct tlb_state {
+#ifdef CONFIG_SMP
+ struct mm_struct *active_mm;
+ int state;
+#endif
+
+ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
+ unsigned long cr4;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+ this_cpu_write(cpu_tlbstate.cr4, read_cr4());
+}
+
/* Set in this cpu's CR4. */
static inline void cr4_set(unsigned long mask)
{
unsigned long cr4;

- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if (!(cr4 & mask)) {
+ cr4 |= mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ write_cr4(cr4);
+ }
}

/* Clear in this cpu's CR4. */
@@ -30,9 +53,18 @@ static inline void cr4_clear(unsigned long mask)
{
unsigned long cr4;

- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if (cr4 & mask) {
+ cr4 &= ~mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ write_cr4(cr4);
+ }
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
}

/*
@@ -61,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
{
unsigned long cr4;

- cr4 = native_read_cr4();
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
@@ -221,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2

-struct tlb_state {
- struct mm_struct *active_mm;
- int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-
static inline void reset_lazy_tlbstate(void)
{
this_cpu_write(cpu_tlbstate.state, 0);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7d8400a4b192..ec73485b00c5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
#include <asm/vsyscall.h>
@@ -1285,6 +1286,12 @@ void cpu_init(void)
int i;

/*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ /*
* Load microcode on this cpu if a valid microcode is available.
* This is early microcode loading procedure.
*/
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b9836995..2911ef3a9f1c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)

asmlinkage __visible void __init i386_start_kernel(void)
{
+ cr4_init_shadow();
sanitize_boot_params(&boot_params);

/* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..3b241f0ca005 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -155,6 +155,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);

+ cr4_init_shadow();
+
/* Kill off the identity-map trampoline */
reset_early_page_tables();

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a48c26d01ab8..c3927506e0f4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2710,7 +2710,7 @@ static int hardware_enable(void *garbage)
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;

- if (read_cr4() & X86_CR4_VMXE)
+ if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;

INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
@@ -4237,7 +4237,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
struct desc_ptr dt;

vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
+ vmcs_writel(HOST_CR4, cr4_read_shadow()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */

vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f64386652bd5..866244267192 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -687,3 +687,11 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}

+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+ .active_mm = &init_mm,
+ .state = 0,
+#endif
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ee61c36d64f8..3250f2371aea 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,9 +14,6 @@
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
- = { &init_mm, 0, };
-
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.

Andy Lutomirski

Oct 14, 2014, 7:00:03 PM
The code is correct, but only for a rather subtle reason. This
confused me for quite a while when I read switch_mm, so clarify the
code to avoid confusing other people, too.

TBH, I wouldn't be surprised if this code was only correct by
accident.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
arch/x86/include/asm/mmu_context.h | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 166af2a8e865..04478103df37 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -53,7 +53,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));

- /* Load the LDT, if the LDT is different: */
+ /*
+ * Load the LDT, if the LDT is different.
+ *
+ * It's possible leave_mm(prev) has been called. If so,
+ * then prev->context.ldt could be out of sync with the
+ * LDT descriptor or the LDT register. This can only happen
+ * if prev->context.ldt is non-null, since we never free
+ * an LDT. But LDTs can't be shared across mms, so
+ * prev->context.ldt won't be equal to next->context.ldt.
+ */
if (unlikely(prev->context.ldt != next->context.ldt))
load_LDT_nolock(&next->context);

Andy Lutomirski

Oct 14, 2014, 7:00:06 PM
We currently allow any process to use rdpmc. This significantly
weakens the protection offered by PR_TSC_DISABLED, and it could be
helpful to users attempting to exploit timing attacks.

Since we can't enable access to individual counters, use a very
coarse heuristic to limit access to rdpmc: allow access only when
a perf_event is mmapped. This protects seccomp sandboxes.

There is plenty of room to further tighten these restrictions. For
example, on x86, *all* perf_event mappings set cap_user_rdpmc. This
should probably be changed to only apply to perf_events that are
accessible using rdpmc.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
arch/x86/include/asm/mmu.h | 2 ++
arch/x86/include/asm/mmu_context.h | 19 ++++++++++++
arch/x86/kernel/cpu/perf_event.c | 60 +++++++++++++++++++++++++++++---------
3 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 876e74e8eec7..2e5e4925a63f 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -19,6 +19,8 @@ typedef struct {

struct mutex lock;
void __user *vdso;
+
+ atomic_t perf_mmap_count; /* number of mmapped perf_events */
} mm_context_t;

#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 04478103df37..10239af45a9d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -19,6 +19,21 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
}
#endif /* !CONFIG_PARAVIRT */

+#ifdef CONFIG_PERF_EVENTS
+extern struct static_key rdpmc_enabled;
+
+static inline void load_mm_cr4(struct mm_struct *mm)
+{
+ if (static_key_true(&rdpmc_enabled) &&
+ atomic_read(&mm->context.perf_mmap_count))
+ cr4_set(X86_CR4_PCE);
+ else
+ cr4_clear(X86_CR4_PCE);
+}
+#else
+static inline void load_mm_cr4(struct mm_struct *mm) {}
+#endif
+
/*
* Used for LDT copy/destruction.
*/
@@ -53,6 +68,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));

+ /* Load per-mm CR4 state */
+ load_mm_cr4(next);
+
/*
* Load the LDT, if the LDT is different.
*
@@ -86,6 +104,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ load_mm_cr4(next);
load_LDT_nolock(&next->context);
}
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2401593df4a3..7d42cf60f3b3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,7 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
@@ -44,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};

+struct static_key rdpmc_enabled = STATIC_KEY_INIT_TRUE;
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -133,6 +136,7 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);
+static DEFINE_MUTEX(rdpmc_enable_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

@@ -1326,8 +1330,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;

case CPU_STARTING:
- if (x86_pmu.attr_rdpmc)
- cr4_set(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1806,6 +1808,27 @@ static int x86_pmu_event_init(struct perf_event *event)
return err;
}

+static void refresh_pce(void *ignored)
+{
+ if (current->mm)
+ load_mm_cr4(current->mm);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+ if (atomic_inc_return(&current->mm->context.perf_mmap_count) == 1)
+ on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+ if (!current->mm)
+ return;
+
+ if (atomic_dec_and_test(&current->mm->context.perf_mmap_count))
+ on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
static int x86_pmu_event_idx(struct perf_event *event)
{
int idx = event->hw.idx;
@@ -1828,16 +1851,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev,
return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

-static void change_rdpmc(void *info)
-{
- bool enable = !!(unsigned long)info;
-
- if (enable)
- cr4_set(X86_CR4_PCE);
- else
- cr4_clear(X86_CR4_PCE);
-}
-
static ssize_t set_attr_rdpmc(struct device *cdev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -1852,10 +1865,26 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
if (x86_pmu.attr_rdpmc_broken)
return -ENOTSUPP;

+ mutex_lock(&rdpmc_enable_mutex);
if (!!val != !!x86_pmu.attr_rdpmc) {
- x86_pmu.attr_rdpmc = !!val;
- on_each_cpu(change_rdpmc, (void *)val, 1);
+ if (val) {
+ static_key_slow_inc(&rdpmc_enabled);
+ on_each_cpu(refresh_pce, NULL, 1);
+ smp_wmb();
+ x86_pmu.attr_rdpmc = 1;
+ } else {
+ /*
+ * This direction can race against existing
+ * rdpmc-capable mappings. Try our best regardless.
+ */
+ x86_pmu.attr_rdpmc = 0;
+ smp_wmb();
+ static_key_slow_dec(&rdpmc_enabled);
+ WARN_ON(static_key_true(&rdpmc_enabled));
+ on_each_cpu(refresh_pce, NULL, 1);
+ }
}
+ mutex_unlock(&rdpmc_enable_mutex);

return count;
}
@@ -1899,6 +1928,9 @@ static struct pmu pmu = {

.event_init = x86_pmu_event_init,

+ .event_mapped = x86_pmu_event_mapped,
+ .event_unmapped = x86_pmu_event_unmapped,
+
.add = x86_pmu_add,
.del = x86_pmu_del,
.start = x86_pmu_start,

Hillf Danton

Oct 15, 2014, 3:40:02 AM
Hey Andy
>
> Context switches and TLB flushes can change individual bits of CR4.
> CR4 reads take several cycles, so store a shadow copy of CR4 in a
> per-cpu variable.
>
> To avoid wasting a cache line, I added the CR4 shadow to
> cpu_tlbstate, which is already touched during context switches.
>
> Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
> ---
What if cr4 contains bit_A and mask contains bits A and B?

Hillf
> + cr4 |= mask;
> + this_cpu_write(cpu_tlbstate.cr4, cr4);
> + write_cr4(cr4);
> + }
> }
>
[...]

Andy Lutomirski

Oct 15, 2014, 10:50:02 AM
A malfunction. Whoops :) Will fix.

--Andy
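
For reference, the issue Hillf points at is that the shadow-CR4 version of
cr4_set() in patch 2 skips the CR4 write whenever any of the requested bits
is already set, so a second bit in the same mask would be lost. Below is a
minimal sketch of a check that only skips the write when every requested bit
is already present; this is an assumption about what a fix could look like,
not the actual v2 patch.

static inline void cr4_set(unsigned long mask)
{
	unsigned long cr4 = this_cpu_read(cpu_tlbstate.cr4);

	/* Skip the expensive write only if all bits in mask are already set. */
	if ((cr4 | mask) != cr4) {
		cr4 |= mask;
		this_cpu_write(cpu_tlbstate.cr4, cr4);
		write_cr4(cr4);
	}
}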

Peter Zijlstra

Oct 16, 2014, 4:20:03 AM
On Tue, Oct 14, 2014 at 03:57:35PM -0700, Andy Lutomirski wrote:
> +/* Set in this cpu's CR4. */
> +static inline void cr4_set(unsigned long mask)
> +{
> + unsigned long cr4;
> +
> + cr4 = read_cr4();
> + cr4 |= mask;
> + write_cr4(cr4);
> +}
> +
> +/* Clear in this cpu's CR4. */
> +static inline void cr4_clear(unsigned long mask)
> +{
> + unsigned long cr4;
> +
> + cr4 = read_cr4();
> + cr4 &= ~mask;
> + write_cr4(cr4);
> +}

I would have called these cr4_or() and cr4_nand(). When first reading
this I expected cr4_set() to be a pure write_cr4() and cr4_clear() to do
write_cr4(0) -- which obviously doesn't make too much sense.


cr4_{set,clear}_mask() might maybe be clearer but is more typing, and
the logical ops as suggested should have unambiguous meaning.

Peter Zijlstra

Oct 16, 2014, 4:30:02 AM
On Tue, Oct 14, 2014 at 03:57:36PM -0700, Andy Lutomirski wrote:
> Context switches and TLB flushes can change individual bits of CR4.
> CR4 reads take several cycles, so store a shadow copy of CR4 in a
> per-cpu variable.
>
> To avoid wasting a cache line, I added the CR4 shadow to
> cpu_tlbstate, which is already touched during context switches.

I'm a little confused. We should be more specific, I suppose: context
switches don't always change the mm, but CR4 state is per task.

From a quick look, only switch_mm() pokes at tlb, switch_to() does not.

Peter Zijlstra

Oct 16, 2014, 4:50:01 AM
On Tue, Oct 14, 2014 at 03:57:39PM -0700, Andy Lutomirski wrote:
> We currently allow any process to use rdpmc. This significantly
> weakens the protection offered by PR_TSC_DISABLED, and it could be
> helpful to users attempting to exploit timing attacks.
>
> Since we can't enable access to individual counters, use a very
> coarse heuristic to limit access to rdpmc: allow access only when
> a perf_event is mmapped. This protects seccomp sandboxes.
>
> There is plenty of room to further tighten these restrictions. For
> example, on x86, *all* perf_event mappings set cap_user_rdpmc. This
> should probably be changed to only apply to perf_events that are
> accessible using rdpmc.

So I suppose this patch is a little over engineered,
why do you care about that rdpmc_enabled static key thing? Also you
should not expose static key control to userspace like this, they can
totally wreck the system. At the very least it should be
static_key_slow_dec_deferred() -- gawd I hate the static_key API.

Borislav Petkov

Oct 16, 2014, 7:20:02 AM
Or maybe ...{set,clear}_bits() because we're setting/clearing a mask of
a bunch of bits...

--
Regards/Gruss,
Boris.
--

Borislav Petkov

Oct 16, 2014, 7:40:02 AM
On Thu, Oct 16, 2014 at 01:18:58PM +0200, Borislav Petkov wrote:
> On Thu, Oct 16, 2014 at 10:16:33AM +0200, Peter Zijlstra wrote:
> > On Tue, Oct 14, 2014 at 03:57:35PM -0700, Andy Lutomirski wrote:
> > > +/* Set in this cpu's CR4. */
> > > +static inline void cr4_set(unsigned long mask)
> > > +{
> > > + unsigned long cr4;
> > > +
> > > + cr4 = read_cr4();
> > > + cr4 |= mask;
> > > + write_cr4(cr4);
> > > +}
> > > +
> > > +/* Clear in this cpu's CR4. */
> > > +static inline void cr4_clear(unsigned long mask)
> > > +{
> > > + unsigned long cr4;
> > > +
> > > + cr4 = read_cr4();
> > > + cr4 &= ~mask;
> > > + write_cr4(cr4);
> > > +}

Btw, on a related note, we probably should check whether the bits we
want to set/clear are already set/cleared and then save us the CR4
write.

It probably won't be the case all that much but still...

Borislav Petkov

Oct 16, 2014, 7:50:02 AM
On Tue, Oct 14, 2014 at 03:57:36PM -0700, Andy Lutomirski wrote:
> Context switches and TLB flushes can change individual bits of CR4.
> CR4 reads take several cycles, so store a shadow copy of CR4 in a
> per-cpu variable.
>
> To avoid wasting a cache line, I added the CR4 shadow to
> cpu_tlbstate, which is already touched during context switches.

So does this even show in any workloads as any improvement?

Also, what's the rule with reading the shadow CR4? kvm only? Because
svm_set_cr4() in svm.c reads the host CR4 too.

Should we make all code access the shadow CR4 maybe...

--
Regards/Gruss,
Boris.
--

Andy Lutomirski

Oct 16, 2014, 11:40:02 AM
That's in patch 2. I can move it here, though.

>
> --
> Regards/Gruss,
> Boris.
> --



--
Andy Lutomirski
AMA Capital Management, LLC

Andy Lutomirski

Oct 16, 2014, 11:40:03 AM
On Thu, Oct 16, 2014 at 1:42 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Tue, Oct 14, 2014 at 03:57:39PM -0700, Andy Lutomirski wrote:
>> We currently allow any process to use rdpmc. This significantly
>> weakens the protection offered by PR_TSC_DISABLED, and it could be
>> helpful to users attempting to exploit timing attacks.
>>
>> Since we can't enable access to individual counters, use a very
>> coarse heuristic to limit access to rdpmc: allow access only when
>> a perf_event is mmapped. This protects seccomp sandboxes.
>>
>> There is plenty of room to further tighten these restrictions. For
>> example, on x86, *all* perf_event mappings set cap_user_rdpmc. This
>> should probably be changed to only apply to perf_events that are
>> accessible using rdpmc.
>
> So I suppose this patch is a little over engineered,

:) I won't argue.
This particular control is only available to root, so I don't think it
matters too much. I did it this way to avoid hitting an extra dcache
line on every switch_mm.

A nicer solution might be to track whether rdpmc is allowed for each
perf_event and to count the number that allow rdpmc. That would cause
'echo 0 > rdpmc' to only work for new perf_events, but it fixes a
race.

Doing this will require passing more info to
arch_perf_update_userpage, I think. Should I do that? It'll probably
get a better result, but this patchset will get even longer and even
more overengineered.

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

Andy Lutomirski

Oct 16, 2014, 11:40:03 AM
On Oct 16, 2014 4:49 AM, "Borislav Petkov" <b...@alien8.de> wrote:
>
> On Tue, Oct 14, 2014 at 03:57:36PM -0700, Andy Lutomirski wrote:
> > Context switches and TLB flushes can change individual bits of CR4.
> > CR4 reads take several cycles, so store a shadow copy of CR4 in a
> > per-cpu variable.
> >
> > To avoid wasting a cache line, I added the CR4 shadow to
> > cpu_tlbstate, which is already touched during context switches.
>
> So does this even show in any workloads as any improvement?
>

Unclear. Assuming that mov from cr4 is more expensive than a cache
miss (which may or may not be true), kernel TLB flushes will get
cheaper. The main reason I did this is to make switching TSD and PCE
a little cheaper, which might be worthwhile.

I think this may be a huge win on many workloads when running as an
SVM guest. IIUC SVM doesn't have accelerated guest CR4 access.

> Also, what's the rule with reading the shadow CR4? kvm only? Because
> svm_set_cr4() in svm.c reads the host CR4 too.

Whoops.

>
> Should we make all code access the shadow CR4 maybe...

That was the intent. In v2, I'll probably rename read_cr4 to
__read_cr4 to make it difficult to miss things.

Borislav Petkov

Oct 16, 2014, 11:50:02 AM
On Tue, Oct 14, 2014 at 03:57:37PM -0700, Andy Lutomirski wrote:
> The code is correct, but only for a rather subtle reason. This
> confused me for quite a while when I read switch_mm, so clarify the
> code to avoid confusing other people, too.
>
> TBH, I wouldn't be surprised if this code was only correct by
> accident.
>
> Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
> ---
> arch/x86/include/asm/mmu_context.h | 11 ++++++++++-
> 1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 166af2a8e865..04478103df37 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -53,7 +53,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> /* Stop flush ipis for the previous mm */
> cpumask_clear_cpu(cpu, mm_cpumask(prev));
>
> - /* Load the LDT, if the LDT is different: */
> + /*
> + * Load the LDT, if the LDT is different.
> + *
> + * It's possible leave_mm(prev) has been called. If so,
> + * then prev->context.ldt could be out of sync with the
> + * LDT descriptor or the LDT register. This can only happen

I'm staring at the code and trying to figure out where on the leave_mm()
path this could happen. Got any code pointers?

:-)

> + * if prev->context.ldt is non-null, since we never free
> + * an LDT. But LDTs can't be shared across mms, so
> + * prev->context.ldt won't be equal to next->context.ldt.
> + */
> if (unlikely(prev->context.ldt != next->context.ldt))
> load_LDT_nolock(&next->context);
> }
> --
> 1.9.3
>
>

--
Regards/Gruss,
Boris.
--

Borislav Petkov

Oct 16, 2014, 11:50:03 AM
On Thu, Oct 16, 2014 at 08:32:57AM -0700, Andy Lutomirski wrote:
> That's in patch 2. I can move it here, though.

Right, saw it after sending. And nah, don't bother.

--
Regards/Gruss,
Boris.
--
--

Borislav Petkov

Oct 16, 2014, 12:00:02 PM
On Thu, Oct 16, 2014 at 10:42:27AM +0200, Peter Zijlstra wrote:
> gawd I hate the static_key API.

Yahaa, I like to twist my brain for fun just by staring at
sched_clock_stable() logic.

Oh, and it needs fixing btw because if you look at the asm that comes
out, after the jump labels have run, you get an unconditional jump to
the likely code doing RDTSC.

This should be avoided and we should have the likely code be laid out
first with the NOP and have the jump labels patch in the unconditional
JMP only on the small amount of systems where we're unlikely to be using
the TSC... For that we'd need to pound on jump labels a bit more though.

--
Regards/Gruss,
Boris.
--

Andy Lutomirski

Oct 16, 2014, 12:30:02 PM
I think it's the same as in the other case in switch_mm. leave_mm
does cpumask_clear_cpu(cpu, mm_cpumask(active_mm)), and, once that has
happened, modify_ldt won't send an IPI to this CPU. So, if leave_mm
runs, and then another CPU calls modify_ldt on the mm that is in lazy
mode here, it won't update our LDT register, so the LDT register and
prev->context.ldt might not match.

I think that, if that's the case, then prev->context.ldt can't be NULL
and can't have the same non-NULL value as next->context.ldt. I hope.

I'll try to make the comment clearer in v2.

--Andy

>
> :-)
>
>> + * if prev->context.ldt is non-null, since we never free
>> + * an LDT. But LDTs can't be shared across mms, so
>> + * prev->context.ldt won't be equal to next->context.ldt.
>> + */
>> if (unlikely(prev->context.ldt != next->context.ldt))
>> load_LDT_nolock(&next->context);
>> }
>> --
>> 1.9.3
>>
>>
>
> --
> Regards/Gruss,
> Boris.
> --



--
Andy Lutomirski
AMA Capital Management, LLC

Andy Lutomirski

Oct 16, 2014, 8:10:01 PM
The current cap_user_rdpmc code seems rather confused to me. On x86,
*all* events set cap_user_rdpmc if the global rdpmc control is set.
But only x86_pmu events define .event_idx, so amd uncore events won't
actually expose their rdpmc index to userspace.

Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
that gets set on all events created while rdpmc == 1, to change
x86_pmu_event_idx to do something like:

if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
return event->hw.event_base_rdpmc + 1;
else
return 0;

and to change arch_perf_update_userpage cap_user_rdpmc to match
PERF_X86_EVENT_RDPMC_PERMITTED?

Then we could ditch the static key and greatly simplify writes to the
rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.

This would be a user-visible change on AMD, and I can't test it.
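
A rough sketch of that proposal is below. It is hypothetical: the flag value
is made up, and, as discussed later in the thread, arch_perf_update_userpage
would also need to learn about the event so it can set cap_user_rdpmc to
match.

/* Hypothetical flag bit, set at event creation time iff rdpmc was enabled. */
#define PERF_X86_EVENT_RDPMC_PERMITTED	0x0800

static int x86_pmu_event_idx(struct perf_event *event)
{
	/* Events created while rdpmc was off never expose an index. */
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED))
		return 0;

	/* Non-zero index: userspace reads the counter with rdpmc(idx - 1). */
	return event->hw.event_base_rdpmc + 1;
}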


On a semi-related note: would this all be nicer if there were vdso
function __u64 __vdso_perf_event__read_count(int fd, void *userpage)?
This is very easy to do nowadays. If we got *really* fancy, it would
be possible to have an rdpmc_safe in the vdso, which has some
benefits, although it would be a bit evil and wouldn't work if
userspace tracers like pin are in use.

--Andy

Andy Lutomirski

Oct 19, 2014, 4:30:01 PM
On Thu, Oct 16, 2014 at 5:00 PM, Andy Lutomirski <lu...@amacapital.net> wrote:
> The current cap_user_rdpmc code seems rather confused to me. On x86,
> *all* events set cap_user_rdpmc if the global rdpmc control is set.
> But only x86_pmu events define .event_idx, so amd uncore events won't
> actually expose their rdpmc index to userspace.
>
> Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
> that gets set on all events created while rdpmc == 1, to change
> x86_pmu_event_idx to do something like:
>
> if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
> return event->hw.event_base_rdpmc + 1;
> else
> return 0;
>
> and to change arch_perf_update_userpage cap_user_rdpmc to match
> PERF_X86_EVENT_RDPMC_PERMITTED?
>
> Then we could ditch the static key and greatly simplify writes to the
> rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.
>
> This would be a user-visible change on AMD, and I can't test it.
>
>
> On a semi-related note: would this all be nicer if there were vdso
> function __u64 __vdso_perf_event__read_count(int fd, void *userpage)?
> This is very easy to do nowadays. If we got *really* fancy, it would
> be possible to have an rdpmc_safe in the vdso, which has some
> benefits, although it would be a bit evil and wouldn't work if
> userspace tracers like pin are in use.
>

Also, I don't understand the purpose of cap_user_time. Wouldn't it be
easier to just record the last CLOCK_MONOTONIC time and let the user
call __vdso_clock_gettime if they need an updated time?

Peter Zijlstra

Oct 19, 2014, 5:40:01 PM
On Thu, Oct 16, 2014 at 05:00:56PM -0700, Andy Lutomirski wrote:
> The current cap_user_rdpmc code seems rather confused to me. On x86,
> *all* events set cap_user_rdpmc if the global rdpmc control is set.
> But only x86_pmu events define .event_idx, so amd uncore events won't
> actually expose their rdpmc index to userspace.
>
> Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
> that gets set on all events created while rdpmc == 1, to change
> x86_pmu_event_idx to do something like:
>
> if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
> return event->hw.event_base_rdpmc + 1;
> else
> return 0;
>
> and to change arch_perf_update_userpage cap_user_rdpmc to match
> PERF_X86_EVENT_RDPMC_PERMITTED?
>
> Then we could ditch the static key and greatly simplify writes to the
> rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.
>
> This would be a user-visible change on AMD, and I can't test it.

I have AMD hardware to test this. But yes something like that seems
fine.

Peter Zijlstra

Oct 19, 2014, 5:40:01 PM
Because perf doesn't use CLOCK_MONOTONIC. Due to performance
considerations we used the sched_clock stuff, which tries its best to
make the best of the TSC without reverting to HPET and the like.

Not to mention that CLOCK_MONOTONIC was not available from NMI context
until very recently.

Also, things like c73deb6aecda ("perf/x86: Add ability to calculate TSC
from perf sample timestamps") seem to suggest people actually use TSC
for things as well.

Now we might change to using the new NMI safe CLOCK_MONOTONIC (with a
fallback to use the sched_clock stuff on time challenged hardware) in
order to ease the correlation between other trace thingies, but even
then it makes sense to have this, having it here and reading the TSC
within the seqcount loop ensures you've got consistent data and touch
less cachelines for reading.
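
Concretely, the cap_user_time fields let a self-monitoring reader turn a TSC
value read inside that same pc->lock seqcount loop into enabled/running time.
A sketch of the scaling documented in the perf_event_mmap_page comments
follows; field names come from the uapi header, and the surrounding seqcount
loop and capability checks are omitted.

/*
 * pc is the mmapped perf_event_mmap_page, cyc a TSC value read inside
 * the pc->lock seqcount loop, with pc->cap_user_time set.
 */
static uint64_t mmap_read_enabled(const struct perf_event_mmap_page *pc,
				  uint64_t cyc)
{
	uint64_t quot = cyc >> pc->time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << pc->time_shift) - 1);
	uint64_t delta = pc->time_offset + quot * pc->time_mult +
			 ((rem * pc->time_mult) >> pc->time_shift);

	/* time_running is advanced by the same delta in the same way. */
	return pc->time_enabled + delta;
}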

Andy Lutomirski

Oct 19, 2014, 6:10:03 PM
I'm only talking about the userspace access to when an event was
enabled and how long it's been running. I think that's what the
cap_user_time stuff is for. I don't think those parameters are
touched from NMI, right?

Point taken about sched_clock, though.

>
> Also, things like c73deb6aecda ("perf/x86: Add ability to calculate TSC
> from perf sample timestamps") seem to suggest people actually use TSC
> for things as well.
>
> Now we might change to using the new NMI safe CLOCK_MONOTONIC (with a
> fallback to use the sched_clock stuff on time challenged hardware) in
> order to ease the correlation between other trace thingies, but even
> then it makes sense to have this, having it here and reading the TSC
> within the seqcount loop ensures you've got consistent data and touch
> less cachelines for reading.

True.

OTOH, people (i.e. I) have optimized the crap out of
__vdso_clock_gettime, and __vdso_perf_event_whatever could be
similarly optimized.

--Andy

Peter Zijlstra

Oct 19, 2014, 6:30:02 PM
On Sun, Oct 19, 2014 at 03:05:42PM -0700, Andy Lutomirski wrote:
> On Oct 19, 2014 2:33 PM, "Peter Zijlstra" <pet...@infradead.org> wrote:

> > > Also, I don't understand the purpose of cap_user_time. Wouldn't it be
> > > easier to just record the last CLOCK_MONOTONIC time and let the user
> > > call __vdso_clock_gettime if they need an updated time?
> >
> > Because perf doesn't use CLOCK_MONOTONIC. Due to performance
> > considerations we used the sched_clock stuff, which tries its best to
> > make the best of the TSC without reverting to HPET and the like.
> >
> > Not to mention that CLOCK_MONOTONIC was not available from NMI context
> > until very recently.
>
> I'm only talking about the userspace access to when an event was
> enabled and how long it's been running. I think that's what the
> cap_user_time stuff is for. I don't think those parameters are
> touched from NMI, right?
>
> Point taken about sched_clock, though.

Well, mixing two time bases, one TSC based and one CLOCK_MONOTONIC is
just asking for trouble IMO ;-)

> > Also, things like c73deb6aecda ("perf/x86: Add ability to calculate TSC
> > from perf sample timestamps") seem to suggest people actually use TSC
> > for things as well.
> >
> > Now we might change to using the new NMI safe CLOCK_MONOTONIC (with a
> > fallback to use the sched_clock stuff on time challenged hardware) in
> > order to ease the correlation between other trace thingies, but even
> > then it makes sense to have this, having it here and reading the TSC
> > within the seqcount loop ensures you've got consistent data and touch
> > less cachelines for reading.
>
> True.
>
> OTOH, people (i.e. I) have optimized the crap out of
> __vdso_clock_gettime, and __vdso_perf_event_whatever could be
> similarly optimized.

Maybe, but at that point we commit to yet another ABI... I'd rather just
put a 'sane' implementation in a library or so.

Andy Lutomirski

Oct 19, 2014, 7:00:01 PM
On Sun, Oct 19, 2014 at 3:20 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Sun, Oct 19, 2014 at 03:05:42PM -0700, Andy Lutomirski wrote:
>> On Oct 19, 2014 2:33 PM, "Peter Zijlstra" <pet...@infradead.org> wrote:
>
>> > > Also, I don't understand the purpose of cap_user_time. Wouldn't it be
>> > > easier to just record the last CLOCK_MONOTONIC time and let the user
>> > > call __vdso_clock_gettime if they need an updated time?
>> >
>> > Because perf doesn't use CLOCK_MONOTONIC. Due to performance
>> > considerations we used the sched_clock stuff, which tries its best to
>> > make the best of the TSC without reverting to HPET and the like.
>> >
>> > Not to mention that CLOCK_MONOTONIC was not available from NMI context
>> > until very recently.
>>
>> I'm only talking about the userspace access to when an event was
>> enabled and how long it's been running. I think that's what the
>> cap_user_time stuff is for. I don't think those parameters are
>> touched from NMI, right?
>>
>> Point taken about sched_clock, though.
>
> Well, mixing two time bases, one TSC based and one CLOCK_MONOTONIC is
> just asking for trouble IMO ;-)

Fair enough.

>
>> > Also, things like c73deb6aecda ("perf/x86: Add ability to calculate TSC
>> > from perf sample timestamps") seem to suggest people actually use TSC
>> > for things as well.
>> >
>> > Now we might change to using the new NMI safe CLOCK_MONOTONIC (with a
>> > fallback to use the sched_clock stuff on time challenged hardware) in
>> > order to ease the correlation between other trace thingies, but even
>> > then it makes sense to have this, having it here and reading the TSC
>> > within the seqcount loop ensures you've got consistent data and touch
>> > less cachelines for reading.
>>
>> True.
>>
>> OTOH, people (i.e. I) have optimized the crap out of
>> __vdso_clock_gettime, and __vdso_perf_event_whatever could be
>> similarly optimized.
>
> Maybe, but at that point we commit to yet another ABI... I'd rather just
> put a 'sane' implementation in a library or so.

This cuts both ways, though. For vdso timekeeping, the underlying
data structure has changed repeatedly, sometimes to add features, and
sometimes for performance, and the vdso has done a good job insulating
userspace from it. (In fact, until 3.16, even the same exact kernel
version couldn't be relied on to have the same data structure with
different configs, and even now, no one really wants to teach user
libraries how to parse the pvclock data structures.)

I would certainly not suggest putting anything beyond the bare minimum
into the vdso.

FWIW, something should probably specify exactly when it's safe to try
a userspace rdpmc. I think that the answer is that, for a perf event
watching a pid, only that pid can do it (in particular, other threads
must not try). For a perf event monitoring a whole cpu, the answer is
less clear to me.

--Andy

Andy Lutomirski

Oct 19, 2014, 8:10:02 PM
On Sun, Oct 19, 2014 at 2:35 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Thu, Oct 16, 2014 at 05:00:56PM -0700, Andy Lutomirski wrote:
>> The current cap_user_rdpmc code seems rather confused to me. On x86,
>> *all* events set cap_user_rdpmc if the global rdpmc control is set.
>> But only x86_pmu events define .event_idx, so amd uncore events won't
>> actually expose their rdpmc index to userspace.
>>
>> Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
>> that gets set on all events created while rdpmc == 1, to change
>> x86_pmu_event_idx to do something like:
>>
>> if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
>> return event->hw.event_base_rdpmc + 1;
>> else
>> return 0;
>>
>> and to change arch_perf_update_userpage cap_user_rdpmc to match
>> PERF_X86_EVENT_RDPMC_PERMITTED?
>>
>> Then we could ditch the static key and greatly simplify writes to the
>> rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.
>>
>> This would be a user-visible change on AMD, and I can't test it.
>
> I have AMD hardware to test this. But yes something like that seems
> fine.

Before I totally screw this up: is .event_idx used for anything except
userspace rdpmc? There are a whole bunch of implementations of that
callback:

- perf_event_idx_default seems fishy
- power_pmu_event_idx seems even fishier
- cpumsf_pmu_event_idx is the same as power_pmu_event_idx.
- perf_swevent_event_idx returns 0.

etc.

x86 is the only implementation of arch_perf_update_userpage, which
makes me think that the .event_idx callback should just be removed and
that arch_perf_update_userpage should be responsible for filling it in
if needed.

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

Peter Zijlstra

Oct 20, 2014, 4:40:02 AM
On Sun, Oct 19, 2014 at 03:57:54PM -0700, Andy Lutomirski wrote:
> > Maybe, but at that point we commit to yet another ABI... I'd rather just
> > put a 'sane' implementation in a library or so.
>
> This cuts both ways, though. For vdso timekeeping, the underlying
> data structure has changed repeatedly, sometimes to add features, and
> sometimes for performance, and the vdso has done a good job insulating
> userspace from it. (In fact, until 3.16, even the same exact kernel
> version couldn't be relied on to have the same data structure with
> different configs, and even now, no one really wants to teach user
> libraries how to parse the pvclock data structures.)

Fair enough, but as it stands we've already committed to the data
structure exposed to userspace.

> I would certainly not suggest putting anything beyond the bare minimum
> into the vdso.

Depends on what you really want to do, I suppose: if you've got a pinned
event and know there cannot be multiplexing, not doing the time reads,
the multiplications and all that saves a ton of cycles. But in the
generic case I suppose you have to do all that.

> FWIW, something should probably specify exactly when it's safe to try
> a userspace rdpmc. I think that the answer is that, for a perf event
> watching a pid, only that pid can do it (in particular, other threads
> must not try). For a perf event monitoring a whole cpu, the answer is
> less clear to me.

This all was really only meant to be used for self-monitoring, so where
an event is attached to the very same task; anything else and I'm fine
with disabling it.

Peter Zijlstra

Oct 20, 2014, 4:50:02 AM
On Sun, Oct 19, 2014 at 05:08:08PM -0700, Andy Lutomirski wrote:
> Before I totally screw this up: is .event_idx used for anything except
> userspace rdpmc?

It should only be used for that.

> There are a whole bunch of implementations of that
> callback:
>
> - perf_event_idx_default seems fishy

I suppose I did that to encode the rule that 0 := disabled, figuring
that if there is no actual instruction to access the data, all of this
would be pointless anyhow.

> - power_pmu_event_idx seems even fishier

Agreed, I preserved behaviour in 35edc2a5095e ("perf, arch: Rework
perf_event_index()") and at that time asked about why this is. Nobody
replied, lets try again.

Michael, Anton?

> - cpumsf_pmu_event_idx is the same as power_pmu_event_idx.

Oh cute, lets ask the s390 people, do you guys have a userspace
instruction to read the actual counter value?

> - perf_swevent_event_idx returns 0.

Right, guaranteed no way to access any of that, maybe we should make
the default like that too.

> etc.
>
> x86 is the only implementation of arch_perf_update_userpage, which
> makes me think that the .event_idx callback should just be removed and
> that arch_perf_update_userpage should be responsible for filling it in
> if needed.

Its a per pmu thing, with the x86 hardware pmu being the only one that's
actually 'known' working to me.

All the other x86 pmus like the software, uncore, etc. don't support
this.

That said, there's definitely room for improvement here.

Martin Schwidefsky

unread,
Oct 20, 2014, 5:30:08 AM10/20/14
to
On Mon, 20 Oct 2014 10:48:13 +0200
Peter Zijlstra <pet...@infradead.org> wrote:

> > - cpumsf_pmu_event_idx is the same as power_pmu_event_idx.
>
> Oh cute, lets ask the s390 people, do you guys have a userspace
> instruction to read the actual counter value?

The "extract cpu counter" ECCTR instruction can be authorized to
be used in user-space with a bit in CR0. With the current code
this bit is not set, the cpu measurement facility is not usable
in user space without a patch to the kernel.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

Hendrik Brueckner

unread,
Oct 20, 2014, 7:00:03 AM10/20/14
to
On Mon, Oct 20, 2014 at 10:48:13AM +0200, Peter Zijlstra wrote:
> On Sun, Oct 19, 2014 at 05:08:08PM -0700, Andy Lutomirski wrote:
> > There are a whole bunch of implementations of that
> > callback:
> >
> > - perf_event_idx_default seems fishy
>
> I suppose I did that to encode the rule that 0 := disabled, figuring
> that if there is no actual instruction to access the data, all of this
> would be pointless anyhow.

See comment on "perf_swevent_event_idx returns 0".

>
> > - cpumsf_pmu_event_idx is the same as power_pmu_event_idx.
>
> Oh cute, lets ask the s390 people, do you guys have a userspace
> instruction to read the actual counter value?

For the hardware sampling pmu (cpumsf) there is no instruction to read
sampling data in user space. Also the event->hw.index is always set
to zero anyway. I am going to remove this function and let the core
event code handle the proper default.

As Martin Schwidefsky pointed out in his mail, there is an instruction
to read some hardware counter data (different from the sampling data).
User space is prohibited from using this instruction. Instead, user
space can use another approach, called runtime instrumentation, for
self-monitoring.

>
> > - perf_swevent_event_idx returns 0.
>
> Right, guaranteed no way to access any of that, maybe we should make
> the default like that too.

I think it would make sense to return 0 as the default in
perf_event_idx_default() and let each PMU/arch that actually supports
reading PMCs from user space return the proper index. And according
to tools/perf/design.txt, the index must be non-zero to trigger a user
space read.

Thanks and kind regards,
Hendrik

Andy Lutomirski

unread,
Oct 20, 2014, 12:50:06 PM10/20/14
to
On Mon, Oct 20, 2014 at 1:33 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Sun, Oct 19, 2014 at 03:57:54PM -0700, Andy Lutomirski wrote:
>> > Maybe, but at that point we commit to yet another ABI... I'd rather just
>> > put a 'sane' implementation in a library or so.
>>
>> This cuts both ways, though. For vdso timekeeping, the underlying
>> data structure has changed repeatedly, sometimes to add features, and
>> sometimes for performance, and the vdso has done a good job insulating
>> userspace from it. (In fact, until 3.16, even the same exact kernel
>> version couldn't be relied on to have the same data structure with
>> different configs, and even now, no one really wants to teach user
>> libraries how to parse the pvclock data structures.)
>
> Fair enough, but as it stands we've already committed to the data
> structure exposed to userspace.

True.

OTOH, if a vdso function gets added, a few releases go by, and all the
userspace tools get updated, then the old data structure could be
dropped if needed by clearing cap_user_rdpmc.

Anyway, this is so far out of scope for the current project that I'm
going to ignore it.

>> FWIW, something should probably specify exactly when it's safe to try
>> a userspace rdpmc. I think that the answer is that, for a perf event
>> watching a pid, only that pid can do it (in particular, other threads
>> must not try). For a perf event monitoring a whole cpu, the answer is
>> less clear to me.
>
> This all was really only meant to be used for self-monitoring, i.e. where
> an event is attached to the very same task; anything else and I'm fine
> with disabling it.

Actually implementing this might be a touch awkward. I can check
whether an event has a task context that matches the creating task,
but that's not necessarily the same thing as the task that mmaps it.

--Andy

Andy Lutomirski

unread,
Oct 20, 2014, 1:50:02 PM10/20/14
to
On Mon, Oct 20, 2014 at 9:49 AM, Andy Lutomirski <lu...@amacapital.net> wrote:
> On Mon, Oct 20, 2014 at 1:33 AM, Peter Zijlstra <pet...@infradead.org> wrote:
>> On Sun, Oct 19, 2014 at 03:57:54PM -0700, Andy Lutomirski wrote:
>>> > Maybe, but at that point we commit to yet another ABI... I'd rather just
>>> > put a 'sane' implementation in a library or so.
>>>
>>> This cuts both ways, though. For vdso timekeeping, the underlying
>>> data structure has changed repeatedly, sometimes to add features, and
>>> sometimes for performance, and the vdso has done a good job insulating
>>> userspace from it. (In fact, until 3.16, even the same exact kernel
>>> version couldn't be relied on to have the same data structure with
>>> different configs, and even now, no one really wants to teach user
>>> libraries how to parse the pvclock data structures.)
>>
>> Fair enough, but as it stands we've already committed to the data
>> structure exposed to userspace.
>
> True.
>
> OTOH, if a vdso function gets added, a few releases go by, and all the
> userspace tools get updated, then the old data structure could be
> dropped if needed by clearing cap_user_rdpmc.
>
> Anyway, this is so far out of scope for the current project that I'm
> going to ignore it.

OK, I lied.

I haven't tested it, but it looks like any existing users of
cap_user_rdpmc may have serious issues. That flag is set to 1 for
essentially all perf_events on x86, even events that aren't part of
the x86_pmu. Since the default .event_idx callback doesn't return
zero, lots of other events will appear to be rdpmcable. This includes
the AMD uncore pmu, which looks like it actually supports rdpmc, but
hwc->idx seems to be missing an offset.

If this is the case, then user code can't reliably use the userpage
rdpmc mechanism, so maybe it should be deprecated (or at least get a
new flag bit).
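
For what it's worth, the check user code is expected to make before trying
rdpmc is essentially just this (a minimal sketch against the mmap page
fields; can_rdpmc is a made-up name):

#include <linux/perf_event.h>

/* Is a userspace rdpmc read of this event supposed to work right now? */
static int can_rdpmc(const struct perf_event_mmap_page *pc)
{
	/* index == 0 means no hardware counter is currently assigned */
	return pc->cap_user_rdpmc && pc->index != 0;
}

so a pmu that reports cap_user_rdpmc plus a non-zero index, but whose
counters can't actually be read with rdpmc, silently feeds such code
garbage.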

Vince Weaver

unread,
Oct 21, 2014, 12:20:03 AM10/21/14
to
On Tue, 14 Oct 2014, Andy Lutomirski wrote:

> This little series tightens up rdpmc permissions. With it applied,
> rdpmc can only be used if a perf_event is actually mmapped. For now,
> this is only really useful for seccomp.

So just to be difficult...

I am aware of at least one group who is doing low-latency performance
measures using rdpmc on Linux.

They start the counters manually by poking the MSRs directly (bypassing
perf_event_open()).

They use rdpmc, grateful for the fact that currently CR4 is set up so they
can do this w/o patching the kernel.

These patches of course would break this use case...

Vince

Andy Lutomirski

unread,
Oct 21, 2014, 12:30:02 AM10/21/14
to
On Mon, Oct 20, 2014 at 9:06 PM, Vince Weaver <vi...@deater.net> wrote:
> On Tue, 14 Oct 2014, Andy Lutomirski wrote:
>
>> This little series tightens up rdpmc permissions. With it applied,
>> rdpmc can only be used if a perf_event is actually mmapped. For now,
>> this is only really useful for seccomp.
>
> So just to be difficult...
>
> I am aware of at least one group who is doing low-latency performance
> measures using rdpmc on Linux.
>
> They start the counters manually by poking the MSRs directly (bypassing
> perf_event_open()).
>
> They use rdpmc, grateful for the fact that currently CR4 is set up so they
> can do this w/o patching the kernel.
>
> These patches of course would break this use case...

ISTM it would be a lot better to use the perf subsystem for this. You
can probably pin an event to a pmu. I don't know whether you can pin
an event systemwide. I also don't know whether pinning an event will
prevent perf from changing its value periodically, and it certainly
doesn't magically make wraparound issues go away.

It would be easy to add a new setting for rdpmc to deal with the PCE part.

E.g. rdpmc = 2 could allow all tasks to use rdpmc regardless of
whether they have an event mapped. (And I would have to re-add the
static key. Sigh.)

--Andy

P.S. Hey, Intel, you forgot to add rdpmcp :-/

Borislav Petkov

unread,
Oct 21, 2014, 1:50:01 AM10/21/14
to
On Thu, Oct 16, 2014 at 09:21:42AM -0700, Andy Lutomirski wrote:
> I think it's the same as in the other case in switch_mm. leave_mm does
> cpumask_clear_cpu(cpu, mm_cpumask(active_mm)), and, once that has
> happened, modify_ldt won't send an IPI to this CPU. So, if leave_mm
> runs, and then another CPU calls modify_ldt on the mm that is in lazy
> mode here, it won't update our LDT register, so the LDT register and
> prev->context.ldt might not match.

Ok, let me see if I can follow with an example:

We call leave_mm() on, say, cpu 3 and mm_cpumask(active_mm) has cpu 3 and
4 set. Then, on cpu 4 we call modify_ldt on that same mm and there in
alloc_ldt() we have this:

if (!cpumask_equal(mm_cpumask(current->mm),
cpumask_of(smp_processor_id())))
smp_call_function(flush_ldt, current->mm, 1);

and since we've cleared cpu 3 from the cpumask, we don't flush_ldt()
on it and there you have the difference.

Am I close?

Thanks.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

Andy Lutomirski

unread,
Oct 21, 2014, 1:50:02 AM10/21/14
to
On Mon, Oct 20, 2014 at 10:41 PM, Borislav Petkov <b...@alien8.de> wrote:
> On Thu, Oct 16, 2014 at 09:21:42AM -0700, Andy Lutomirski wrote:
>> I think it's the same as in the other case in switch_mm. leave_mm does
>> cpumask_clear_cpu(cpu, mm_cpumask(active_mm)), and, once that has
>> happened, modify_ldt won't send an IPI to this CPU. So, if leave_mm
>> runs, and then another CPU calls modify_ldt on the mm that is in lazy
>> mode here, it won't update our LDT register, so the LDT register and
>> prev->context.ldt might not match.
>
> Ok, let me see if I can follow with an example:
>
> We call leave_mm() on, say, cpu 3 and mm_cpumask(active_mm) has cpu 3 and
> 4 set. Then, on cpu 4 we call modify_ldt on that same mm and there in
> alloc_ldt() we have this:
>
> if (!cpumask_equal(mm_cpumask(current->mm),
> cpumask_of(smp_processor_id())))
> smp_call_function(flush_ldt, current->mm, 1);
>
> and since we've cleared cpu 3 from the cpumask, we don't flush_ldt()
> on it and there you have the difference.
>
> Am I close?

You're exactly correct, or at least you seem to understand it the way I do :)

--Andy

Borislav Petkov

unread,
Oct 21, 2014, 2:10:01 AM10/21/14
to
On Mon, Oct 20, 2014 at 10:44:18PM -0700, Andy Lutomirski wrote:
> You're exactly correct, or at least you seem to understand it the way I do :)

Ok, cool.

Now, if I had more time, I'd take a guest and add some debugging
code to see when exactly that happens and how prev->context.ldt and
next->context.ldt look... Maybe later. :-)

Thanks.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

Peter Zijlstra

unread,
Oct 21, 2014, 5:00:02 AM10/21/14
to
Seeing how we've not actually had bugreports on this, I suspect we can
still fix that.

Peter Zijlstra

unread,
Oct 21, 2014, 5:20:02 AM10/21/14
to
On Mon, Oct 20, 2014 at 12:51:10PM +0200, Hendrik Brueckner wrote:
> I think it would make sense to return 0 as the default in
> perf_event_idx_default() and let each PMU/arch that actually supports
> reading PMCs from user space return the proper index. And according
> to tools/perf/design.txt, index must be non-zero to trigger a user space
> read.

OK, I've got something like the below. Michael/Anton, would you please
clarify the ppc book3s capabilities?

---
Subject: perf: Clean up pmu::event_idx
From: Peter Zijlstra <pet...@infradead.org>
Date: Tue Oct 21 11:10:21 CEST 2014

Andy reported that the current state of event_idx is rather confused.
So remove all but the x86_pmu implementation and change the default to
return 0 (the safe option).

Reported-by: Andy Lutomirski <lu...@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
arch/powerpc/perf/hv-24x7.c | 6 ------
arch/powerpc/perf/hv-gpci.c | 6 ------
arch/s390/kernel/perf_cpum_sf.c | 6 ------
kernel/events/core.c | 15 +--------------
kernel/events/hw_breakpoint.c | 7 -------
5 files changed, 1 insertion(+), 39 deletions(-)

--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -417,11 +417,6 @@ static int h_24x7_event_add(struct perf_
return 0;
}

-static int h_24x7_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu h_24x7_pmu = {
.task_ctx_nr = perf_invalid_context,

@@ -433,7 +428,6 @@ static struct pmu h_24x7_pmu = {
.start = h_24x7_event_start,
.stop = h_24x7_event_stop,
.read = h_24x7_event_update,
- .event_idx = h_24x7_event_idx,
};

static int hv_24x7_init(void)
--- a/arch/powerpc/perf/hv-gpci.c
+++ b/arch/powerpc/perf/hv-gpci.c
@@ -246,11 +246,6 @@ static int h_gpci_event_init(struct perf
return 0;
}

-static int h_gpci_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu h_gpci_pmu = {
.task_ctx_nr = perf_invalid_context,

@@ -262,7 +257,6 @@ static struct pmu h_gpci_pmu = {
.start = h_gpci_event_start,
.stop = h_gpci_event_stop,
.read = h_gpci_event_update,
- .event_idx = h_gpci_event_idx,
};

static int hv_gpci_init(void)
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1411,11 +1411,6 @@ static void cpumsf_pmu_del(struct perf_e
perf_pmu_enable(event->pmu);
}

-static int cpumsf_pmu_event_idx(struct perf_event *event)
-{
- return event->hw.idx;
-}
-
CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF);
CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG);

@@ -1458,7 +1453,6 @@ static struct pmu cpumf_sampling = {
.stop = cpumsf_pmu_stop,
.read = cpumsf_pmu_read,

- .event_idx = cpumsf_pmu_event_idx,
.attr_groups = cpumsf_pmu_attr_groups,
};

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6071,11 +6071,6 @@ static int perf_swevent_init(struct perf
return 0;
}

-static int perf_swevent_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,

@@ -6085,8 +6080,6 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};

#ifdef CONFIG_EVENT_TRACING
@@ -6204,8 +6197,6 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};

static inline void perf_tp_register(void)
@@ -6431,8 +6422,6 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};

/*
@@ -6511,8 +6500,6 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};

static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6542,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct p

static int perf_event_idx_default(struct perf_event *event)
{
- return event->hw.idx + 1;
+ return 0;
}

/*
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct pe
bp->hw.state = PERF_HES_STOPPED;
}

-static int hw_breakpoint_event_idx(struct perf_event *bp)
-{
- return 0;
-}
-
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */

@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
-
- .event_idx = hw_breakpoint_event_idx,
};

int __init init_hw_breakpoint(void)

Vince Weaver

unread,
Oct 21, 2014, 11:00:03 AM10/21/14
to
On Mon, 20 Oct 2014, Andy Lutomirski wrote:

> ISTM it would be a lot better to use the perf subsystem for this. You
> can probably pin an event to a pmu.

No, you cannot pin an event to a counter with perf_event.
That's one of the big differences between perf_event and, say, perfmon2.

With perf_event the kernel controls which events go in which counters and
the user has no say. That's part of why you need to check the mmap page
every time you want to use rdpmc because there's no other way of knowing
which counter to read to get the event you want.

perf_event is also fairly high overhead for setting up and starting
events, and mildly high overhead when doing a proper rdpmc call (due to
the required look at the mmap page, and the fact that you need to do two
rdpmc calls, before and after, to get your value). This is why people
really worried about low-latency measurements bypass as much of
perf_event as possible.
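
To make that concrete, the self-monitoring read sequence looks roughly
like the following (an untested sketch modelled on the example documented
with the perf mmap page; the rdpmc wrapper and read_self are made-up
names, and time scaling is left out):

#include <stdint.h>
#include <linux/perf_event.h>

static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return (uint64_t)hi << 32 | lo;
}

static uint64_t read_self(struct perf_event_mmap_page *pc)
{
	uint32_t seq, idx;
	int64_t pmc;
	uint64_t count;

	do {
		seq = pc->lock;
		__sync_synchronize();

		idx = pc->index;	/* 1-based counter index; 0 == none */
		count = pc->offset;	/* kernel-maintained base value */
		if (pc->cap_user_rdpmc && idx) {
			pmc = rdpmc(idx - 1);
			pmc <<= 64 - pc->pmc_width;	/* sign-extend */
			pmc >>= 64 - pc->pmc_width;
			count += pmc;
		}

		__sync_synchronize();
	} while (pc->lock != seq);	/* retry if the kernel updated the page */

	return count;
}

Two such reads around the measured region, subtracted, give the delta; the
mmap-page check is what ties the rdpmc index back to the right event.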

Vince

Andy Lutomirski

unread,
Oct 21, 2014, 12:00:03 PM10/21/14
to
On Tue, Oct 21, 2014 at 2:14 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Mon, Oct 20, 2014 at 12:51:10PM +0200, Hendrik Brueckner wrote:
>> I think it would make sense to return 0 as the default in
>> perf_event_idx_default() and let each PMU/arch that actually supports
>> reading PMCs from user space return the proper index. And according
>> to tools/perf/design.txt, index must be non-zero to trigger a user space
>> read.
>
> OK, I've got something like the below. Michael/Anton, would you please
> clarify the ppc book3s capabilities?
>
> ---
> Subject: perf: Clean up pmu::event_idx
> From: Peter Zijlstra <pet...@infradead.org>
> Date: Tue Oct 21 11:10:21 CEST 2014
>
> Andy reported that the current state of event_idx is rather confused.
> So remove all but the x86_pmu implementation and change the default to
> return 0 (the safe option).

I wrote essentially the same patch yesterday, so looks good to me :)
Andy Lutomirski
AMA Capital Management, LLC

Peter Zijlstra

unread,
Oct 21, 2014, 12:10:01 PM10/21/14
to
On Tue, Oct 21, 2014 at 11:00:26AM -0400, Vince Weaver wrote:
> On Mon, 20 Oct 2014, Andy Lutomirski wrote:
>
> > ISTM it would be a lot better to use the perf subsystem for this. You
> > can probably pin an event to a pmu.
>
> No, you cannot pin an event to a counter with perf_event.
> That's one of the big differences between perf_event and, say, perfmon2.
>
> With perf_event the kernel controls which events go in which counters and
> the user has no say. That's part of why you need to check the mmap page
> every time you want to use rdpmc because there's no other way of knowing
> which counter to read to get the event you want.
>
> perf_event is also fairly high overhead for setting up and starting
> events,

Which you only do once at the start, so is that really a problem?

> and mildly high overhead when doing a proper rdpmc call (due to
> the required looking at mmap, and the fact that you need to do two rdpmc
> calls before/after to get your value). This is why people really worried
> about low-latency measurements bypass as much of perf_event as possible.

I still don't get that argument, 2 rdpmc's is cheaper than doing wrmsr,
not to mention doing wrmsr through a syscall. And looking at that mmap
page is 1 cacheline. Is that cacheline read (assuming you miss) the real
problem?

Vince Weaver

unread,
Oct 21, 2014, 1:10:03 PM10/21/14
to
On Tue, 21 Oct 2014, Peter Zijlstra wrote:

> > perf_event is also fairly high overhead for setting up and starting
> > events,
>
> Which you only do once at the start, so is that really a problem?

There are various reasons why you might want to start events at times
other than the beginning of the program. Some people don't like kernel
multiplexing so they start/stop manually if they want to switch eventsets.

But no, I suppose you could ask anyone wanting to use rdpmc to open some
sort of dummy event at startup just to get cr4 enabled.

> I still don't get that argument, 2 rdpmc's is cheaper than doing wrmsr,
> not to mention doing wrmsr through a syscall. And looking at that mmap
> page is 1 cacheline. Is that cacheline read (assuming you miss) the real
> problem?

Well at least by default the first read of the mmap page causes a
pagefault which adds a few thousand cycles of latency. Though you can
somewhat get around this by prefaulting it in at some point.

Anyway I'm just reporting numbers I get when measuring the overhead of
the old perfctr interface vs perf_event on typical PAPI workloads. It's
true you can re-arrange calls and such so that perf_event behaves better
but that involves redoing a lot of existing code.

I do appreciate the trouble you've gone through keeping self-monitoring
working considering the fact that I'm the only user admitting to using it.

Adding perf_event rdpmc support to PAPI has been stalled for a while due
to various reasons. So that's why I haven't been finding the various bugs
that have been turning up. The PAPI perf_event component really needs a
complete from-scratch re-write, but that's made tricky because we have to
be backwards compatible and work around all the pre-2.6.36 perf_event bugs.
You wouldn't think anyone would care, but the most vocal users are all
RHEL 6 users running the monstrosity of a 2.6.32 kernel that is patched
full of all kinds of crazy back-ported perf_event patches, and that is
always breaking PAPI in fun and exciting ways.

Vince

Peter Zijlstra

unread,
Oct 23, 2014, 7:50:06 AM10/23/14
to
On Tue, Oct 21, 2014 at 01:05:49PM -0400, Vince Weaver wrote:
> On Tue, 21 Oct 2014, Peter Zijlstra wrote:
>
> > > perf_event is also fairly high overhead for setting up and starting
> > > events,
> >
> > Which you only do once at the start, so is that really a problem?
>
> There are various reasons why you might want to start events at times
> other than the beginning of the program. Some people don't like kernel
> multiplexing so they start/stop manually if they want to switch eventsets.

I suppose you could pre-create all events and use ioctl()s to start/stop
them where/when desired; this should be faster, I think. But yes, this is
not a use-case I've thought much about.

> But no, I suppose you could ask anyone wanting to use rdpmc to open some
> sort of dummy event at startup just to get cr4 enabled.

That's one work-around :-)

> > I still don't get that argument, 2 rdpmc's is cheaper than doing wrmsr,
> > not to mention doing wrmsr through a syscall. And looking at that mmap
> > page is 1 cacheline. Is that cacheline read (assuming you miss) the real
> > problem?
>
> Well at least by default the first read of the mmap page causes a
> pagefault which adds a few thousand cycles of latency. Though you can
> somewhat get around this by prefaulting it in at some point.

MAP_POPULATE is your friend there, but yes manually prefaulting is
perfectly fine too, and the HPC people are quite familiar with the
concept, they do it for a lot of things.
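
A minimal sketch of the two options (MAP_POPULATE at mmap time versus a
single manual touch), assuming fd is a perf_event_open() descriptor and
with map_counter_page being a made-up name:

#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/perf_event.h>

static struct perf_event_mmap_page *map_counter_page(int fd)
{
	long page = sysconf(_SC_PAGESIZE);
	struct perf_event_mmap_page *pc;

	/* Option 1: ask the kernel to fault the page in at mmap time. */
	pc = mmap(NULL, page, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
	if (pc == MAP_FAILED)
		return NULL;

	/* Option 2: touch the page once yourself, so the fault (if any)
	 * is taken here and not inside a measured region. */
	(void)*(volatile char *)pc;

	return pc;
}

Either way the point is the same: take the fault before the
latency-sensitive section, not inside it.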

> Anyway I'm just reporting numbers I get when measuring the overhead of
> the old perfctr interface vs perf_event on typical PAPI workloads. It's
> true you can re-arrange calls and such so that perf_event behaves better
> but that involves redoing a lot of existing code.

OK, agreed, having to change existing code is often subject to various
forms of inertia/resistance. And yes, I cannot deny that some of perf's
features come at the expense of various overheads, however hard
we're trying to keep costs down.

> I do appreciate the trouble you've gone through keeping self-monitoring
> working considering the fact that I'm the only user admitting to using it.

I have some code somewhere that uses it too, I've tried pushing it off
to other people but so far there are no takers :-)

Vince Weaver

unread,
Oct 24, 2014, 8:50:10 AM10/24/14
to
On Thu, 23 Oct 2014, Peter Zijlstra wrote:

> On Tue, Oct 21, 2014 at 01:05:49PM -0400, Vince Weaver wrote:

> > There are various reasons why you might want to start events at times
> > other than the beginning of the program. Some people don't like kernel
> > multiplexing so they start/stop manually if they want to switch eventsets.
>
> I suppose you could pre-create all events and use ioctl()s to start/stop
> them where/when desired, this should be faster I think. But yes, this is
> not a use-case I've thought much about.

The scheduling step is most of what makes the perf_event start call have
high overhead. The other annoyance is the fact that, due to the NMI
watchdog, you can successfully perf_event_open() an event group but still
have it fail at start time, so a lot of code has to do extraneous
open/start/close calls to make sure the events really fit.

> MAP_POPULATE is your friend there, but yes manually prefaulting is
> perfectly fine too, and the HPC people are quite familiar with the
> concept, they do it for a lot of things.

MAP_POPULATE actually has noticeably more overhead than manually
prefaulting. It's on my todo list to drop ftrace on there and find out
why, but I've been stuck chasing kernel-crashing fuzzer bugs instead in my
spare time.

perfctr and possibly perfmon2 would automatically pre-fault the mmap page
for you in the kernel, so there was no need for the user to do it.


In any case I wasn't really trying to make trouble here; it's just that I
came across people using rdpmc w/o perf_event just the other day (on USENET
of all places). They were so happy it worked w/o patches now that I felt
bad breaking it to them that there were patches floating around that were
going to make their use case not work anymore.

I guess like all things though, you can't have anything fun and useful in
the kernel without the security people taking it away.

Vince

Andy Lutomirski

unread,
Oct 24, 2014, 6:20:05 PM10/24/14
to
On Fri, Oct 24, 2014 at 5:41 AM, Vince Weaver <vi...@deater.net> wrote:
> On Thu, 23 Oct 2014, Peter Zijlstra wrote:
>
>> On Tue, Oct 21, 2014 at 01:05:49PM -0400, Vince Weaver wrote:
>
>> > There are various reasons why you might want to start events at times
>> > other than the beginning of the program. Some people don't like kernel
>> > multiplexing so they start/stop manually if they want to switch eventsets.
>>
>> I suppose you could pre-create all events and use ioctl()s to start/stop
>> them where/when desired, this should be faster I think. But yes, this is
>> not a use-case I've though much about.
>
> The scheduling step is most of what makes the perf_event start call have
> high overhead. The other annoyance is the fact that due to the NMI
> watchdog you can successfully perf_event_open() an event group but still
> have it fail at start time, so a lot of code has to be done that does
> extraneous open/start/close calls to make sure the events really fit.
>
>> MAP_POPULATE is your friend there, but yes manually prefaulting is
>> perfectly fine too, and the HPC people are quite familiar with the
>> concept, they do it for a lot of things.
>
> MAP_POPULATE actually has noticeably more overhead than manually
> prefaulting. It's on my todo list to drop ftrace on there and find out
> why, but I've been stuck chasing kernel-crashing fuzzer bugs instead in my
> spare time.

Have you checked recently? IIRC Michael Lespinasse put some effort
into improving MAP_POPULATE recently.

>
> perfctr and possibly perfmon2 would automatically pre-fault the mmap page
> for you in the kernel, so there was no need for the user to do it.
>
>
> In any case I wasn't really trying to make trouble here, it's just I came
> across the people using rdpmc w/o perf_event just the other day (on USENET
> of all places). They were so happy it worked w/o patches now, that I felt
> bad breaking it to them that there were patches floating around that were
> going to make their usecase not work anymore.
>
> I guess like all things though, you can't have anything fun and useful in
> the kernel without the security people taking it away.

I'm sympathetic enough to this use case that I don't really mind
adding a bit of code to support rdpmc=2 meaning that rdpmc is always
allowed. Switching in and out of rdpmc=2 mode will be expensive
(static key and IPI to all CPUs). PeterZ, is that OK with you?

--Andy

>
> Vince



--
Andy Lutomirski
AMA Capital Management, LLC

Andy Lutomirski

unread,
Oct 24, 2014, 7:00:06 PM10/24/14
to
This little series tightens up rdpmc permissions. With it applied,
rdpmc can only be used if a perf_event is actually mmapped. For now,
this is only really useful for seccomp.

At some point this could be further tightened up to only allow rdpmc
if an actual self-monitoring perf event that is compatible with
rdpmc is mapped.

This should add <50ns to context switches between rdpmc-capable and
rdpmc-non-capable mms. I suspect that this is well under 10%
overhead, given that perf already adds some context switch latency.

I think that patches 1-3 are a good idea regardless of any rdpmc changes.

AMD Uncore userspace rdpmc is broken by these patches (cap_user_rdpmc
will be zero), but it was broken anyway.

Changes from v1 (aka RFC):
- Rebased on top of the KVM CR4 fix. This applies to a very recent -linus.
- Renamed the cr4 helpers (Peter, Borislav)
- Fixed buggy cr4 helpers (Hilf)
- Improved lots of comments (everyone)
- Renamed read_cr4 and write_cr4 to make sure I didn't miss anything.
(NB: This will introduce conflicts with Andi's FSGSBASE work. This is
a good thing.)

Andy Lutomirski (7):
x86: Clean up cr4 manipulation
x86: Store a per-cpu shadow copy of CR4
x86: Add a comment clarifying LDT context switching
perf: Add pmu callbacks to track event mapping and unmapping
perf: Pass the event to arch_perf_update_userpage
x86, perf: Only allow rdpmc if a perf_event is mapped
x86, perf: Add /sys/devices/cpu/rdpmc=2 to allow rdpmc for all tasks

Peter Zijlstra (1):
perf: Clean up pmu::event_idx

arch/powerpc/perf/hv-24x7.c | 6 ---
arch/powerpc/perf/hv-gpci.c | 6 ---
arch/s390/kernel/perf_cpum_sf.c | 6 ---
arch/x86/include/asm/mmu.h | 2 +
arch/x86/include/asm/mmu_context.h | 32 ++++++++++++++-
arch/x86/include/asm/paravirt.h | 6 +--
arch/x86/include/asm/processor.h | 33 ----------------
arch/x86/include/asm/special_insns.h | 6 +--
arch/x86/include/asm/tlbflush.h | 77 ++++++++++++++++++++++++++++++++----
arch/x86/include/asm/virtext.h | 5 ++-
arch/x86/kernel/acpi/sleep.c | 2 +-
arch/x86/kernel/cpu/common.c | 17 +++++---
arch/x86/kernel/cpu/mcheck/mce.c | 3 +-
arch/x86/kernel/cpu/mcheck/p5.c | 3 +-
arch/x86/kernel/cpu/mcheck/winchip.c | 3 +-
arch/x86/kernel/cpu/mtrr/cyrix.c | 6 +--
arch/x86/kernel/cpu/mtrr/generic.c | 6 +--
arch/x86/kernel/cpu/perf_event.c | 76 ++++++++++++++++++++++++++---------
arch/x86/kernel/cpu/perf_event.h | 2 +
arch/x86/kernel/head32.c | 1 +
arch/x86/kernel/head64.c | 2 +
arch/x86/kernel/i387.c | 3 +-
arch/x86/kernel/process.c | 5 ++-
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/setup.c | 2 +-
arch/x86/kernel/xsave.c | 3 +-
arch/x86/kvm/svm.c | 2 +-
arch/x86/kvm/vmx.c | 10 ++---
arch/x86/mm/fault.c | 2 +-
arch/x86/mm/init.c | 12 +++++-
arch/x86/mm/tlb.c | 3 --
arch/x86/power/cpu.c | 11 ++----
arch/x86/realmode/init.c | 2 +-
arch/x86/xen/enlighten.c | 4 +-
drivers/lguest/x86/core.c | 4 +-
include/linux/perf_event.h | 7 ++++
kernel/events/core.c | 29 ++++++--------
kernel/events/hw_breakpoint.c | 7 ----
39 files changed, 256 insertions(+), 154 deletions(-)

--
1.9.3

Andy Lutomirski

unread,
Oct 24, 2014, 7:00:06 PM10/24/14
to
Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
arch/x86/kernel/cpu/perf_event.c | 3 ++-
kernel/events/core.c | 5 +++--
2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2176647fdbb7..00fbab7aa587 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1923,7 +1923,8 @@ static struct pmu pmu = {
.flush_branch_stack = x86_pmu_flush_branch_stack,
};

-void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data *data;

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3b1ec69d189f..e366f98a94d5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3947,7 +3947,8 @@ unlock:
rcu_read_unlock();
}

-void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(
+ struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}

@@ -3997,7 +3998,7 @@ void perf_event_update_userpage(struct perf_event *event)
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);

- arch_perf_update_userpage(userpg, now);
+ arch_perf_update_userpage(event, userpg, now);

barrier();
++userpg->lock;

Andy Lutomirski

unread,
Oct 24, 2014, 7:00:06 PM10/24/14
to
From: Peter Zijlstra <pet...@infradead.org>

Andy reported that the current state of event_idx is rather confused.
So remove all but the x86_pmu implementation and change the default to
return 0 (the safe option).

Reported-by: Andy Lutomirski <lu...@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
arch/powerpc/perf/hv-24x7.c | 6 ------
arch/powerpc/perf/hv-gpci.c | 6 ------
arch/s390/kernel/perf_cpum_sf.c | 6 ------
kernel/events/core.c | 15 +--------------
kernel/events/hw_breakpoint.c | 7 -------
5 files changed, 1 insertion(+), 39 deletions(-)

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 6c8710dd90c9..dba34088da28 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -417,11 +417,6 @@ static int h_24x7_event_add(struct perf_event *event, int flags)
return 0;
}

-static int h_24x7_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu h_24x7_pmu = {
.task_ctx_nr = perf_invalid_context,

@@ -433,7 +428,6 @@ static struct pmu h_24x7_pmu = {
.start = h_24x7_event_start,
.stop = h_24x7_event_stop,
.read = h_24x7_event_update,
- .event_idx = h_24x7_event_idx,
};

static int hv_24x7_init(void)
diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c
index 15fc76c93022..a051fe946c63 100644
--- a/arch/powerpc/perf/hv-gpci.c
+++ b/arch/powerpc/perf/hv-gpci.c
@@ -246,11 +246,6 @@ static int h_gpci_event_init(struct perf_event *event)
return 0;
}

-static int h_gpci_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu h_gpci_pmu = {
.task_ctx_nr = perf_invalid_context,

@@ -262,7 +257,6 @@ static struct pmu h_gpci_pmu = {
.start = h_gpci_event_start,
.stop = h_gpci_event_stop,
.read = h_gpci_event_update,
- .event_idx = h_gpci_event_idx,
};

static int hv_gpci_init(void)
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 08e761318c17..b878f12a9597 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1411,11 +1411,6 @@ static void cpumsf_pmu_del(struct perf_event *event, int flags)
perf_pmu_enable(event->pmu);
}

-static int cpumsf_pmu_event_idx(struct perf_event *event)
-{
- return event->hw.idx;
-}
-
CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF);
CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG);

@@ -1458,7 +1453,6 @@ static struct pmu cpumf_sampling = {
.stop = cpumsf_pmu_stop,
.read = cpumsf_pmu_read,

- .event_idx = cpumsf_pmu_event_idx,
.attr_groups = cpumsf_pmu_attr_groups,
};

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1425d07018de..2b02c9fda790 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6071,11 +6071,6 @@ static int perf_swevent_init(struct perf_event *event)
@@ -6542,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)

static int perf_event_idx_default(struct perf_event *event)
{
- return event->hw.idx + 1;
+ return 0;
}

/*
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..9803a6600d49 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
bp->hw.state = PERF_HES_STOPPED;
}

-static int hw_breakpoint_event_idx(struct perf_event *bp)
-{
- return 0;
-}
-
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */

@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
-
- .event_idx = hw_breakpoint_event_idx,
};

int __init init_hw_breakpoint(void)

Andy Lutomirski

unread,
Oct 24, 2014, 7:00:07 PM10/24/14
to
Context switches and TLB flushes can change individual bits of CR4.
CR4 reads take several cycles, so store a shadow copy of CR4 in a
per-cpu variable.

To avoid wasting a cache line, I added the CR4 shadow to
cpu_tlbstate, which is already touched in switch_mm. The heaviest
users of the cr4 shadow will be switch_mm and __switch_to_xtra, and
__switch_to_xtra is called shortly after switch_mm during context
switch, so the cacheline is likely to be hot.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
arch/x86/include/asm/paravirt.h | 6 ++---
arch/x86/include/asm/special_insns.h | 6 ++---
arch/x86/include/asm/tlbflush.h | 52 +++++++++++++++++++++++++++---------
arch/x86/include/asm/virtext.h | 2 +-
arch/x86/kernel/acpi/sleep.c | 2 +-
arch/x86/kernel/cpu/common.c | 7 +++++
arch/x86/kernel/cpu/mtrr/cyrix.c | 6 ++---
arch/x86/kernel/cpu/mtrr/generic.c | 6 ++---
arch/x86/kernel/head32.c | 1 +
arch/x86/kernel/head64.c | 2 ++
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/setup.c | 2 +-
arch/x86/kvm/svm.c | 2 +-
arch/x86/kvm/vmx.c | 6 ++---
arch/x86/mm/fault.c | 2 +-
arch/x86/mm/init.c | 8 ++++++
arch/x86/mm/tlb.c | 3 ---
arch/x86/power/cpu.c | 11 +++-----
arch/x86/realmode/init.c | 2 +-
20 files changed, 84 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cd6e1610e29e..a9d76e02301b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}

-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}

-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index e820c080a4e9..6a4b00fafb00 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x)
native_write_cr3(x);
}

-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
{
return native_read_cr4();
}

-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
{
return native_read_cr4_safe();
}

-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index fc0c4bc356ce..cd791948b286 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,14 +15,37 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif

+struct tlb_state {
+#ifdef CONFIG_SMP
+ struct mm_struct *active_mm;
+ int state;
+#endif
+
+ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
+ unsigned long cr4;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+ this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
+}
+
/* Set in this cpu's CR4. */
static inline void cr4_set_bits(unsigned long mask)
{
unsigned long cr4;

- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if ((cr4 | mask) != cr4) {
+ cr4 |= mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ __write_cr4(cr4);
+ }
}

/* Clear in this cpu's CR4. */
@@ -30,9 +53,18 @@ static inline void cr4_clear_bits(unsigned long mask)
{
unsigned long cr4;

- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if ((cr4 & ~mask) != cr4) {
+ cr4 &= ~mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ __write_cr4(cr4);
+ }
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
}

/*
@@ -61,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
{
unsigned long cr4;

- cr4 = native_read_cr4();
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
@@ -221,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2

-struct tlb_state {
- struct mm_struct *active_mm;
- int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-
static inline void reset_lazy_tlbstate(void)
{
this_cpu_write(cpu_tlbstate.state, 0);
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index f41e19ca717b..cce9ee68e335 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -46,7 +46,7 @@ static inline void cpu_vmxoff(void)

static inline int cpu_vmx_enabled(void)
{
- return read_cr4() & X86_CR4_VMXE;
+ return __read_cr4() & X86_CR4_VMXE;
}

/** Disable VMX if it is enabled on the current CPU
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 31368207837c..d1daead5fcdd 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void)

header->pmode_cr0 = read_cr0();
if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = __read_cr4();
header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
}
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 847d62b78431..1b008f8e5219 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
#include <asm/vsyscall.h>
@@ -1302,6 +1303,12 @@ void cpu_init(void)
wait_for_master_cpu(cpu);

/*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ /*
* Load microcode on this cpu if a valid microcode is available.
* This is early microcode loading procedure.
*/
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9e451b0876b5..f8c81ba0b465 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -138,8 +138,8 @@ static void prepare_set(void)

/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}

/*
@@ -171,7 +171,7 @@ static void post_set(void)

/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
}

static void cyrix_set_arr(unsigned int reg, unsigned long base,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0e25a1bc5ab5..7d74f7b3c6ba 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock)

/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}

/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
@@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock)

/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
raw_spin_unlock(&set_atomicity_lock);
}

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b9836995..2911ef3a9f1c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)

asmlinkage __visible void __init i386_start_kernel(void)
{
+ cr4_init_shadow();
sanitize_boot_params(&boot_params);

/* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..3b241f0ca005 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -155,6 +155,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);

+ cr4_init_shadow();
+
/* Kill off the identity-map trampoline */
reset_early_page_tables();

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8f3ebfe710d0..603c4f99cb5a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4_safe();
+ cr4 = __read_cr4_safe();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3ed4a68d4013..7e2810252d88 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4();
+ cr4 = __read_cr4();

printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 235cfd39e0d7..2d7cab9adaf6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1176,7 +1176,7 @@ void __init setup_arch(char **cmdline_p)

if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
- mmu_cr4_features = read_cr4();
+ mmu_cr4_features = __read_cr4();
if (trampoline_cr4_features)
*trampoline_cr4_features = mmu_cr4_features;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 65510f624dfe..42c3a90281ee 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1581,7 +1581,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)

static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;

if (cr4 & X86_CR4_VMXE)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 93e2649fb786..de464ddcff20 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2738,7 +2738,7 @@ static int hardware_enable(void)
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;

- if (read_cr4() & X86_CR4_VMXE)
+ if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;

INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
@@ -4274,7 +4274,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */

/* Save the most likely value for this task's CR4 in the VMCS. */
- cr4 = read_cr4();
+ cr4 = cr4_read_shadow();
vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
vmx->host_state.vmcs_host_cr4 = cr4;

@@ -7546,7 +7546,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

- cr4 = read_cr4();
+ cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
vmx->host_state.vmcs_host_cr4 = cr4;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d973e61e450d..5239ac2e189a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -600,7 +600,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
if (pte && pte_present(*pte) && pte_exec(*pte) &&
(pgd_flags(*pgd) & _PAGE_USER) &&
- (read_cr4() & X86_CR4_SMEP))
+ (__read_cr4() & X86_CR4_SMEP))
printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
}

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73b37cd0d253..0a59a63bcdad 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -687,3 +687,11 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}

+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+ .active_mm = &init_mm,
+ .state = 0,
+#endif
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ee61c36d64f8..3250f2371aea 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,9 +14,6 @@
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
- = { &init_mm, 0, };
-
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6ec7910f59bf..3e32ed5648a0 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -105,11 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt)
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
-#ifdef CONFIG_X86_32
- ctxt->cr4 = read_cr4_safe();
-#else
-/* CONFIG_X86_64 */
- ctxt->cr4 = read_cr4();
+ ctxt->cr4 = __read_cr4_safe();
+#ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8();
#endif
ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
@@ -175,12 +172,12 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
/* cr4 was introduced in the Pentium CPU */
#ifdef CONFIG_X86_32
if (ctxt->cr4)
- write_cr4(ctxt->cr4);
+ __write_cr4(ctxt->cr4);
#else
/* CONFIG X86_64 */
wrmsrl(MSR_EFER, ctxt->efer);
write_cr8(ctxt->cr8);
- write_cr4(ctxt->cr4);
+ __write_cr4(ctxt->cr4);
#endif
write_cr3(ctxt->cr3);
write_cr2(ctxt->cr2);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index bad628a620c4..0b7a63d98440 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -81,7 +81,7 @@ void __init setup_real_mode(void)

trampoline_header->start = (u64) secondary_startup_64;
trampoline_cr4_features = &trampoline_header->cr4;
- *trampoline_cr4_features = read_cr4();
+ *trampoline_cr4_features = __read_cr4();

trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;

Andy Lutomirski

unread,
Oct 24, 2014, 7:00:06 PM10/24/14
to
Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
include/linux/perf_event.h | 7 +++++++
kernel/events/core.c | 9 +++++++++
2 files changed, 16 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 893a0d07986f..c93aff25a459 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -207,6 +207,13 @@ struct pmu {
*/
int (*event_init) (struct perf_event *event);

+ /*
+ * Notification that the event was mapped or unmapped. Called
+ * in the context of the mapping task.
+ */
+ void (*event_mapped) (struct perf_event *event); /*optional*/
+ void (*event_unmapped) (struct perf_event *event); /*optional*/
+
#define PERF_EF_START 0x01 /* start the counter when adding */
#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2b02c9fda790..3b1ec69d189f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4139,6 +4139,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)

atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
}

/*
@@ -4158,6 +4161,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);

+ if (event->pmu->event_unmapped)
+ event->pmu->event_unmapped(event);
+
atomic_dec(&rb->mmap_count);

if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4359,6 +4365,9 @@ unlock:
vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;

+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
+
return ret;

Andy Lutomirski

unread,
Oct 24, 2014, 7:00:07 PM10/24/14
to
While perfmon2 is a sufficiently evil library (it pokes MSRs
directly) that breaking it is fair game, it's still useful, so we
might as well try to support it. This allows users to write 2 to
/sys/devices/cpu/rdpmc to disable all rdpmc protection so that hacks
like perfmon2 can continue to work.

At some point, if perf_event becomes fast enough to replace
perfmon2, then this can go.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
arch/x86/include/asm/mmu_context.h | 5 ++++-
arch/x86/kernel/cpu/perf_event.c | 21 ++++++++++++++++++++-
2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index ccad8d616038..6133f0d02ea4 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -20,9 +20,12 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
#endif /* !CONFIG_PARAVIRT */

#ifdef CONFIG_PERF_EVENTS
+extern struct static_key rdpmc_always_available;
+
static inline void load_mm_cr4(struct mm_struct *mm)
{
- if (atomic_read(&mm->context.perf_rdpmc_allowed))
+ if (static_key_true(&rdpmc_always_available) ||
+ atomic_read(&mm->context.perf_rdpmc_allowed))
cr4_set_bits(X86_CR4_PCE);
else
cr4_clear_bits(X86_CR4_PCE);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3e875b3b30f2..31fc31e0cf9b 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -45,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};

+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1878,10 +1880,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
if (ret)
return ret;

+ if (val > 2)
+ return -EINVAL;
+
if (x86_pmu.attr_rdpmc_broken)
return -ENOTSUPP;

- x86_pmu.attr_rdpmc = !!val;
+ if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
+ /*
+ * Changing into or out of always available, aka
+ * perf-event-bypassing mode. This path is extremely slow,
+ * but only root can trigger it, so it's okay.
+ */
+ if (val == 2)
+ static_key_slow_inc(&rdpmc_always_available);
+ else
+ static_key_slow_dec(&rdpmc_always_available);
+ on_each_cpu(refresh_pce, NULL, 1);
+ }
+
+ x86_pmu.attr_rdpmc = val;
+
return count;

Andy Lutomirski

unread,
Oct 24, 2014, 7:10:06 PM10/24/14
to
CR4 manipulation was split, seemingly at random, between direct
(write_cr4) and using a helper (set/clear_in_cr4). Unfortunately,
the set_in_cr4 and clear_in_cr4 helpers also poke at the boot code,
which only a small subset of users actually wanted.

This patch replaces all cr4 access in functions that don't leave cr4
exactly the way they found it with new helpers cr4_set_bits,
cr4_clear_bits, and cr4_set_bits_and_update_boot.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
---
arch/x86/include/asm/processor.h | 33 --------------------------------
arch/x86/include/asm/tlbflush.h | 37 ++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/virtext.h | 3 ++-
arch/x86/kernel/cpu/common.c | 10 +++++-----
arch/x86/kernel/cpu/mcheck/mce.c | 3 ++-
arch/x86/kernel/cpu/mcheck/p5.c | 3 ++-
arch/x86/kernel/cpu/mcheck/winchip.c | 3 ++-
arch/x86/kernel/cpu/perf_event.c | 7 ++++---
arch/x86/kernel/i387.c | 3 ++-
arch/x86/kernel/process.c | 5 +++--
arch/x86/kernel/xsave.c | 3 ++-
arch/x86/kvm/vmx.c | 4 ++--
arch/x86/mm/init.c | 4 ++--
arch/x86/xen/enlighten.c | 4 ++--
drivers/lguest/x86/core.c | 4 ++--
15 files changed, 69 insertions(+), 57 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index eb71ec794732..ddd8d13a010f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -578,39 +578,6 @@ static inline void load_sp0(struct tss_struct *tss,
#define set_iopl_mask native_set_iopl_mask
#endif /* CONFIG_PARAVIRT */

-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-extern u32 *trampoline_cr4_features;
-
-static inline void set_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features |= mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
-}
-
-static inline void clear_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features &= ~mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
-}
-
typedef struct {
unsigned long seg;
} mm_segment_t;
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 04905bfc508b..fc0c4bc356ce 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,6 +15,43 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif

+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = read_cr4();
+ cr4 |= mask;
+ write_cr4(cr4);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = read_cr4();
+ cr4 &= ~mask;
+ write_cr4(cr4);
+}
+
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. This should only be used
+ * during boot on the boot cpu.
+ */
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
+
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set_bits(mask);
+}
+
static inline void __native_flush_tlb(void)
{
native_write_cr3(native_read_cr3());
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 5da71c27cc59..f41e19ca717b 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@

#include <asm/vmx.h>
#include <asm/svm.h>
+#include <asm/tlbflush.h>

/*
* VMX functions:
@@ -40,7 +41,7 @@ static inline int cpu_has_vmx(void)
static inline void cpu_vmxoff(void)
{
asm volatile (ASM_VMX_VMXOFF : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear_bits(X86_CR4_VMXE);
}

static inline int cpu_vmx_enabled(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b4f78c9ba19..847d62b78431 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -276,7 +276,7 @@ __setup("nosmep", setup_disable_smep);
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
- set_in_cr4(X86_CR4_SMEP);
+ cr4_set_bits(X86_CR4_SMEP);
}

static __init int setup_disable_smap(char *arg)
@@ -296,9 +296,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)

if (cpu_has(c, X86_FEATURE_SMAP)) {
#ifdef CONFIG_X86_SMAP
- set_in_cr4(X86_CR4_SMAP);
+ cr4_set_bits(X86_CR4_SMAP);
#else
- clear_in_cr4(X86_CR4_SMAP);
+ cr4_clear_bits(X86_CR4_SMAP);
#endif
}
}
@@ -1320,7 +1320,7 @@ void cpu_init(void)

pr_debug("Initializing CPU#%d\n", cpu);

- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

/*
* Initialize the per-CPU GDT with the boot GDT,
@@ -1401,7 +1401,7 @@ void cpu_init(void)
printk(KERN_INFO "Initializing CPU#%d\n", cpu);

if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

load_current_idt();
switch_to_new_gdt(cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 61a9668cebfd..617bc12ef575 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
#include <linux/export.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -1455,7 +1456,7 @@ static void __mcheck_cpu_init_generic(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | m_fl, &all_banks);

- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);

rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..30692ac88d1e 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,7 @@
#include <linux/smp.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -59,7 +60,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
"Intel old style machine check architecture supported.\n");

/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..590cc753ba8f 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,6 +7,7 @@
#include <linux/types.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -31,7 +32,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);

- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);

printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1b8299dd3d91..2176647fdbb7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,7 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
@@ -1336,7 +1337,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)

case CPU_STARTING:
if (x86_pmu.attr_rdpmc)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set_bits(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1842,9 +1843,9 @@ static void change_rdpmc(void *info)
bool enable = !!(unsigned long)info;

if (enable)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set_bits(X86_CR4_PCE);
else
- clear_in_cr4(X86_CR4_PCE);
+ cr4_clear_bits(X86_CR4_PCE);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a9a4229f6161..87727b03196d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,6 +13,7 @@
#include <asm/sigcontext.h>
#include <asm/processor.h>
#include <asm/math_emu.h>
+#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
@@ -180,7 +181,7 @@ void fpu_init(void)
if (cpu_has_xmm)
cr4_mask |= X86_CR4_OSXMMEXCPT;
if (cr4_mask)
- set_in_cr4(cr4_mask);
+ cr4_set_bits(cr4_mask);

cr0 = read_cr0();
cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e127ddaa2d5a..046e2d620bbe 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
+#include <asm/tlbflush.h>

/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -141,7 +142,7 @@ void flush_thread(void)

static void hard_disable_TSC(void)
{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
@@ -158,7 +159,7 @@ void disable_TSC(void)

static void hard_enable_TSC(void)
{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
+ cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 4c540c4719d8..9828fdff6d59 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
+#include <asm/tlbflush.h>
#include <asm/xcr.h>

/*
@@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void)
*/
static inline void xstate_enable(void)
{
- set_in_cr4(X86_CR4_OSXSAVE);
+ cr4_set_bits(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0acac81f198b..93e2649fb786 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2765,7 +2765,7 @@ static int hardware_enable(void)
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+ cr4_set_bits(X86_CR4_VMXE);

if (vmm_exclusive) {
kvm_cpu_vmxon(phys_addr);
@@ -2802,7 +2802,7 @@ static void hardware_disable(void)
vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear_bits(X86_CR4_VMXE);
}

static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 66dba36f2343..73b37cd0d253 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -144,11 +144,11 @@ static void __init probe_page_size_mask(void)

/* Enable PSE if available */
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);

/* Enable PGE if available */
if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 1a3f0445432a..03920ef71f9c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1483,10 +1483,10 @@ static void xen_pvh_set_cr_flags(int cpu)
* set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
*/
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);

if (cpu_has_pge)
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
}

/*
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 922a1acbf652..e1700ff370dd 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -452,9 +452,9 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
static void adjust_pge(void *on)
{
if (on)
- write_cr4(read_cr4() | X86_CR4_PGE);
+ cr4_set_bits(X86_CR4_PGE);
else
- write_cr4(read_cr4() & ~X86_CR4_PGE);
+ cr4_clear_bits(X86_CR4_PGE);
}

/*H:020

Peter Zijlstra

unread,
Oct 31, 2014, 11:10:06 AM10/31/14
to


This looks ok I suppose, although the x86 people should pass verdict on
the first few patches.

Andy Lutomirski

unread,
Oct 31, 2014, 1:20:06 PM10/31/14
to
On Fri, Oct 31, 2014 at 8:09 AM, Peter Zijlstra <pet...@infradead.org> wrote:
>
>
> This looks ok I suppose, although the x86 people should pass verdict on
> the first few patches.

Adding some KVM people, too. Patches 2 and 3 affect KVM, and that
code has been buggy in the recent past. They should mostly fix the
performance regression that the fix for CVE-2014-3690 caused.

For reference, the series is here:

http://thread.gmane.org/gmane.linux.kernel/1813271/focus=1818661

Patch 1 has already been applied.

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

Thomas Gleixner

unread,
Nov 1, 2014, 4:00:07 PM11/1/14
to
On Fri, 24 Oct 2014, Andy Lutomirski wrote:

I'm probably missing something.

Why is this tied to the mmap? If I just open a counter then I should
be able to read the counter from user space w/o mmapping something in
the first place.

Thanks,

tglx

Thomas Gleixner

unread,
Nov 1, 2014, 4:00:07 PM11/1/14
to
On Fri, 24 Oct 2014, Andy Lutomirski wrote:

> CR4 manipulation was split, seemingly at random, between direct
> (write_cr4) and using a helper (set/clear_in_cr4). Unfortunately,
> the set_in_cr4 and clear_in_cr4 helpers also poke at the boot code,
> which only a small subset of users actually wanted.
>
> This patch replaces all cr4 access in functions that don't leave cr4
> exactly the way they found it with new helpers cr4_set_bits,
> cr4_clear_bits, and cr4_set_bits_and_update_boot.
>
> Signed-off-by: Andy Lutomirski <lu...@amacapital.net>

Reviewed-by: Thomas Gleixner <tg...@linutronix.de>

Thomas Gleixner

unread,
Nov 1, 2014, 4:00:07 PM11/1/14
to
On Fri, 24 Oct 2014, Andy Lutomirski wrote:

> Context switches and TLB flushes can change individual bits of CR4.
> CR4 reads take several cycles, so store a shadow copy of CR4 in a
> per-cpu variable.
>
> To avoid wasting a cache line, I added the CR4 shadow to
> cpu_tlbstate, which is already touched in switch_mm. The heaviest
> users of the cr4 shadow will be switch_mm and __switch_to_xtra, and
> __switch_to_xtra is called shortly after switch_mm during context
> switch, so the cacheline is likely to be hot.
>
> Signed-off-by: Andy Lutomirski <lu...@amacapital.net>

Reviewed-by: Thomas Gleixner <tg...@linutronix.de>
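
Concretely, the shadow copy described in the quoted changelog routes all
CR4 updates through a per-cpu variable so that readers never touch the
register; a sketch of the reworked helpers (names follow the rest of the
series, the exact tlbflush.h hunk is not quoted in full in this thread):

/* Set bits in CR4, keeping the per-cpu shadow in sync (sketch). */
static inline void cr4_set_bits(unsigned long mask)
{
	unsigned long cr4 = this_cpu_read(cpu_tlbstate.cr4);

	if ((cr4 | mask) != cr4) {
		cr4 |= mask;
		this_cpu_write(cpu_tlbstate.cr4, cr4);
		__write_cr4(cr4);
	}
}

/* Clear bits in CR4, keeping the per-cpu shadow in sync (sketch). */
static inline void cr4_clear_bits(unsigned long mask)
{
	unsigned long cr4 = this_cpu_read(cpu_tlbstate.cr4);

	if ((cr4 & ~mask) != cr4) {
		cr4 &= ~mask;
		this_cpu_write(cpu_tlbstate.cr4, cr4);
		__write_cr4(cr4);
	}
}

/* Callers that only need the current value read the shadow, not the register. */
static inline unsigned long cr4_read_shadow(void)
{
	return this_cpu_read(cpu_tlbstate.cr4);
}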

Thomas Gleixner

unread,
Nov 1, 2014, 4:40:06 PM11/1/14
to
On Sat, 1 Nov 2014, Andy Lutomirski wrote:
> On Sat, Nov 1, 2014 at 12:59 PM, Thomas Gleixner <tg...@linutronix.de> wrote:
> > On Fri, 24 Oct 2014, Andy Lutomirski wrote:
> >
> > I'm probably missing something.
> >
> > Why is this tied to the mmap? If I just open a counter then I should
> > be able to read the counter from user space w/o mmapping something in
> > the first place.
>
> You can read it with read(2), which this patch shouldn't affect. If
> you want to read it with rdpmc, then you need to know what rdpmc index
> to use, and the API for that is to mmap the event, check the
> userpage's cap_user_rdpmc, and then rdpmc on ->idx - 1 (assuming that
> ->idx != 0). You can't safely make any assumptions about which rdpmc
> index it will be without explicitly checking, because perf reserves
> the right to change the index whenever it wants.

Got it. As I expected: I was missing something :)

> There's plenty of room to tighten up the restrictions further, but
> this is, I think, a decent first step, and it solves the problem of
> information leaking into seccomp sandboxes.

In which way?

Andy Lutomirski

unread,
Nov 1, 2014, 4:40:07 PM11/1/14
to
On Sat, Nov 1, 2014 at 12:59 PM, Thomas Gleixner <tg...@linutronix.de> wrote:
> On Fri, 24 Oct 2014, Andy Lutomirski wrote:
>
> I'm probably missing something.
>
> Why is this tied to the mmap? If I just open a counter then I should
> be able to read the counter from user space w/o mmapping something in
> the first place.

You can read it with read(2), which this patch shouldn't affect. If
you want to read it with rdpmc, then you need to know what rdpmc index
to use, and the API for that is to mmap the event, check the
userpage's cap_user_rdpmc, and then rdpmc on ->idx - 1 (assuming that
->idx != 0). You can't safely make any assumptions about which rdpmc
index it will be without explicitly checking, because perf reserves
the right to change the index whenever it wants.

There's plenty of room to tighten up the restrictions further, but
this is, I think, a decent first step, and it solves the problem of
information leaking into seccomp sandboxes.

Does that help? I tested this with the only reasonable test case I
could find, which was Andi Kleen's "self" program. It still worked.

--Andy
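
To make the sequence described above concrete, here is a minimal
userspace sketch (the userpage field is spelled "index"; a robust reader
would also use the userpage's lock/seq field to retry if the event is
rescheduled while being read):

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static unsigned long long rdpmc(unsigned int counter)
{
	unsigned int lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return lo | (unsigned long long)hi << 32;
}

int main(void)
{
	struct perf_event_attr attr;
	struct perf_event_mmap_page *pc;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.exclude_kernel = 1;

	/* pid = 0, cpu = -1: count this task on any CPU. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* Mapping the user page is what (with this series) turns on CR4.PCE. */
	pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
	if (pc == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	if (!pc->cap_user_rdpmc || !pc->index) {
		fprintf(stderr, "rdpmc not available for this event\n");
		return 1;
	}

	/* index is the hardware counter number + 1; offset completes the count. */
	printf("count = %llu\n", rdpmc(pc->index - 1) + pc->offset);
	return 0;
}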

Andy Lutomirski

unread,
Nov 1, 2014, 5:50:07 PM11/1/14
to
On Nov 1, 2014 1:39 PM, "Thomas Gleixner" <tg...@linutronix.de> wrote:
>
> On Sat, 1 Nov 2014, Andy Lutomirski wrote:
> > On Sat, Nov 1, 2014 at 12:59 PM, Thomas Gleixner <tg...@linutronix.de> wrote:
> > > On Fri, 24 Oct 2014, Andy Lutomirski wrote:
> > >
> > > I'm probably missing something.
> > >
> > > Why is this tied to the mmap? If I just open a counter then I should
> > > be able to read the counter from user space w/o mmapping something in
> > > the first place.
> >
> > You can read it with read(2), which this patch shouldn't affect. If
> > you want to read it with rdpmc, then you need to know what rdpmc index
> > to use, and the API for that is to mmap the event, check the
> > userpage's cap_user_rdpmc, and then rdpmc on ->idx - 1 (assuming that
> > ->idx != 0). You can't safely make any assumptions about which rdpmc
> > index it will be without explicitly checking, because perf reserves
> > the right to change the index whenever it wants.
>
> Got it. As I expected: I was missing something :)
>
> > There's plenty of room to tighten up the restrictions further, but
> > this is, I think, a decent first step, and it solves the problem of
> > information leaking into seccomp sandboxes.
>
> In which way?

All the performance counters were readable without using any syscalls.
That leaks hints as to which events are in use, and it possibly leaks
interesting side channel information. With this series applied, you
need to at least mmap an rdpmc-able event, which most seccomp sandboxes
won't allow.

Unfortunately, rdpmc access to counters can't be controlled
individually, so it's hard to do all that much better than this.

--Andy

Thomas Gleixner

unread,
Nov 1, 2014, 6:20:06 PM11/1/14
to
On Sat, 1 Nov 2014, Andy Lutomirski wrote:
> On Nov 1, 2014 1:39 PM, "Thomas Gleixner" <tg...@linutronix.de> wrote:
> > On Sat, 1 Nov 2014, Andy Lutomirski wrote:
> > > There's plenty of room to tighten up the restrictions further, but
> > > this is, I think, a decent first step, and it solves the problem of
> > > information leaking into seccomp sandboxes.
> >
> > In which way?
>
> All the performance counters were readable without using any syscalls.
> That leaks hints as to which events are in use, and it possibly leaks
> interesting side channel information. With this series applied, you
> need to at least mmap an rdpmc-able event, which most seccomp sandboxes
> won't allow.

Ok. So you are preventing the seccomp sandboxes from opening/mmapping a counter.

> Unfortunately, rdpmc access to counters can't be controlled
> individually, so it's hard to do all that much better than this.

Yeah, I know ...

Andy Lutomirski

unread,
Nov 2, 2014, 3:20:06 PM11/2/14
to
On Sat, Nov 1, 2014 at 3:10 PM, Thomas Gleixner <tg...@linutronix.de> wrote:
> On Sat, 1 Nov 2014, Andy Lutomirski wrote:
>> On Nov 1, 2014 1:39 PM, "Thomas Gleixner" <tg...@linutronix.de> wrote:
>> > On Sat, 1 Nov 2014, Andy Lutomirski wrote:
>> > > There's plenty of room to tighten up the restrictions further, but
>> > > this is, I think, a decent first step, and it solves the problem of
>> > > information leaking into seccomp sandboxes.
>> >
>> > In which way?
>>
>> All the performance counters were readable without using any syscalls.
>> That leaks hints as to which events are in use, and it possibly leaks
>> interesting side channel information. With this series applied, you
>> need to at least mmap an rdpmc-able event, which most seccomp sandboxes
>> won't allow.
>
> Ok. So you are preventing the seccomp sandboxes from opening/mmapping a counter.
>

Yes.

Conversely, if someone lets perf_event_open through a seccomp filter,
then the sandboxed code can probably gather more interesting
information using perf_event_open the normal way than they can by
poking at rdpmc.

--Andy

>> Unfortunately, rdpmc access to counters can't be controlled
>> individually, so it's hard to do all that much better than this.
>
> Yeah, I know ...
>
> Thanks,
>
> tglx



--
Andy Lutomirski
AMA Capital Management, LLC

Andy Lutomirski

unread,
Nov 12, 2014, 6:40:07 PM11/12/14
to
What's the status of these? I think that "perf: Clean up
pmu::event_idx" and "x86: Add a comment clarifying LDT context
switching" are in -tip, the two cr4 cleanups ("x86: Clean up cr4
manipulation" and "x86: Store a per-cpu shadow copy of CR4") are
reviewed but will conflict with Andi's fsgsbase work, and the rest are
waiting for review.

Thanks,
Andy
Andy Lutomirski
AMA Capital Management, LLC

Andy Lutomirski

unread,
Jan 13, 2015, 8:00:05 PM1/13/15
to
On Fri, Oct 31, 2014 at 8:09 AM, Peter Zijlstra <pet...@infradead.org> wrote:
>
>
> This looks ok I suppose, although the x86 people should pass verdict on
> the first few patches.

Hi all-

What's the status of this series?

Thanks,
Andy

Thomas Gleixner

unread,
Jan 22, 2015, 5:50:05 PM1/22/15
to
On Tue, 13 Jan 2015, Andy Lutomirski wrote:
> On Fri, Oct 31, 2014 at 8:09 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> >
> >
> > This looks ok I suppose, although the x86 people should pass verdict on
> > the first few patches.
>
> Hi all-
>
> What's the status of this series?

I've reviewed the x86 parts except for the perf stuff. So if Peter is
happy with that, he should take it via perf, as I expect conflicts there
rather than in the non-perf areas.

Thanks,

tglx

Peter Zijlstra

unread,
Jan 23, 2015, 3:40:06 AM1/23/15
to
On Thu, Jan 22, 2015 at 11:42:49PM +0100, Thomas Gleixner wrote:
> On Tue, 13 Jan 2015, Andy Lutomirski wrote:
> > On Fri, Oct 31, 2014 at 8:09 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> > >
> > >
> > > This looks ok I suppose, although the x86 people should pass verdict on
> > > the first few patches.
> >
> > Hi all-
> >
> > What's the status of this series?
>
> I've reviewed the x86 parts except for the perf stuff. So if Peter is
> happy with that, he should take it via perf, as I expect conflicts there
> rather than in the non-perf areas.

Ah indeed you did; let me queue the lot then. Thanks!

tip-bot for Andy Lutomirski

unread,
Feb 4, 2015, 9:50:07 AM2/4/15
to
Commit-ID: 375074cc736ab1d89a708c0a8d7baa4a70d5d476
Gitweb: http://git.kernel.org/tip/375074cc736ab1d89a708c0a8d7baa4a70d5d476
Author: Andy Lutomirski <lu...@amacapital.net>
AuthorDate: Fri, 24 Oct 2014 15:58:07 -0700
Committer: Ingo Molnar <mi...@kernel.org>
CommitDate: Wed, 4 Feb 2015 12:10:41 +0100

x86: Clean up cr4 manipulation

CR4 manipulation was split, seemingly at random, between direct
(write_cr4) and using a helper (set/clear_in_cr4). Unfortunately,
the set_in_cr4 and clear_in_cr4 helpers also poke at the boot code,
which only a small subset of users actually wanted.

This patch replaces all cr4 access in functions that don't leave cr4
exactly the way they found it with new helpers cr4_set_bits,
cr4_clear_bits, and cr4_set_bits_and_update_boot.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
Reviewed-by: Thomas Gleixner <tg...@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Andrea Arcangeli <aarc...@redhat.com>
Cc: Vince Weaver <vi...@deater.net>
Cc: "hillf.zj" <hill...@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.K...@vt.edu>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Arnaldo Carvalho de Melo <ac...@kernel.org>
Cc: Kees Cook <kees...@chromium.org>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Link: http://lkml.kernel.org/r/495a10bdc9e67016b8fd3945700d46...@amacapital.net
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
arch/x86/include/asm/processor.h | 33 --------------------------------
arch/x86/include/asm/tlbflush.h | 37 ++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/virtext.h | 3 ++-
arch/x86/kernel/cpu/common.c | 10 +++++-----
arch/x86/kernel/cpu/mcheck/mce.c | 3 ++-
arch/x86/kernel/cpu/mcheck/p5.c | 3 ++-
arch/x86/kernel/cpu/mcheck/winchip.c | 3 ++-
arch/x86/kernel/cpu/perf_event.c | 7 ++++---
arch/x86/kernel/i387.c | 3 ++-
arch/x86/kernel/process.c | 5 +++--
arch/x86/kernel/xsave.c | 3 ++-
arch/x86/kvm/vmx.c | 4 ++--
arch/x86/mm/init.c | 4 ++--
arch/x86/xen/enlighten.c | 4 ++--
drivers/lguest/x86/core.c | 5 +++--
15 files changed, 70 insertions(+), 57 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a092a0c..ec1c935 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -579,39 +579,6 @@ static inline void load_sp0(struct tss_struct *tss,
index 04905bf..fc0c4bc 100644
index 5da71c2..f41e19c 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@

#include <asm/vmx.h>
#include <asm/svm.h>
+#include <asm/tlbflush.h>

/*
* VMX functions:
@@ -40,7 +41,7 @@ static inline int cpu_has_vmx(void)
static inline void cpu_vmxoff(void)
{
asm volatile (ASM_VMX_VMXOFF : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear_bits(X86_CR4_VMXE);
}

static inline int cpu_vmx_enabled(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c604965..9d8fc49 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -278,7 +278,7 @@ __setup("nosmep", setup_disable_smep);
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
- set_in_cr4(X86_CR4_SMEP);
+ cr4_set_bits(X86_CR4_SMEP);
}

static __init int setup_disable_smap(char *arg)
@@ -298,9 +298,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)

if (cpu_has(c, X86_FEATURE_SMAP)) {
#ifdef CONFIG_X86_SMAP
- set_in_cr4(X86_CR4_SMAP);
+ cr4_set_bits(X86_CR4_SMAP);
#else
- clear_in_cr4(X86_CR4_SMAP);
+ cr4_clear_bits(X86_CR4_SMAP);
#endif
}
}
@@ -1312,7 +1312,7 @@ void cpu_init(void)

pr_debug("Initializing CPU#%d\n", cpu);

- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

/*
* Initialize the per-CPU GDT with the boot GDT,
@@ -1393,7 +1393,7 @@ void cpu_init(void)
printk(KERN_INFO "Initializing CPU#%d\n", cpu);

if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

load_current_idt();
switch_to_new_gdt(cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d231799..15ad3ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -44,6 +44,7 @@

#include <asm/processor.h>
#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -1449,7 +1450,7 @@ static void __mcheck_cpu_init_generic(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | m_fl, &all_banks);

- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);

rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index ec2663a..737b0ad 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -9,6 +9,7 @@

#include <asm/processor.h>
#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -65,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
"Intel old style machine check architecture supported.\n");

/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index bd5d46a..44f1382 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -8,6 +8,7 @@

#include <asm/processor.h>
#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -36,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);

- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);

printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5..6b5acd5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,7 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
@@ -1328,7 +1329,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)

case CPU_STARTING:
if (x86_pmu.attr_rdpmc)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set_bits(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1834,9 +1835,9 @@ static void change_rdpmc(void *info)
bool enable = !!(unsigned long)info;

if (enable)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set_bits(X86_CR4_PCE);
else
- clear_in_cr4(X86_CR4_PCE);
+ cr4_clear_bits(X86_CR4_PCE);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a9a4229..87727b0 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,6 +13,7 @@
#include <asm/sigcontext.h>
#include <asm/processor.h>
#include <asm/math_emu.h>
+#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
@@ -180,7 +181,7 @@ void fpu_init(void)
if (cpu_has_xmm)
cr4_mask |= X86_CR4_OSXMMEXCPT;
if (cr4_mask)
- set_in_cr4(cr4_mask);
+ cr4_set_bits(cr4_mask);

cr0 = read_cr0();
cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e127dda..046e2d6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
+#include <asm/tlbflush.h>

/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -141,7 +142,7 @@ void flush_thread(void)

static void hard_disable_TSC(void)
{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
@@ -158,7 +159,7 @@ void disable_TSC(void)

static void hard_enable_TSC(void)
{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
+ cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0de1fae..34f66e5 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
+#include <asm/tlbflush.h>
#include <asm/xcr.h>

/*
@@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void)
*/
static inline void xstate_enable(void)
{
- set_in_cr4(X86_CR4_OSXSAVE);
+ cr4_set_bits(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4c58d8..db77537 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2812,7 +2812,7 @@ static int hardware_enable(void)
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+ cr4_set_bits(X86_CR4_VMXE);

if (vmm_exclusive) {
kvm_cpu_vmxon(phys_addr);
@@ -2849,7 +2849,7 @@ static void hardware_disable(void)
vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear_bits(X86_CR4_VMXE);
}

static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 079c3b6..d4eddbd 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -173,11 +173,11 @@ static void __init probe_page_size_mask(void)

/* Enable PSE if available */
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);

/* Enable PGE if available */
if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 78a881b..bd8b845 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1494,10 +1494,10 @@ static void xen_pvh_set_cr_flags(int cpu)
* set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
*/
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);

if (cpu_has_pge)
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
}

/*
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 922a1ac..6adfd7b 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -47,6 +47,7 @@
#include <asm/lguest.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
+#include <asm/tlbflush.h>
#include "../lg.h"

static int cpu_had_pge;
@@ -452,9 +453,9 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
static void adjust_pge(void *on)
{
if (on)
- write_cr4(read_cr4() | X86_CR4_PGE);
+ cr4_set_bits(X86_CR4_PGE);
else
- write_cr4(read_cr4() & ~X86_CR4_PGE);
+ cr4_clear_bits(X86_CR4_PGE);
}

/*H:020
--

tip-bot for Andy Lutomirski

unread,
Feb 4, 2015, 9:50:07 AM2/4/15
to
Commit-ID: a66734297f78707ce39d756b656bfae861d53f62
Gitweb: http://git.kernel.org/tip/a66734297f78707ce39d756b656bfae861d53f62
Author: Andy Lutomirski <lu...@amacapital.net>
AuthorDate: Fri, 24 Oct 2014 15:58:13 -0700
Committer: Ingo Molnar <mi...@kernel.org>
CommitDate: Wed, 4 Feb 2015 12:10:49 +0100

perf/x86: Add /sys/devices/cpu/rdpmc=2 to allow rdpmc for all tasks

While perfmon2 is a sufficiently evil library (it pokes MSRs
directly) that breaking it is fair game, it's still useful, so we
might as well try to support it. This allows users to write 2 to
/sys/devices/cpu/rdpmc to disable all rdpmc protection so that hacks
like perfmon2 can continue to work.

At some point, if perf_event becomes fast enough to replace
perfmon2, then this can go.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Arnaldo Carvalho de Melo <ac...@kernel.org>
Cc: Kees Cook <kees...@chromium.org>
Cc: Andrea Arcangeli <aarc...@redhat.com>
Cc: Vince Weaver <vi...@deater.net>
Cc: "hillf.zj" <hill...@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.K...@vt.edu>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Link: http://lkml.kernel.org/r/caac3c1c707dcca48ecbc35f4def21...@amacapital.net
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
arch/x86/include/asm/mmu_context.h | 5 ++++-
arch/x86/kernel/cpu/perf_event.c | 21 ++++++++++++++++++++-
2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 89c1fec..883f6b9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -19,9 +19,12 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
#endif /* !CONFIG_PARAVIRT */

#ifdef CONFIG_PERF_EVENTS
+extern struct static_key rdpmc_always_available;
+
static inline void load_mm_cr4(struct mm_struct *mm)
{
- if (atomic_read(&mm->context.perf_rdpmc_allowed))
+ if (static_key_true(&rdpmc_always_available) ||
+ atomic_read(&mm->context.perf_rdpmc_allowed))
cr4_set_bits(X86_CR4_PCE);
else
cr4_clear_bits(X86_CR4_PCE);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bec5cff..b71a7f8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -45,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};

+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1870,10 +1872,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev,

tip-bot for Andy Lutomirski

unread,
Feb 4, 2015, 9:50:08 AM2/4/15
to
Commit-ID: 1e0fb9ec679c9273a641f1d6f3d25ea47baef2bb
Gitweb: http://git.kernel.org/tip/1e0fb9ec679c9273a641f1d6f3d25ea47baef2bb
Author: Andy Lutomirski <lu...@amacapital.net>
AuthorDate: Fri, 24 Oct 2014 15:58:10 -0700
Committer: Ingo Molnar <mi...@kernel.org>
CommitDate: Wed, 4 Feb 2015 12:10:45 +0100

perf: Add pmu callbacks to track event mapping and unmapping

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Arnaldo Carvalho de Melo <ac...@kernel.org>
Cc: Kees Cook <kees...@chromium.org>
Cc: Andrea Arcangeli <aarc...@redhat.com>
Cc: Vince Weaver <vi...@deater.net>
Cc: "hillf.zj" <hill...@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.K...@vt.edu>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Link: http://lkml.kernel.org/r/266afcba1d1f91ea5501e4e16e94bb...@amacapital.net
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
include/linux/perf_event.h | 7 +++++++
kernel/events/core.c | 9 +++++++++
2 files changed, 16 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5cad0e6..3326200 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -202,6 +202,13 @@ struct pmu {
*/
int (*event_init) (struct perf_event *event);

+ /*
+ * Notification that the event was mapped or unmapped. Called
+ * in the context of the mapping task.
+ */
+ void (*event_mapped) (struct perf_event *event); /*optional*/
+ void (*event_unmapped) (struct perf_event *event); /*optional*/
+
#define PERF_EF_START 0x01 /* start the counter when adding */
#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7f2fbb8..cc14871 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4293,6 +4293,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)

atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
}

/*
@@ -4312,6 +4315,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);

+ if (event->pmu->event_unmapped)
+ event->pmu->event_unmapped(event);
+
atomic_dec(&rb->mmap_count);

if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4513,6 +4519,9 @@ unlock:
vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;

+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
+
return ret;
}

--
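
Elsewhere in this series (not quoted above) the x86 PMU hooks these
callbacks to keep the per-mm rdpmc count used by load_mm_cr4() in sync.
A rough sketch, omitting the guard checks of the real patch and reusing
refresh_pce from the sysfs path:

/* Sketch: called when an event is mmapped in the mapping task's context. */
static void x86_pmu_event_mapped(struct perf_event *event)
{
	if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
}

/* Sketch: called on unmap; drop the count and clear CR4.PCE if it hits zero. */
static void x86_pmu_event_unmapped(struct perf_event *event)
{
	if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
}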

tip-bot for Andy Lutomirski

unread,
Feb 4, 2015, 9:50:09 AM2/4/15
to
Commit-ID: c1317ec2b906442930318d9d6e51425c5a69e9cb
Gitweb: http://git.kernel.org/tip/c1317ec2b906442930318d9d6e51425c5a69e9cb
Author: Andy Lutomirski <lu...@amacapital.net>
AuthorDate: Fri, 24 Oct 2014 15:58:11 -0700
Committer: Ingo Molnar <mi...@kernel.org>
CommitDate: Wed, 4 Feb 2015 12:10:46 +0100

perf: Pass the event to arch_perf_update_userpage()

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Arnaldo Carvalho de Melo <ac...@kernel.org>
Cc: Kees Cook <kees...@chromium.org>
Cc: Andrea Arcangeli <aarc...@redhat.com>
Cc: Vince Weaver <vi...@deater.net>
Cc: "hillf.zj" <hill...@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.K...@vt.edu>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Link: http://lkml.kernel.org/r/0fea9a7fac3c1eea86cb0a5954184e...@amacapital.net
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
arch/x86/kernel/cpu/perf_event.c | 3 ++-
kernel/events/core.c | 5 +++--
2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 6b5acd5..73e84a3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1915,7 +1915,8 @@ static struct pmu pmu = {
.flush_branch_stack = x86_pmu_flush_branch_stack,
};

-void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data *data;

diff --git a/kernel/events/core.c b/kernel/events/core.c
index cc14871..13209a9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4101,7 +4101,8 @@ unlock:
rcu_read_unlock();
}

-void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(
+ struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}

@@ -4151,7 +4152,7 @@ void perf_event_update_userpage(struct perf_event *event)
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);

- arch_perf_update_userpage(userpg, now);
+ arch_perf_update_userpage(event, userpg, now);

barrier();
++userpg->lock;
--

tip-bot for Andy Lutomirski

unread,
Feb 4, 2015, 9:50:08 AM2/4/15
to
Commit-ID: 1e02ce4cccdcb9688386e5b8d2c9fa4660b45389
Gitweb: http://git.kernel.org/tip/1e02ce4cccdcb9688386e5b8d2c9fa4660b45389
Author: Andy Lutomirski <lu...@amacapital.net>
AuthorDate: Fri, 24 Oct 2014 15:58:08 -0700
Committer: Ingo Molnar <mi...@kernel.org>
CommitDate: Wed, 4 Feb 2015 12:10:42 +0100

x86: Store a per-cpu shadow copy of CR4

Context switches and TLB flushes can change individual bits of CR4.
CR4 reads take several cycles, so store a shadow copy of CR4 in a
per-cpu variable.

To avoid wasting a cache line, I added the CR4 shadow to
cpu_tlbstate, which is already touched in switch_mm. The heaviest
users of the cr4 shadow will be switch_mm and __switch_to_xtra, and
__switch_to_xtra is called shortly after switch_mm during context
switch, so the cacheline is likely to be hot.

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
Reviewed-by: Thomas Gleixner <tg...@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Kees Cook <kees...@chromium.org>
Cc: Andrea Arcangeli <aarc...@redhat.com>
Cc: Vince Weaver <vi...@deater.net>
Cc: "hillf.zj" <hill...@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.K...@vt.edu>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Arnaldo Carvalho de Melo <ac...@kernel.org>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Link: http://lkml.kernel.org/r/3a54dd3353fffbf84804398e00dfdc...@amacapital.net
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
arch/x86/include/asm/paravirt.h | 6 ++---
arch/x86/include/asm/special_insns.h | 6 ++---
arch/x86/include/asm/tlbflush.h | 52 +++++++++++++++++++++++++++---------
arch/x86/include/asm/virtext.h | 2 +-
arch/x86/kernel/acpi/sleep.c | 2 +-
arch/x86/kernel/cpu/common.c | 7 +++++
arch/x86/kernel/cpu/mtrr/cyrix.c | 6 ++---
arch/x86/kernel/cpu/mtrr/generic.c | 6 ++---
arch/x86/kernel/head32.c | 1 +
arch/x86/kernel/head64.c | 2 ++
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/setup.c | 2 +-
arch/x86/kvm/svm.c | 2 +-
arch/x86/kvm/vmx.c | 6 ++---
arch/x86/mm/fault.c | 2 +-
arch/x86/mm/init.c | 9 +++++++
arch/x86/mm/tlb.c | 3 ---
arch/x86/power/cpu.c | 11 +++-----
arch/x86/realmode/init.c | 2 +-
20 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 32444ae..965c47d 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}

-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}

-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index e820c08..6a4b00f 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x)
native_write_cr3(x);
}

-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
{
return native_read_cr4();
}

-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
{
return native_read_cr4_safe();
}

-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index fc0c4bc..cd79194 100644
index f41e19c..cce9ee6 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -46,7 +46,7 @@ static inline void cpu_vmxoff(void)

static inline int cpu_vmx_enabled(void)
{
- return read_cr4() & X86_CR4_VMXE;
+ return __read_cr4() & X86_CR4_VMXE;
}

/** Disable VMX if it is enabled on the current CPU
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3136820..d1daead 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void)

header->pmode_cr0 = read_cr0();
if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = __read_cr4();
header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
}
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9d8fc49..07f2fc3 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
#include <asm/vsyscall.h>
@@ -1294,6 +1295,12 @@ void cpu_init(void)
wait_for_master_cpu(cpu);

/*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ /*
* Load microcode on this cpu if a valid microcode is available.
* This is early microcode loading procedure.
*/
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9e451b0..f8c81ba 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -138,8 +138,8 @@ static void prepare_set(void)

/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}

/*
@@ -171,7 +171,7 @@ static void post_set(void)

/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
}

static void cyrix_set_arr(unsigned int reg, unsigned long base,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0e25a1b..7d74f7b 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock)

/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}

/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
@@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock)

/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
raw_spin_unlock(&set_atomicity_lock);
}

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b983..2911ef3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)

asmlinkage __visible void __init i386_start_kernel(void)
{
+ cr4_init_shadow();
sanitize_boot_params(&boot_params);

/* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a86..3b241f0 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -155,6 +155,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);

+ cr4_init_shadow();
+
/* Kill off the identity-map trampoline */
reset_early_page_tables();

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8f3ebfe..603c4f9 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4_safe();
+ cr4 = __read_cr4_safe();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5a2c029..67fcc43 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4();
+ cr4 = __read_cr4();

printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ab4734e..04e6c62 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1178,7 +1178,7 @@ void __init setup_arch(char **cmdline_p)

if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
- mmu_cr4_features = read_cr4();
+ mmu_cr4_features = __read_cr4();
if (trampoline_cr4_features)
*trampoline_cr4_features = mmu_cr4_features;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41dd038..496a548 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1583,7 +1583,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)

static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;

if (cr4 & X86_CR4_VMXE)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index db77537..8dca6cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2785,7 +2785,7 @@ static int hardware_enable(void)
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;

- if (read_cr4() & X86_CR4_VMXE)
+ if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;

INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
@@ -4255,7 +4255,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */

/* Save the most likely value for this task's CR4 in the VMCS. */
- cr4 = read_cr4();
+ cr4 = cr4_read_shadow();
vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
vmx->host_state.vmcs_host_cr4 = cr4;

@@ -7784,7 +7784,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

- cr4 = read_cr4();
+ cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
vmx->host_state.vmcs_host_cr4 = cr4;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e3ff27a..ede025f 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -600,7 +600,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
if (pte && pte_present(*pte) && pte_exec(*pte) &&
(pgd_flags(*pgd) & _PAGE_USER) &&
- (read_cr4() & X86_CR4_SMEP))
+ (__read_cr4() & X86_CR4_SMEP))
printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
}

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d4eddbd..a74aa0f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -713,6 +713,15 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}

+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+ .active_mm = &init_mm,
+ .state = 0,
+#endif
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
+
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
/* entry 0 MUST be WB (hardwired to speed up translations) */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ee61c36..3250f23 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,9 +14,6 @@
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
- = { &init_mm, 0, };
-
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6ec7910..3e32ed5 100644
index bad628a..0b7a63d 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -81,7 +81,7 @@ void __init setup_real_mode(void)

trampoline_header->start = (u64) secondary_startup_64;
trampoline_cr4_features = &trampoline_header->cr4;
- *trampoline_cr4_features = read_cr4();
+ *trampoline_cr4_features = __read_cr4();

trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
--