
[GIT PULL] x86/cpu changes for v2.6.34


Ingo Molnar

Feb 27, 2010, 10:20:02 AM
Linus,

Please pull the latest x86-cpu-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git x86-cpu-for-linus


out-of-topic modifications in x86-cpu-for-linus:
------------------------------------------------
drivers/char/agp/intel-agp.c # 48a719c: intel-agp: Switch to wbinvd_on_all_cpus

Thanks,

Ingo

------------------>
Borislav Petkov (7):
x86, lib: Add wbinvd smp helpers
intel-agp: Switch to wbinvd_on_all_cpus
x86, cacheinfo: Fix disabling of L3 cache indices
x86, cacheinfo: Add cache index disable sysfs attrs only to L3 caches
x86, cacheinfo: Calculate L3 indices
x86, cacheinfo: Remove NUMA dependency, fix for AMD Fam10h rev D1
x86, cacheinfo: Enable L3 CID only on AMD

Joerg Roedel (1):
x86, cpu: Print AMD virtualization features in /proc/cpuinfo


arch/x86/include/asm/cpufeature.h | 4 +
arch/x86/include/asm/smp.h | 9 +
arch/x86/kernel/cpu/addon_cpuid_features.c | 4 +
arch/x86/kernel/cpu/intel_cacheinfo.c | 250 +++++++++++++++++-----------
arch/x86/lib/Makefile | 2 +-
arch/x86/lib/cache-smp.c | 19 ++
drivers/char/agp/intel-agp.c | 15 +--
7 files changed, 197 insertions(+), 106 deletions(-)
create mode 100644 arch/x86/lib/cache-smp.c

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 637e1ec..0cd82d0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -168,6 +168,10 @@
#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */
+#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */
+#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */
+#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */

#if defined(__KERNEL__) && !defined(__ASSEMBLY__)

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1e79678..4cfc908 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -135,6 +135,8 @@ int native_cpu_disable(void);
void native_cpu_die(unsigned int cpu);
void native_play_dead(void);
void play_dead_common(void);
+void wbinvd_on_cpu(int cpu);
+int wbinvd_on_all_cpus(void);

void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
@@ -147,6 +149,13 @@ static inline int num_booting_cpus(void)
{
return cpumask_weight(cpu_callout_mask);
}
+#else /* !CONFIG_SMP */
+#define wbinvd_on_cpu(cpu) wbinvd()
+static inline int wbinvd_on_all_cpus(void)
+{
+ wbinvd();
+ return 0;
+}
#endif /* CONFIG_SMP */

extern unsigned disabled_cpus __cpuinitdata;
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 468489b..97ad79c 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
+ { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
+ { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
+ { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
{ 0, 0, 0, 0 }
};

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index fc6c8ef..d440123 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <linux/smp.h>
#include <asm/k8.h>
+#include <asm/smp.h>

#define LVL_1_INST 1
#define LVL_1_DATA 2
@@ -150,7 +151,8 @@ struct _cpuid4_info {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- unsigned long can_disable;
+ bool can_disable;
+ unsigned int l3_indices;
DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
};

@@ -160,7 +162,8 @@ struct _cpuid4_info_regs {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- unsigned long can_disable;
+ bool can_disable;
+ unsigned int l3_indices;
};

unsigned short num_cache_leaves;
@@ -290,6 +293,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
(ebx->split.ways_of_associativity + 1) - 1;
}

+struct _cache_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct _cpuid4_info *, char *);
+ ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
+};
+
+#ifdef CONFIG_CPU_SUP_AMD
+static unsigned int __cpuinit amd_calc_l3_indices(void)
+{
+ /*
+ * We're called over smp_call_function_single() and therefore
+ * are on the correct cpu.
+ */
+ int cpu = smp_processor_id();
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(dev, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ sc0 = !(val & BIT(0));
+ sc1 = !(val & BIT(4));
+ sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+
static void __cpuinit
amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
@@ -299,12 +332,103 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
if (boot_cpu_data.x86 == 0x11)
return;

- /* see erratum #382 */
- if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+ /* see errata #382 and #388 */
+ if ((boot_cpu_data.x86 == 0x10) &&
+ ((boot_cpu_data.x86_model < 0x8) ||
+ (boot_cpu_data.x86_mask < 0x1)))
return;

- this_leaf->can_disable = 1;
+ this_leaf->can_disable = true;
+ this_leaf->l3_indices = amd_calc_l3_indices();
+}
+
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+ unsigned int index)
+{
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = amd_get_nb_id(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int reg = 0;
+
+ if (!this_leaf->can_disable)
+ return -EINVAL;
+
+ if (!dev)
+ return -EINVAL;
+
+ pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+ return sprintf(buf, "0x%08x\n", reg);
+}
+
+#define SHOW_CACHE_DISABLE(index) \
+static ssize_t \
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
+{ \
+ return show_cache_disable(this_leaf, buf, index); \
+}
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count, unsigned int index)
+{
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = amd_get_nb_id(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned long val = 0;
+
+#define SUBCACHE_MASK (3UL << 20)
+#define SUBCACHE_INDEX 0xfff
+
+ if (!this_leaf->can_disable)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
+ return -EINVAL;
+
+ if (strict_strtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ /* do not allow writes outside of allowed bits */
+ if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+ ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+ return -EINVAL;
+
+ val |= BIT(30);
+ pci_write_config_dword(dev, 0x1BC + index * 4, val);
+ /*
+ * We need to WBINVD on a core on the node containing the L3 cache whose
+ * indices we disable; therefore a simple wbinvd() is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+ pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+ return count;
+}
+
+#define STORE_CACHE_DISABLE(index) \
+static ssize_t \
+store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
+ const char *buf, size_t count) \
+{ \
+ return store_cache_disable(this_leaf, buf, count, index); \
}
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+ show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+ show_cache_disable_1, store_cache_disable_1);
+
+#else /* CONFIG_CPU_SUP_AMD */
+static void __cpuinit
+amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+{
+};
+#endif /* CONFIG_CPU_SUP_AMD */

static int
__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -711,82 +835,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
#define to_object(k) container_of(k, struct _index_kobject, kobj)
#define to_attr(a) container_of(a, struct _cache_attr, attr)

-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int index)
-{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned int reg = 0;
-
- if (!this_leaf->can_disable)
- return -EINVAL;
-
- if (!dev)
- return -EINVAL;
-
- pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
- return sprintf(buf, "%x\n", reg);
-}
-
-#define SHOW_CACHE_DISABLE(index) \
-static ssize_t \
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
-{ \
- return show_cache_disable(this_leaf, buf, index); \
-}
-SHOW_CACHE_DISABLE(0)
-SHOW_CACHE_DISABLE(1)
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count, unsigned int index)
-{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned long val = 0;
- unsigned int scrubber = 0;
-
- if (!this_leaf->can_disable)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!dev)
- return -EINVAL;
-
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
-
- val |= 0xc0000000;
-
- pci_read_config_dword(dev, 0x58, &scrubber);
- scrubber &= ~0x1f000000;
- pci_write_config_dword(dev, 0x58, scrubber);
-
- pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
- wbinvd();
- pci_write_config_dword(dev, 0x1BC + index * 4, val);
- return count;
-}
-
-#define STORE_CACHE_DISABLE(index) \
-static ssize_t \
-store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count) \
-{ \
- return store_cache_disable(this_leaf, buf, count, index); \
-}
-STORE_CACHE_DISABLE(0)
-STORE_CACHE_DISABLE(1)
-
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
-};
-
#define define_one_ro(_name) \
static struct _cache_attr _name = \
__ATTR(_name, 0444, show_##_name, NULL)
@@ -801,23 +849,28 @@ define_one_ro(size);
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);

-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
+#define DEFAULT_SYSFS_CACHE_ATTRS \
+ &type.attr, \
+ &level.attr, \
+ &coherency_line_size.attr, \
+ &physical_line_partition.attr, \
+ &ways_of_associativity.attr, \
+ &number_of_sets.attr, \
+ &size.attr, \
+ &shared_cpu_map.attr, \
+ &shared_cpu_list.attr

static struct attribute *default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
+ DEFAULT_SYSFS_CACHE_ATTRS,
+ NULL
+};
+
+static struct attribute *default_l3_attrs[] = {
+ DEFAULT_SYSFS_CACHE_ATTRS,
+#ifdef CONFIG_CPU_SUP_AMD
&cache_disable_0.attr,
&cache_disable_1.attr,
+#endif
NULL
};

@@ -908,6 +961,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
unsigned int cpu = sys_dev->id;
unsigned long i, j;
struct _index_kobject *this_object;
+ struct _cpuid4_info *this_leaf;
int retval;

retval = cpuid4_cache_sysfs_init(cpu);
@@ -926,6 +980,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
this_object = INDEX_KOBJECT_PTR(cpu, i);
this_object->cpu = cpu;
this_object->index = i;
+
+ this_leaf = CPUID4_INFO_IDX(cpu, i);
+
+ if (this_leaf->can_disable)
+ ktype_cache.default_attrs = default_l3_attrs;
+ else
+ ktype_cache.default_attrs = default_attrs;
+
retval = kobject_init_and_add(&(this_object->kobj),
&ktype_cache,
per_cpu(ici_cache_kobject, cpu),
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754..d85e0e4 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,7 +14,7 @@ $(obj)/inat.o: $(obj)/inat-tables.c

clean-files := inat-tables.c

-obj-$(CONFIG_SMP) += msr-smp.o
+obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o

lib-y := delay.o
lib-y += thunk_$(BITS).o
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 0000000..a3c6688
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,19 @@
+#include <linux/smp.h>
+#include <linux/module.h>
+
+static void __wbinvd(void *dummy)
+{
+ wbinvd();
+}
+
+void wbinvd_on_cpu(int cpu)
+{
+ smp_call_function_single(cpu, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_cpu);
+
+int wbinvd_on_all_cpus(void)
+{
+ return on_each_cpu(__wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 3999a5f..8a713f1 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/agp_backend.h>
+#include <asm/smp.h>
#include "agp.h"

/*
@@ -815,12 +816,6 @@ static void intel_i830_setup_flush(void)
intel_i830_fini_flush();
}

-static void
-do_wbinvd(void *null)
-{
- wbinvd();
-}
-
/* The chipset_flush interface needs to get data that has already been
* flushed out of the CPU all the way out to main memory, because the GPU
* doesn't snoop those buffers.
@@ -837,12 +832,10 @@ static void intel_i830_chipset_flush(struct agp_bridge_data *bridge)

memset(pg, 0, 1024);

- if (cpu_has_clflush) {
+ if (cpu_has_clflush)
clflush_cache_range(pg, 1024);
- } else {
- if (on_each_cpu(do_wbinvd, NULL, 1) != 0)
- printk(KERN_ERR "Timed out waiting for cache flush.\n");
- }
+ else if (wbinvd_on_all_cpus() != 0)
+ printk(KERN_ERR "Timed out waiting for cache flush.\n");
}

/* The intel i830 automatically initializes the agp aperture during POST.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Arjan van de Ven

Feb 27, 2010, 12:10:03 PM
On Sat, 27 Feb 2010 16:09:42 +0100
Ingo Molnar <mi...@elte.hu> wrote:

> +int wbinvd_on_all_cpus(void)
> +{
> + return on_each_cpu(__wbinvd, NULL, 1);
> +}

does this make sense at all?

doesn't cache coherency on x86 already guarantee this?


--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

H. Peter Anvin

Feb 27, 2010, 3:10:01 PM
On 02/27/2010 09:10 AM, Arjan van de Ven wrote:
> On Sat, 27 Feb 2010 16:09:42 +0100
> Ingo Molnar <mi...@elte.hu> wrote:
>
>> +int wbinvd_on_all_cpus(void)
>> +{
>> + return on_each_cpu(__wbinvd, NULL, 1);
>> +}
>
> does this make sense at all?
>
> doesn't cache coherency on x86 already guarantee this?
>

No, WBINVD (unlike CLFLUSH) is local to one CPU.
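
A system-wide flush therefore has to run the instruction on every CPU.
A caller-side sketch of the resulting pattern (function name
hypothetical; this mirrors the intel-agp hunk in the pull above):

	/* flush a buffer the GPU will read without snooping */
	static void flush_unsnooped(void *vaddr, unsigned int size)
	{
		if (cpu_has_clflush)
			/* CLFLUSH is coherent: one CPU can evict the lines */
			clflush_cache_range(vaddr, size);
		else if (wbinvd_on_all_cpus() != 0)
			/* WBINVD is per-CPU, so broadcast it via IPI */
			printk(KERN_ERR "cache flush failed\n");
	}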

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

Linus Torvalds

Feb 28, 2010, 3:20:02 PM

I haven't bisected this, but something slowed down in bootup on my machine
recently.

See the timestamps:

[ 0.000000] Linux version 2.6.33-01832-g30ff056
...
[ 0.010066] Enabled Interrupt-remapping
[ 0.010120] Setting APIC routing to physical flat
[ 0.010180] DRHD: handling fault status reg 2
[ 0.010582] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[ 0.049955] CPU0: Genuine Intel(R) CPU 000 @ 3.20GHz stepping 04
[ 0.157195] Booting Node 0, Processors #1
[ 0.245179] CPU 1 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 0.265332] #2
[ 0.353185] CPU 2 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 0.373328] #3
[ 2.193277] CPU 3 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 2.213379] #4
[ 2.301283] CPU 4 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 2.321391] #5
[ 2.417287] CPU 5 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 2.437356] #6
[ 2.525293] CPU 6 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 2.545354] #7
[ 2.633298] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 2.653423] Brought up 8 CPUs
[ 2.653571] Total of 8 processors activated (51201.44 BogoMIPS).

what happened there for almost 2 seconds in between CPU#3 and CPU#4?

It wasn't very fast before either, but it was way better:

[ 0.050298] CPU0: Genuine Intel(R) CPU 000 @ 3.20GHz stepping 04
[ 0.156725] Booting Node 0, Processors #1
[ 0.244410] CPU 1 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 0.264458] #2
[ 0.352078] CPU 2 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 0.372147] #3
[ 0.459746] CPU 3 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 0.479838] #4
[ 0.567415] CPU 4 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.587446] #5
[ 0.683057] CPU 5 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.703081] #6
[ 0.790724] CPU 6 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.810748] #7
[ 0.898393] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.918412] Brought up 8 CPUs
[ 0.918562] Total of 8 processors activated (51203.34 BogoMIPS).

what is it that takes so long to bring those CPU's up?

Linus

Linus Torvalds

Feb 28, 2010, 3:50:02 PM

On Sun, 28 Feb 2010, Linus Torvalds wrote:
>
> I haven't bisected this, but something slowed down in bootup on my machine
> recently.

Hmm. I take that back. It's not consistent, and it's not recent after all.

It comes and goes:

[torvalds@nehalem linux]$ grep "CPU 7 MCA" /var/log/messages-* /var/log/messages | cut -d: -f5-
[ 0.898396] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898400] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 1.596240] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898394] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 1.600229] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898395] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.901211] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8


[ 2.633298] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8

[ 0.898393] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8

[ 0.901210] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898395] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8


[ 0.898393] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898393] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8

[ 0.898402] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.901213] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898392] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898395] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 1.601467] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898401] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898395] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[ 0.898397] CPU 7 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8

note how it's pretty consistently at about the 0.89s mark, but then
there's a _couple_ of times when it's taken rather longer to boot. But the
delay is always in that CPU bringup phase, because doing the same grep for
"CPU 0 MCA" gives consistently low numbers (0.0005s).

Ingo Molnar

Mar 1, 2010, 3:10:02 AM

Weird. It seems to be around multiples of .8: 0.8, 1.6, 2.4, with some extra
overhead.

Almost as if some calibration routine or some other busy-loop misses the train
occasionally.

The way i'd go about debugging this is to narrow down the approximate place
the slowdown happens, then enable CONFIG_FUNCTION_TRACER (and disable
CONFIG_DYNAMIC_FTRACE=y, to not have to deal with the dynamic patching
aspects), and do a single-shot tracing session of only that section, on only
one CPU:

if (smp_processor_id() == 7)
ftrace_enabled = 1;

... bootup sequence ...

if (smp_processor_id() == 7)
ftrace_enabled = 0;

And recover the resulting trace from /debug/tracing/trace - it should have the reason
in it plain and simple.

( Unfortunately i'm not 100% sure that setting ftrace_enabled to 1 is enough.
I asked for a simple ad-hoc enable/disable function tracing mechanism _ages_
ago - Steve, Frederic, what happened to that? ftrace_start()/stop() does not
seem to allow that. )

Or you could sprinkle the code with printk's, and see where the overhead
concentrates. (But printks can change timings - etc. So can the function
tracer as well ...)

Ingo

Frederic Weisbecker

Mar 1, 2010, 8:20:02 AM
On Mon, Mar 01, 2010 at 09:00:58AM +0100, Ingo Molnar wrote:
> Weird. It seems to be around multiples of .8: 0.8, 1.6, 2.4, with some extra
> overhead.
>
> Almost as if some calibration routine or some other busy-loop misses the train
> occasionally.
>
> The way i'd go about debugging this is to narrow down the approximate place
> the slowdown happens, then enable CONFIG_FUNCTION_TRACER (and disable
> CONFIG_DYNAMIC_FTRACE=y, to not have to deal with the dynamic patching
> aspects), and do a single-shot tracing session of only that section, on only
> one CPU:
>
> if (smp_processor_id() == 7)
> ftrace_enabled = 1;
>
> ... bootup sequence ...
>
> if (smp_processor_id() == 7)
> ftrace_enabled = 0;
>
> And recover the resulting trace from /debug/tracing/trace - it should have the reason
> in it plain and simple.
>
> ( Unfortunately i'm not 100% sure that setting ftrace_enabled to 1 is enough.
> I asked for a simple ad-hoc enable/disable function tracing mechanism _ages_
> ago - Steve, Frederic, what happened to that? ftrace_start()/stop() does not
> seem to allow that. )

I don't remember such a request. But that would be useful indeed.
We could simply pair the setting of an early tracer with tracing
disabled and then manually delimit the places to trace with
tracing_on/tracing_off().

Whatever.

For now what you can do is set the function_graph tracer
on bootup:

ftrace=function_graph

and call ftrace_graph_stop() in the place you want the trace
to finish (you could use ftrace_graph_filter= to delimit
the function tracing window, but that won't work without
dynamic tracing, nor with __init functions).

So, after the boot you can look at /debug/tracing/per_cpu/cpu7/trace
and the end of the trace should contain what you want.
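
Concretely, that amounts to something like this near the end of
kernel_init() in init/main.c (a sketch; the exact placement depends on
where the slowdown is suspected):

	/* booted with: ftrace=function_graph */
	smp_init();
	sched_init_smp();
	ftrace_graph_stop();	/* freeze the graph trace at this point */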

Linus Torvalds

Mar 1, 2010, 11:50:02 AM

On Mon, 1 Mar 2010, Frederic Weisbecker wrote:

> On Mon, Mar 01, 2010 at 09:00:58AM +0100, Ingo Molnar wrote:
> >
> > if (smp_processor_id() == 7)
> > ftrace_enabled = 1;
> >
> > ... bootup sequence ...
> >
> > if (smp_processor_id() == 7)
> > ftrace_enabled = 0;

> So, after the boot you can look at /debug/tracing/per_cpu/cpu7/trace


> and the end of the trace should contain what you want.

Both of you seemed to miss the fact that it's not cpu7 that is
particularly slow. See the original email from me in this thread: the jump
was at some random point:

[ 0.245179] CPU 1 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 0.265332] #2
[ 0.353185] CPU 2 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 0.373328] #3

[ 2.193277] CPU 3 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
[ 2.213379] #4

and the reason I grepped for "CPU 7" was that it's the _last_ CPU on this
machine, so what I was grepping for was basically "how long did it take to
bring up all CPU's".

So that particular really bad case apparently happened for CPU#3, but the
two other slow cases happened for CPU#4.

Also, it seems to happen only about every fifth boot or so. Suggestions
for something simple that can trace things like that?

Linus

Steven Rostedt

Mar 1, 2010, 2:30:02 PM
On Mon, 2010-03-01 at 14:17 +0100, Frederic Weisbecker wrote:
> On Mon, Mar 01, 2010 at 09:00:58AM +0100, Ingo Molnar wrote:
> > Weird. It seems to be around multiples of .8: 0.8, 1.6, 2.4, with some extra
> > overhead.
> >
> > Almost as if some calibration routine or some other busy-loop misses the train
> > occasionally.
> >
> > The way i'd go about debugging this is to narrow down the approximate place
> > the slowdown happens, then enable CONFIG_FUNCTION_TRACER (and disable
> > CONFIG_DYNAMIC_FTRACE=y, to not have to deal with the dynamic patching
> > aspects), and do a single-shot tracing session of only that section, on only
> > one CPU:
> >
> > if (smp_processor_id() == 7)
> > ftrace_enabled = 1;
> >
> > ... bootup sequence ...
> >
> > if (smp_processor_id() == 7)
> > ftrace_enabled = 0;
> >
> > And recover the resulting trace from /debug/tracing/trace - it should have the reason
> > in it plain and simple.
> >
> > ( Unfortunately i'm not 100% sure that setting ftrace_enabled to 1 is enough.
> > I asked for a simple ad-hoc enable/disable function tracing mechanism _ages_
> > ago - Steve, Frederic, what happened to that? ftrace_start()/stop() does not
> > seem to allow that. )
>

Setting ftrace_enabled = 0 should stop the function tracer, but may not
stop the function graph tracer.

>
>
> I don't remember such request. But that would be useful indeed.
> We could simply pair the setting of an early tracer with tracing
> disabled and then manually delimit the places to trace with
> tracing_on/tracing_off().

It's best to use tracing_off() and tracing_on() for such things.

>
> Whatever.
>
> For now what you can do is setting the function_graph tracer
> on bootup:
>
> ftrace=function_graph
>
> and call ftrace_graph_stop() in the place you want the trace

tracing_off() is the best API for this. Although you still have the
overhead of the tracer. But you can just
echo nop > /debug/tracing/current_tracer
to remove the overhead after bootup.
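
In code, the bracketing is just (a minimal sketch, placed around the
suspect region in init/main.c):

	tracing_on();	/* start recording into the ring buffer */
	smp_init();	/* the section under investigation */
	tracing_off();	/* freeze the buffer for post-boot inspection */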

-- Steve

Steven Rostedt

Mar 1, 2010, 2:50:02 PM
On Mon, 2010-03-01 at 08:47 -0800, Linus Torvalds wrote:

> Both of you seemed to miss the fact that it's not cpu7 that is
> particularly slow. See the original email from me in this thread: the jump
> was at some random point:
>
> [ 0.245179] CPU 1 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
> [ 0.265332] #2
> [ 0.353185] CPU 2 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
> [ 0.373328] #3
> [ 2.193277] CPU 3 MCA banks CMCI:2 CMCI:3 CMCI:5 SHD:6 SHD:8
> [ 2.213379] #4
>
> and the reason I grepped for "CPU 7" was that it's the _last_ CPU on this
> machine, so what I was grepping for was basically "how long did it take to
> bring up all CPU's".
>
> So that particular really bad case apparently happened for CPU#3, but the
> two other slow cases happened for CPU#4.
>
> Also, it seems to happen only about every fifth boot or so. Suggestions
> for something simple that can trace things like that?

As Frederic has said you can use 'ftrace=function_graph' on the kernel
command line. It will be initialized in early_initcall (which I believe
is before CPUs are set up). Then add a tracing_off() after the trouble
code. You can make the trace buffers bigger with the kernel command
line:

trace_buf_size=10000000

The above will make the trace buffer 10Meg per CPU. Unlike the
"buffer_size_kb" file, this number is in bytes, even though it will
round to the nearest page. (I probably should make this into kb, and
rename it to trace_buf_size_kb, and deprecate trace_buf_size).

Then you can cat out /debug/tracing/trace, and search for large
latencies in the timestamps.
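
Putting the pieces together, the boot command line for such a session
might look like this (buffer size illustrative):

	ftrace=function_graph trace_buf_size=10000000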

-- Steve

H. Peter Anvin

Mar 1, 2010, 5:30:02 PM
On 03/01/2010 11:42 AM, Steven Rostedt wrote:
>
> As Frederic has said you can use 'ftrace=function_graph' on the kernel
> command line. It will be initialized in early_initcall (which I believe
> is before CPUs are set up). Then add a tracing_off() after the trouble
> code. You can make the trace buffers bigger with the kernel command
> line:
>
> trace_buf_size=10000000
>
> The above will make the trace buffer 10Meg per CPU. Unlike the
> "buffer_size_kb" file, this number is in bytes, even though it will
> round to the nearest page. (I probably should make this into kb, and
> rename it to trace_buf_size_kb, and deprecate trace_buf_size).
>

Memory sizes specified on the kernel command line should generally be in
units of bytes, but should accept suffixes.
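
lib/cmdline.c's memparse() handles exactly that: it parses a number and
honors K/M/G suffixes. A hypothetical handler for such a parameter (a
sketch only, not the actual trace_buf_size implementation):

	#include <linux/init.h>
	#include <linux/kernel.h>

	static unsigned long trace_buf_bytes;

	/* accepts e.g. trace_buf_size=10M as well as plain byte counts */
	static int __init set_trace_buf_size(char *str)
	{
		trace_buf_bytes = memparse(str, &str);
		return 1;
	}
	__setup("trace_buf_size=", set_trace_buf_size);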

-hpa

Steven Rostedt

Mar 1, 2010, 5:30:02 PM
On Mon, 2010-03-01 at 14:42 -0500, Steven Rostedt wrote:

> As Frederic has said you can use 'ftrace=function_graph' on the kernel
> command line. It will be initialized in early_initcall (which I believe
> is before CPUs are set up). Then add a tracing_off() after the trouble
> code. You can make the trace buffers bigger with the kernel command
> line:
>
> trace_buf_size=10000000
>
> The above will make the trace buffer 10Meg per CPU. Unlike the
> "buffer_size_kb" file, this number is in bytes, even though it will
> round to the nearest page. (I probably should make this into kb, and
> rename it to trace_buf_size_kb, and deprecate trace_buf_size).
>
> Then you can cat out /debug/tracing/trace, and search for large
> latencies in the timestamps.

I just tried the above and it doesn't work. The ring buffer gets
allocated with the early_initcall(), so trace_printk()'s will work. But
the function and function graph tracers don't get registered until the
device_initcall().

If you are still interested, this patch will allow you to run the
function graph tracer before smp_init(). You still need to add
"ftrace=function_graph" on the kernel command line.

It's a hack, but I tried it out and it worked.

-- Steve

diff --git a/init/main.c b/init/main.c
index 4cb47a1..b334663 100644
--- a/init/main.c
+++ b/init/main.c
@@ -868,8 +868,15 @@ static int __init kernel_init(void * unused)
do_pre_smp_initcalls();
start_boot_trace();

+ {
+ int init_graph_trace(void);
+ init_graph_trace();
+ }
+ trace_printk("start\n");
smp_init();
sched_init_smp();
+ trace_printk("end\n");
+ tracing_off();

do_basic_setup();

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index aaf580c..f18cad8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1214,11 +1214,11 @@ static struct tracer graph_trace __read_mostly = {
#endif
};

-static __init int init_graph_trace(void)
+__init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);

return register_tracer(&graph_trace);
}

-device_initcall(init_graph_trace);
+//device_initcall(init_graph_trace);
