oprofile: Implement Intel architectural perfmon support

Newer Intel CPUs support a standard set of performance counter events
which are discoverable using CPUID. This is called architectural
perfmon. Architectural perfmon works on all newer Intel CPUs starting
with Core 1 (Yonah), including Core 2, Atom, and Nehalem.

This patch implements support for it in oprofile. Architectural
perfmon is normally only used when the CPU is not explicitly
recognized, but it can be forced with oprofile.force_arch_perfmon=1.

Architectural perfmon is very similar to the classical PPro
performance counters, so the code in op_model_ppro is reused with only
minor changes. The number of counters is now discovered at runtime and
is variable. Using architectural perfmon needs a matching patch for
the oprofile userland.

Signed-off-by: Andi Kleen

---
 arch/x86/oprofile/nmi_int.c         |   32 +++++--
 arch/x86/oprofile/op_model_ppro.c   |  166 ++++++++++++++++++++++++++++++++------
 arch/x86/oprofile/op_x86_model.h    |    3 +
 Documentation/kernel-parameters.txt |    5 +
 4 files changed, 178 insertions(+), 28 deletions(-)
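For reference, the information the new code reads through cpuid_eax(0xa)
can also be dumped from user space. A minimal standalone sketch, not part
of the patch, assuming a GCC/Clang style <cpuid.h>; the field offsets
follow the SDM layout of leaf 0xA that union cpuid10_eax mirrors:

/*
 * Sketch: decode CPUID leaf 0xA the way arch_perfmon_setup_counters()
 * does. Build with "gcc -o apm apm.c" and run on an Intel CPU.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "CPUID leaf 0xA not available\n");
		return 1;
	}
	/* EAX: version[7:0], counters per core[15:8], counter bit
	   width[23:16], number of valid EBX bits[31:24] */
	printf("version id:    %u\n", eax & 0xff);
	printf("num counters:  %u\n", (eax >> 8) & 0xff);
	printf("counter width: %u\n", (eax >> 16) & 0xff);
	printf("ebx mask len:  %u\n", (eax >> 24) & 0xff);
	/* EBX: a set bit means that architectural event is NOT available */
	printf("core cycles:   %s\n", (ebx & 1) ? "unavailable" : "ok");
	return 0;
}

A version id of 0 reported here is the condition the 6/15 BIOS
workaround in arch_perfmon_setup_counters() below papers over.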
Index: linux/arch/x86/oprofile/nmi_int.c
===================================================================
--- linux.orig/arch/x86/oprofile/nmi_int.c
+++ linux/arch/x86/oprofile/nmi_int.c
@@ -388,6 +388,19 @@ static int __init ppro_init(char **cpu_t
 	return 1;
 }
 
+static int force_arch_perfmon;
+module_param(force_arch_perfmon, int, 0);
+
+static int __init arch_perfmon_init(char **cpu_type)
+{
+	if (!cpu_has_arch_perfmon)
+		return 0;
+	*cpu_type = "i386/arch_perfmon";
+	model = &op_arch_perfmon_spec;
+	arch_perfmon_setup_counters();
+	return 1;
+}
+
 /* in order to get sysfs right */
 static int using_nmi;
 
@@ -395,7 +408,7 @@ int __init op_nmi_init(struct oprofile_o
 {
 	__u8 vendor = boot_cpu_data.x86_vendor;
 	__u8 family = boot_cpu_data.x86;
-	char *cpu_type;
+	char *cpu_type = NULL;
 
 	if (!cpu_has_apic)
 		return -ENODEV;
@@ -425,22 +438,29 @@ int __init op_nmi_init(struct oprofile_o
 		break;
 
 	case X86_VENDOR_INTEL:
+		if (force_arch_perfmon) {
+			if (!arch_perfmon_init(&cpu_type))
+				return -ENODEV;
+			break;
+		}
+
 		switch (family) {
 			/* Pentium IV */
 			case 0xf:
-				if (!p4_init(&cpu_type))
-					return -ENODEV;
+				p4_init(&cpu_type);
 				break;
 
 			/* A P6-class processor */
 			case 6:
-				if (!ppro_init(&cpu_type))
-					return -ENODEV;
+				ppro_init(&cpu_type);
 				break;
 
 			default:
-				return -ENODEV;
+				break;
 		}
+
+		if (!cpu_type && !arch_perfmon_init(&cpu_type))
+			return -ENODEV;
 		break;
 
 	default:
Index: linux/arch/x86/oprofile/op_model_ppro.c
===================================================================
--- linux.orig/arch/x86/oprofile/op_model_ppro.c
+++ linux/arch/x86/oprofile/op_model_ppro.c
@@ -1,32 +1,35 @@
 /*
  * @file op_model_ppro.h
- * pentium pro / P6 model-specific MSR operations
+ * Family 6 perfmon and architectural perfmon MSR operations
  *
  * @remark Copyright 2002 OProfile authors
+ * @remark Copyright 2008 Intel Corporation
  * @remark Read the file COPYING
  *
  * @author John Levon
  * @author Philippe Elie
  * @author Graydon Hoare
+ * @author Andi Kleen
  */
 
 #include <linux/oprofile.h>
+#include <linux/slab.h>
 #include <asm/ptrace.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
+#include <asm/intel_arch_perfmon.h>
 
 #include "op_x86_model.h"
 #include "op_counter.h"
 
-#define NUM_COUNTERS 2
-#define NUM_CONTROLS 2
+static int num_counters = 2;
+static int counter_width = 32;
+static int arch_perfmon;
 
 #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
 #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
-#define CTR_32BIT_WRITE(l, msrs, c)	\
-	do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0)
-#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
+#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1))))
 
 #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
 #define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
@@ -40,20 +43,20 @@
 #define CTRL_SET_UM(val, m) (val |= (m << 8))
 #define CTRL_SET_EVENT(val, e) (val |= e)
 
-static unsigned long reset_value[NUM_COUNTERS];
+static u64 *reset_value;
 
 static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
 
-	for (i = 0; i < NUM_COUNTERS; i++) {
+	for (i = 0; i < num_counters; i++) {
 		if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
 			msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
 		else
 			msrs->counters[i].addr = 0;
 	}
 
-	for (i = 0; i < NUM_CONTROLS; i++) {
+	for (i = 0; i < num_counters; i++) {
 		if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
 			msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
 		else
@@ -67,8 +70,22 @@ static void ppro_setup_ctrs(struct op_ms
 	unsigned int low, high;
 	int i;
 
+	if (!reset_value) {
+		reset_value = kmalloc(sizeof(reset_value[0]) * num_counters,
+					GFP_ATOMIC);
+		if (!reset_value)
+			return;
+	}
+
+	if (cpu_has_arch_perfmon) {
+		union cpuid10_eax eax;
+		eax.full = cpuid_eax(0xa);
+		if (counter_width < eax.split.bit_width)
+			counter_width = eax.split.bit_width;
+	}
+
 	/* clear all counters */
-	for (i = 0 ; i < NUM_CONTROLS; ++i) {
+	for (i = 0 ; i < num_counters; ++i) {
 		if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
 			continue;
 		CTRL_READ(low, high, msrs, i);
@@ -77,18 +94,18 @@ static void ppro_setup_ctrs(struct op_ms
 	}
 
 	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
+	for (i = 0; i < num_counters; ++i) {
 		if (unlikely(!CTR_IS_RESERVED(msrs, i)))
 			continue;
-		CTR_32BIT_WRITE(1, msrs, i);
+		wrmsrl(msrs->counters[i].addr, -1LL);
 	}
 
 	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
+	for (i = 0; i < num_counters; ++i) {
 		if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
 			reset_value[i] = counter_config[i].count;
-			CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
+			wrmsrl(msrs->counters[i].addr, -reset_value[i]);
 
 			CTRL_READ(low, high, msrs, i);
 			CTRL_CLEAR(low);
@@ -111,13 +128,13 @@ static int ppro_check_ctrs(struct pt_reg
-	unsigned int low, high;
+	u64 val;
 	int i;
 
-	for (i = 0 ; i < NUM_COUNTERS; ++i) {
+	for (i = 0 ; i < num_counters; ++i) {
 		if (!reset_value[i])
 			continue;
-		CTR_READ(low, high, msrs, i);
-		if (CTR_OVERFLOWED(low)) {
+		rdmsrl(msrs->counters[i].addr, val);
+		if (CTR_OVERFLOWED(val)) {
 			oprofile_add_sample(regs, i);
-			CTR_32BIT_WRITE(reset_value[i], msrs, i);
+			wrmsrl(msrs->counters[i].addr, -reset_value[i]);
 		}
 	}
 
@@ -141,7 +158,7 @@ static void ppro_start(struct op_msrs co
 	unsigned int low, high;
 	int i;
 
-	for (i = 0; i < NUM_COUNTERS; ++i) {
+	for (i = 0; i < num_counters; ++i) {
 		if (reset_value[i]) {
 			CTRL_READ(low, high, msrs, i);
 			CTRL_SET_ACTIVE(low);
@@ -156,7 +173,7 @@ static void ppro_stop(struct op_msrs con
 	unsigned int low, high;
 	int i;
 
-	for (i = 0; i < NUM_COUNTERS; ++i) {
+	for (i = 0; i < num_counters; ++i) {
 		if (!reset_value[i])
 			continue;
 		CTRL_READ(low, high, msrs, i);
@@ -169,20 +186,24 @@ static void ppro_shutdown(struct op_msrs
 {
 	int i;
 
-	for (i = 0 ; i < NUM_COUNTERS ; ++i) {
+	for (i = 0 ; i < num_counters ; ++i) {
 		if (CTR_IS_RESERVED(msrs, i))
 			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
 	}
-	for (i = 0 ; i < NUM_CONTROLS ; ++i) {
+	for (i = 0 ; i < num_counters ; ++i) {
 		if (CTRL_IS_RESERVED(msrs, i))
 			release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
 	}
+	if (reset_value) {
+		kfree(reset_value);
+		reset_value = NULL;
+	}
 }
 
 struct op_x86_model_spec const op_ppro_spec = {
-	.num_counters = NUM_COUNTERS,
-	.num_controls = NUM_CONTROLS,
+	.num_counters = 2,
+	.num_controls = 2,
 	.fill_in_addresses = &ppro_fill_in_addresses,
 	.setup_ctrs = &ppro_setup_ctrs,
 	.check_ctrs = &ppro_check_ctrs,
@@ -190,3 +211,108 @@ struct op_x86_model_spec const op_ppro_s
 	.stop = &ppro_stop,
 	.shutdown = &ppro_shutdown
 };
+
+/*
+ * Architectural performance monitoring.
+ *
+ * Newer Intel CPUs (Core1+) have support for architectural
+ * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
+ * The advantage of this is that it can be done without knowing about
+ * the specific CPU.
+ *
+ * This code needs to know about the architected events in order to
+ * check CPUID for them. We do that in the kernel to avoid special
+ * casing that in user space. The number of architected events is
+ * quite small and stable so that's not a big burden.
+ */
+
+static const struct apevent {
+	u8 unit;
+	u8 event;
+} apevents[] = {
+	/* Order must match the bitmask in CPUID 0xa.ebx */
+	{ 0, 0x3c },		/* unhalted core cycles */
+	{ 0, 0xc0 },		/* instructions retired */
+	{ 1, 0x3c },		/* unhalted reference cycles */
+	{ 0x4f, 0x2e },		/* LLC references */
+	{ 0x41, 0x2e },		/* LLC misses */
+	{ 0, 0xc4 },		/* branch instructions retired */
+	{ 0, 0xc5 },		/* branch misses retired */
+};
+
+static int lookup_apevent(unsigned unit, unsigned event)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(apevents); i++) {
+		if (unit == apevents[i].unit && event == apevents[i].event)
+			return i;
+	}
+	return -1;
+}
+
+void arch_perfmon_setup_counters(void)
+{
+	union cpuid10_eax eax;
+
+	eax.full = cpuid_eax(0xa);
+
+	/* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
+	if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
+	    current_cpu_data.x86_model == 15) {
+		eax.split.version_id = 2;
+		eax.split.num_counters = 2;
+		eax.split.bit_width = 40;
+	}
+
+	num_counters = eax.split.num_counters;
+
+	op_arch_perfmon_spec.num_counters = num_counters;
+	op_arch_perfmon_spec.num_controls = num_counters;
+
+	arch_perfmon = 1;
+}
+
+static void arch_perfmon_setup_ctrs(struct op_msrs const * const msrs)
+{
+	int i;
+	union cpuid10_eax eax;
+	u32 supp, dummy;
+
+	cpuid(0xa, &eax.full, &supp, &dummy, &dummy);
+
+	/* Check if events are architectural and supported in CPUID */
+	for (i = 0; i < num_counters; i++) {
+		int index;
+		struct op_counter_config *cc = &counter_config[i];
+
+		if (!cc->enabled)
+			continue;
+		if (!CTR_IS_RESERVED(msrs, i))
+			continue;
+		index = lookup_apevent(cc->unit_mask, cc->event);
+
+		/*
+		 * When a bit is set in the EBX vector the event is not
+		 * supported.
+		 */
+		if (index < 0 || index >= eax.split.mask_length ||
+		    ((1UL << index) & supp)) {
+			printk(KERN_WARNING
+	"oprofile: Ignoring unsupported arch perfmon event %d (%lx:%lx)\n",
+				i, cc->unit_mask, cc->event);
+			cc->enabled = 0;
+		}
+	}
+
+	ppro_setup_ctrs(msrs);
+}
+
+struct op_x86_model_spec op_arch_perfmon_spec = {
+	/* num_counters/num_controls filled in at runtime */
+	.fill_in_addresses = &ppro_fill_in_addresses,
+	.setup_ctrs = &arch_perfmon_setup_ctrs,
+	.check_ctrs = &ppro_check_ctrs,
+	.start = &ppro_start,
+	.stop = &ppro_stop,
+	.shutdown = &ppro_shutdown
+};
Index: linux/arch/x86/oprofile/op_x86_model.h
===================================================================
--- linux.orig/arch/x86/oprofile/op_x86_model.h
+++ linux/arch/x86/oprofile/op_x86_model.h
@@ -47,5 +47,8 @@ extern struct op_x86_model_spec const op
 extern struct op_x86_model_spec const op_p4_spec;
 extern struct op_x86_model_spec const op_p4_ht2_spec;
 extern struct op_x86_model_spec const op_athlon_spec;
+extern struct op_x86_model_spec op_arch_perfmon_spec;
+
+extern void arch_perfmon_setup_counters(void);
 
 #endif /* OP_X86_MODEL_H */
Index: linux/Documentation/kernel-parameters.txt
===================================================================
--- linux.orig/Documentation/kernel-parameters.txt
+++ linux/Documentation/kernel-parameters.txt
@@ -1408,6 +1408,11 @@ and is between 256 and 4096 characters.
 	oprofile.timer=	[HW]
 			Use timer interrupt instead of performance counters
 
+	oprofile.force_arch_perfmon=1 [X86]
+			Force use of architectural perfmon performance
+			counters in oprofile on Intel CPUs. The kernel
+			selects the correct default on its own.
+
 	osst=		[HW,SCSI] SCSI Tape Driver
 			Format: <buffer_size>,<write threshold>
 			See also Documentation/scsi/st.txt.
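
A note on the availability check in arch_perfmon_setup_ctrs() above:
bit N of the EBX output of CPUID 0xA is inverted, i.e. a set bit means
architectural event N is NOT available, and EAX bits 31:24 (the mask
length) bound how many of those bits are valid. The same test as a
standalone sketch, not part of the patch, with made-up example values:

#include <stdio.h>

/* Mirrors arch_perfmon_setup_ctrs(): an event index is usable only if
 * CPUID describes it (index < mask_length) and its EBX bit is clear. */
static int arch_event_ok(unsigned int ebx, unsigned int mask_length,
			 int index)
{
	if (index < 0 || index >= (int)mask_length)
		return 0;
	return !(ebx & (1U << index));
}

int main(void)
{
	/* Example: 7 valid bits, only bit 2 set, i.e. only "unhalted
	 * reference cycles" is missing on this hypothetical CPU. */
	unsigned int mask_length = 7, ebx = 1U << 2;
	int i;

	for (i = 0; i < 7; i++)
		printf("arch event %d: %s\n", i,
		       arch_event_ok(ebx, mask_length, i) ?
				"ok" : "unsupported");
	return 0;
}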
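
Similarly, on the counter programming in ppro_setup_ctrs() and
ppro_check_ctrs(): the counters are armed with the two's complement of
the sampling period, so a counter crosses the overflow point after
exactly reset_value[i] increments, and CTR_OVERFLOWED() only has to
test whether bit counter_width-1 went clear. A sketch of that
arithmetic, not part of the patch, assuming the 40-bit counter width
reported by Core 2 class CPUs:

#include <stdio.h>
#include <stdint.h>

#define WIDTH	40			/* e.g. bit_width from CPUID 0xA */
#define MASK	((1ULL << WIDTH) - 1)

/* Same test as CTR_OVERFLOWED(): overflowed once the top bit clears */
static int overflowed(uint64_t ctr)
{
	return !(ctr & (1ULL << (WIDTH - 1)));
}

int main(void)
{
	uint64_t period = 100000;
	uint64_t ctr = (0 - period) & MASK;	/* what the wrmsrl stores */

	printf("armed:     %#llx, overflowed=%d\n",
	       (unsigned long long)ctr, overflowed(ctr));

	ctr = (ctr + period) & MASK;		/* after "period" events */
	printf("after hit: %#llx, overflowed=%d\n",
	       (unsigned long long)ctr, overflowed(ctr));
	return 0;
}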