mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-07-21 07:01:04 +02:00
Initial commit
This commit is contained in:
13
Makefile
Normal file
13
Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# user/ and kernel/ are directories and all/clean produce no files, so every target here must be phony.
.PHONY: all user kernel clean
|
||||
|
||||
all: user kernel
|
||||
|
||||
user:
|
||||
$(MAKE) -C user/
|
||||
|
||||
kernel:
|
||||
cd kernel && $(MAKE)
|
||||
|
||||
clean:
|
||||
$(MAKE) -C user/ clean
|
||||
cd kernel && $(MAKE) clean
|
168
README.md
168
README.md
@@ -1 +1,169 @@
|
||||
|
||||
# nanoBench
|
||||
|
||||
*nanoBench* is a Linux-based tool for running small microbenchmarks on recent Intel and AMD x86 CPUs. The microbenchmarks are evaluated using [hardware performance counters](https://en.wikipedia.org/wiki/Hardware_performance_counter). The reading of the performance counters is implemented in a way that incurs only minimal overhead.
|
||||
|
||||
There are two variants of the tool: A user-space implementation and a kernel module. The kernel module makes it possible to benchmark privileged instructions, and it can allow for more accurate measurement results as it disables interrupts and preemptions during measurements. The disadvantage of the kernel module compared to the user-space variant is that it is quite risky to allow arbitrary code to be executed in kernel space. Therefore, the kernel module should not be used on a production system.
|
||||
|
||||
*nanoBench* is used for running the microbenchmarks for obtaining the latency, throughput, and port usage data that is available on [uops.info](http://www.uops.info).
|
||||
|
||||
## Installation
|
||||
|
||||
### User-space Version
|
||||
|
||||
sudo apt install msr-tools
|
||||
git clone https://github.com/andreas-abel/nanoBench.git
|
||||
cd nanoBench
|
||||
make user
|
||||
|
||||
### Kernel Module
|
||||
*Note: The following is not necessary if you would just like to use the user-space version.*
|
||||
|
||||
git clone https://github.com/andreas-abel/nanoBench.git
|
||||
cd nanoBench
|
||||
make kernel
|
||||
|
||||
To load the kernel module, run:
|
||||
|
||||
sudo insmod kernel/nb.ko # this is necessary after every reboot
|
||||
|
||||
## Usage Examples
|
||||
|
||||
The recommended way for using *nanoBench* is with the wrapper scripts `nanoBench.sh` (for the user-space variant) and `kernel-nanoBench.sh` (for the kernel module). The following examples work with both of these scripts.
|
||||
|
||||
For obtaining repeatable results, it can help to disable hyper-threading. This can be done with the `disable-HT.sh` script.
|
||||
|
||||
### Example 1: The ADD Instruction
|
||||
|
||||
The following command will benchmark the assembler code sequence "ADD RAX, RBX; ADD RBX, RAX" on a Skylake-based system.
|
||||
|
||||
sudo ./nanoBench.sh -asm "ADD RAX, RBX; add RBX, RAX" -config configs/cfg_Skylake_common.txt
|
||||
|
||||
It will produce an output similar to the following.
|
||||
|
||||
Instructions retired: 2.00
|
||||
Core cycles: 2.00
|
||||
Reference cycles: 1.85
|
||||
UOPS_ISSUED.ANY: 2.00
|
||||
UOPS_EXECUTED.THREAD: 2.00
|
||||
UOPS_DISPATCHED_PORT.PORT_0: 0.49
|
||||
UOPS_DISPATCHED_PORT.PORT_1: 0.50
|
||||
UOPS_DISPATCHED_PORT.PORT_2: 0.00
|
||||
UOPS_DISPATCHED_PORT.PORT_3: 0.00
|
||||
UOPS_DISPATCHED_PORT.PORT_4: 0.00
|
||||
UOPS_DISPATCHED_PORT.PORT_5: 0.50
|
||||
UOPS_DISPATCHED_PORT.PORT_6: 0.51
|
||||
UOPS_DISPATCHED_PORT.PORT_7: 0.00
|
||||
...
|
||||
|
||||
The tool will *unroll* the assembler code multiple times, i.e., it will create multiple copies of it. The results are averages per copy of the assembler code for multiple runs of the entire generated code sequence.
|
||||
|
||||
The config file contains the required information for configuring the programmable performance counters with the desired events. We provide example configuration files for recent Intel and AMD microarchitectures in the `configs` folder. When using the kernel module, the config file must not be larger than 4kB.
|
||||
|
||||
The assembler code sequence may use and modify any general-purpose or vector registers, including the stack pointer. There is no need to restore the registers to their original values at the end (unless the `-loop` or `-no_mem` options are used).
|
||||
|
||||
R14, RDI, RSI, RSP, and RBP are initialized with addresses in a dedicated buffer (of about 2MB), that can be freely modified by the assembler code. The addresses in R14, RDI, RSI, RSP, and RBP are at least 4kB apart from each other.
|
||||
|
||||
All other registers have initially undefined values. They can, however, be initialized as shown in the following example.
|
||||
|
||||
### Example 2: Load Latency
|
||||
|
||||
sudo ./nanoBench.sh -asm_init "mov RAX, R14; sub RAX, 8; mov [RAX], RAX" -asm "mov RAX, [RAX]" -config configs/cfg_Skylake_common.txt
|
||||
|
||||
The `asm_init` code is executed once in the beginning. It first sets RAX to R14-8 (thus, RAX now contains a valid memory address), and then sets the memory at address RAX to its own address. Then, the `asm` code is executed repeatedly. This code loads the value at the address in RAX into RAX. Thus, the execution time of this instruction corresponds to the L1 data cache latency.
|
||||
|
||||
We will get an output similar to the following.
|
||||
|
||||
Instructions retired: 1.00
|
||||
Core cycles: 4.00
|
||||
Reference cycles: 3.52
|
||||
UOPS_ISSUED.ANY: 1.00
|
||||
UOPS_EXECUTED.THREAD: 1.00
|
||||
UOPS_DISPATCHED_PORT.PORT_0: 0.00
|
||||
UOPS_DISPATCHED_PORT.PORT_1: 0.00
|
||||
UOPS_DISPATCHED_PORT.PORT_2: 0.50
|
||||
UOPS_DISPATCHED_PORT.PORT_3: 0.50
|
||||
...
|
||||
MEM_LOAD_RETIRED.L1_HIT: 1.00
|
||||
MEM_LOAD_RETIRED.L1_MISS: 0.00
|
||||
...
|
||||
|
||||
## Generated Code
|
||||
|
||||
We will now take a look behind the scenes at the code that *nanoBench* generates for evaluating a microbenchmark.
|
||||
|
||||
int run(code, code_init, local_unroll_count):
|
||||
int measurements[n_measurements]
|
||||
|
||||
for i=-warm_up_count to n_measurements
|
||||
save_regs
|
||||
code_init
|
||||
m1 = read_perf_ctrs // stores results in memory, does not modify registers
|
||||
for j=0 to loop_count // this line is omitted if loop_count=0
|
||||
code // (copy #1)
|
||||
code // (copy #2)
|
||||
⋮
|
||||
code // (copy #local_unroll_count)
|
||||
m2 = read_perf_ctrs
|
||||
restore_regs
|
||||
if i >= 0: // ignore warm-up runs
|
||||
measurements[i] = m2 - m1
|
||||
|
||||
return agg(measurements) // apply selected aggregate function
|
||||
|
||||
`run(...)` is executed twice: The first time with `local_unroll_count = unroll_count`, and the second time with `local_unroll_count = 2 * unroll_count`. If the `-basic_mode` option is used, the first execution is with no instructions between `m1 = read_perf_ctrs` and `m2 = read_perf_ctrs`, and the second with `local_unroll_count = unroll_count`.
|
||||
|
||||
|
||||
The result that is finally reported by *nanoBench* is the difference between these two executions divided by `max(loop_count * unroll_count, unroll_count)`.
|
||||
|
||||
Before the first execution of `run(...)`, the performance counters are configured according to the event specifications in the `-config` file. If this file contains more events than there are programmable performance counters available, `run(...)` is executed multiple times with different performance counter configurations.
|
||||
|
||||
|
||||
|
||||
## Command-line Options
|
||||
|
||||
Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line parameters. All parameters are optional. Parameter names may be abbreviated if the abbreviation is unique (e.g., `-l` may be used instead of `-loop_count`).
|
||||
|
||||
| Option | Description |
|
||||
|------------------------------|-------------|
|
||||
| `-asm <code>` | Assembler code sequence (in Intel syntax) containing the code to be benchmarked. |
|
||||
| `-asm_init <code>` | Assembler code sequence (in Intel syntax) that is executed once before executing the benchmark code. |
|
||||
| `-code <filename>` | A binary file containing the code to be benchmarked as raw x86 machine code. *This option cannot be used together with `-asm`.* |
|
||||
| `-code_init <filename>` | A binary file containing code to be executed once before executing the benchmark code. *This option cannot be used together with `-asm_init`.* |
|
||||
| `-config <file>` | File with performance counter event specifications. Details are described [below](#performance-counter-config-files). |
|
||||
| `-n_measurements <n>` | Number of times the measurements are repeated. `[Default: n=10]` |
|
||||
| `-unroll_count <n>` | Number of copies of the benchmark code inside the inner loop. `[Default: n=1000]` |
|
||||
| `-loop_count <n>` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` |
|
||||
| `-warm_up_count <n>` | Number of runs of the generated benchmark code sequence (in each invocation of `run(...)`) before the first measurement result gets recorded. This can, for example, be useful for excluding outliers due to cold caches. `[Default: n=5]` |
|
||||
| `-initial_warm_up_count <n>` | Number of runs of the benchmark code sequence before the first invocation of `run(...)`. This can be useful for benchmarking instructions that require a warm-up period before they can execute at full speed, like [AVX2 instructions on some microarchitectures](https://www.agner.org/optimize/blog/read.php?i=415). `[Default: n=0]` |
|
||||
| `-avg` | Selects the arithmetic mean (excluding the top and bottom 20% of the values) as the aggregate function. `[This is the default]` |
|
||||
| `-median` | Selects the median as the aggregate function. |
|
||||
| `-min` | Selects the minimum as the aggregate function. |
|
||||
| `-basic_mode` | The effect of this option is described in the [Generated Code](#generated-code) section. |
|
||||
| `-no_mem` | If this option is enabled, the code for `read_perf_ctrs` does not make any memory accesses and stores all performance counter values in registers. This can, for example, be useful for benchmarks that require that the state of the data caches does not change after the execution of `code_init`. *If this option is used, the code to be benchmarked must not modify registers* ***R8-R11 (Intel)*** *and* ***R8-R13 (AMD).*** *Furthermore, `read_perf_ctrs` will modify* ***RAX, RCX, and RDX***. |
|
||||
| `-verbose` | Outputs the results of all performance counter readings. In the user-space version, the results are printed to stdout. The output of the kernel module can be accessed using `dmesg`. |
|
||||
|
||||
The following parameters are only supported by `nanoBench.sh`.
|
||||
|
||||
| Option | Description |
|
||||
|------------|-------------|
|
||||
| `-cpu <n>` | Pins the measurement thread to CPU n. `[Default: Pin the thread to the CPU it is currently running on.]` |
|
||||
| `-usr <n>` | If n=1, performance events are counted when the processor is operating at a privilege level greater than 0. `[Default: n=1]` |
|
||||
| `-os <n>` | If n=1, performance events are counted when the processor is operating at privilege level 0. `[Default: n=0]` |
|
||||
|
||||
|
||||
## Performance Counter Config Files
|
||||
|
||||
We provide performance counter configuration files for most recent Intel and AMD CPUs in the `configs` folder. These files can be adapted/reduced to the events you are interested in.
|
||||
|
||||
The format of the entries in the configuration files is
|
||||
|
||||
EvtSel.UMASK(.CMSK=...)(.AnyT)(.EDG)(.INV)(.CTR=...)(.MSR_3F6H=...)(.MSR_PF=...)(.MSR_RSP0=...)(.MSR_RSP1=...) Name
|
||||
|
||||
You can find details on the meanings of the different parts of the entries in chapters 18 and 19 of [Intel's System Programming Guide](https://software.intel.com/sites/default/files/managed/a4/60/325384-sdm-vol-3abcd.pdf).
|
||||
|
||||
## Supported Platforms
|
||||
|
||||
*nanoBench* should work with all Intel processors supporting architectural performance monitoring version ≥ 3, as well as with AMD Family 17h processors.
|
||||
|
||||
The code was developed and tested using Ubuntu 18.04.
|
||||
|
1068
common/nanoBench.c
Normal file
1068
common/nanoBench.c
Normal file
File diff suppressed because it is too large
Load Diff
264
common/nanoBench.h
Normal file
264
common/nanoBench.h
Normal file
@@ -0,0 +1,264 @@
|
||||
#ifndef NANOBENCH_H
|
||||
#define NANOBENCH_H
|
||||
|
||||
#ifdef __KERNEL__
|
||||
#include <linux/module.h>
|
||||
#include <linux/sort.h>
|
||||
#else
|
||||
#include <inttypes.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include <cpuid.h>
|
||||
|
||||
#ifdef __KERNEL__
|
||||
#define print_error(...) pr_debug(__VA_ARGS__)
|
||||
// Logs via pr_debug only when the global `verbose` flag is set. Wrapped in do/while(0) so the macro
// behaves as a single statement (no dangling-else when used inside an unbraced if/else).
#define print_verbose(...) do { if (verbose) pr_debug(__VA_ARGS__); } while (0)
|
||||
#define print_user_verbose(...) pr_debug(__VA_ARGS__)
|
||||
#define nb_strtoul(s, base, res) kstrtoul(s, base, res)
|
||||
#define qsort(base, n, size, comp) sort(base, n, size, comp, NULL)
|
||||
#else
|
||||
// Prints the formatted message followed by a newline to stderr. Wrapped in do/while(0) so that both
// fprintf calls are guarded together when the macro is used as the body of an unbraced if/else.
#define print_error(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while (0)
|
||||
// Prints to stdout only when the global `verbose` flag is set. Wrapped in do/while(0) so the macro
// behaves as a single statement (no dangling-else, no stray ';').
#define print_verbose(...) do { if (verbose) printf(__VA_ARGS__); } while (0)
|
||||
// User-space variant: prints to stdout only when the global `verbose` flag is set. Wrapped in
// do/while(0) so the macro behaves as a single statement (no dangling-else, no stray ';').
#define print_user_verbose(...) do { if (verbose) printf(__VA_ARGS__); } while (0)
|
||||
// User-space stand-in for the kernel's kstrtoul: parses s in the given base and stores the value in *res.
// Arguments and the whole expansion are parenthesized so that expressions such as `p + 1` work as res.
// NOTE(review): this evaluates to the parsed value, whereas kstrtoul presumably yields an error code;
// callers should not rely on the result of the expression.
#define nb_strtoul(s, base, res) (*(res) = strtoul((s), NULL, (base)))
|
||||
#endif
|
||||
|
||||
#ifndef MSR_IA32_PMC0
|
||||
#define MSR_IA32_PMC0 0x0C1
|
||||
#endif
|
||||
#ifndef MSR_IA32_PERFEVTSEL0
|
||||
#define MSR_IA32_PERFEVTSEL0 0x186
|
||||
#endif
|
||||
#ifndef MSR_OFFCORE_RSP0
|
||||
#define MSR_OFFCORE_RSP0 0x1A6
|
||||
#endif
|
||||
#ifndef MSR_OFFCORE_RSP1
|
||||
#define MSR_OFFCORE_RSP1 0x1A7
|
||||
#endif
|
||||
#ifndef MSR_IA32_FIXED_CTR0
|
||||
#define MSR_IA32_FIXED_CTR0 0x309
|
||||
#endif
|
||||
#ifndef MSR_IA32_FIXED_CTR_CTRL
|
||||
#define MSR_IA32_FIXED_CTR_CTRL 0x38D
|
||||
#endif
|
||||
#ifndef MSR_IA32_PERF_GLOBAL_CTRL
|
||||
#define MSR_IA32_PERF_GLOBAL_CTRL 0x38F
|
||||
#endif
|
||||
#ifndef MSR_PEBS_FRONTEND
|
||||
#define MSR_PEBS_FRONTEND 0x3F7
|
||||
#endif
|
||||
#ifndef CORE_X86_MSR_PERF_CTL
|
||||
#define CORE_X86_MSR_PERF_CTL 0xC0010200
|
||||
#endif
|
||||
#ifndef CORE_X86_MSR_PERF_CTR
|
||||
#define CORE_X86_MSR_PERF_CTR 0xC0010201
|
||||
#endif
|
||||
|
||||
|
||||
// How often the measurement will be repeated.
|
||||
extern long n_measurements;
|
||||
#define N_MEASUREMENTS_DEFAULT 10 // default value for n_measurements
|
||||
|
||||
// How often the code to be measured will be unrolled.
|
||||
extern long unroll_count;
|
||||
#define UNROLL_COUNT_DEFAULT 1000 // default value for unroll_count
|
||||
|
||||
// Number of iterations of the inner loop. Setting this to 0 will disable the inner loop; the code to be measured is then executed unroll_count many times.
|
||||
extern long loop_count;
|
||||
#define LOOP_COUNT_DEFAULT 0 // default value for loop_count (inner loop disabled)
|
||||
|
||||
// Number of executions of the measurement code before each sequence of measurement runs.
|
||||
extern long warm_up_count;
|
||||
#define WARM_UP_COUNT_DEFAULT 5 // default value for warm_up_count
|
||||
|
||||
// Number of executions of the measurement code before the first measurement.
|
||||
extern long initial_warm_up_count;
|
||||
#define INITIAL_WARM_UP_COUNT_DEFAULT 0 // default value for initial_warm_up_count
|
||||
|
||||
// If enabled, the temporary performance counter values are stored in registers instead of in memory;
|
||||
// the code to be measured must then not use registers R8-R13
|
||||
extern int no_mem;
|
||||
#define NO_MEM_DEFAULT 0 // default value for no_mem (disabled)
|
||||
|
||||
// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two
|
||||
// measurements.
|
||||
// If enabled, the first measurement is performed with unroll_count and the second with an empty measurement body; the reported result is the difference
|
||||
// between the two measurements.
|
||||
extern int basic_mode;
|
||||
#define BASIC_MODE_DEFAULT 0 // default value for basic_mode (disabled)
|
||||
|
||||
enum agg_enum {AVG_20_80, MIN, MED};
|
||||
extern int aggregate_function;
|
||||
#define AGGREGATE_FUNCTION_DEFAULT AVG_20_80 // default value for aggregate_function
|
||||
|
||||
extern int verbose;
|
||||
#define VERBOSE_DEFAULT 0 // default value for verbose (disabled)
|
||||
|
||||
extern char* code;
|
||||
extern size_t code_length;
|
||||
|
||||
extern char* code_init;
|
||||
extern size_t code_init_length;
|
||||
|
||||
struct pfc_config {
|
||||
unsigned long evt_num;
|
||||
unsigned long umask;
|
||||
unsigned long cmask;
|
||||
unsigned int any;
|
||||
unsigned int edge;
|
||||
unsigned int inv;
|
||||
unsigned long msr_3f6h;
|
||||
unsigned long msr_pf;
|
||||
unsigned long msr_rsp0;
|
||||
unsigned long msr_rsp1;
|
||||
unsigned int invalid;
|
||||
char* description;
|
||||
};
|
||||
|
||||
extern struct pfc_config pfc_configs[];
|
||||
extern size_t n_pfc_configs;
|
||||
|
||||
extern char* pfc_config_file_content;
|
||||
|
||||
extern int is_Intel_CPU;
|
||||
extern int is_AMD_CPU;
|
||||
|
||||
#define MAX_PROGRAMMABLE_COUNTERS 6
|
||||
extern int n_programmable_counters;
|
||||
|
||||
// Pointer to a memory region that is writable and executable.
|
||||
extern char* runtime_code;
|
||||
|
||||
// During measurement, RSP, RBP, RDI, RSI, and R14 will point to locations in runtime_mem.
|
||||
extern void* runtime_mem;
|
||||
|
||||
// Stores performance counter values during measurements.
|
||||
extern int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
|
||||
|
||||
// Stores the RSP during measurements.
|
||||
extern void* RSP_mem;
|
||||
|
||||
extern int64_t* measurement_results[MAX_PROGRAMMABLE_COUNTERS];
|
||||
extern int64_t* measurement_results_base[MAX_PROGRAMMABLE_COUNTERS];
|
||||
|
||||
// Process should be pinned to this CPU.
|
||||
extern int cpu;
|
||||
|
||||
// Checks whether we have an Intel or AMD CPU and determines the number of programmable counters.
|
||||
// Returns 0 if successful, 1 otherwise.
|
||||
int check_cpuid(void);
|
||||
|
||||
void parse_counter_configs(void);
|
||||
|
||||
uint64_t read_value_from_cmd(char* cmd);
|
||||
|
||||
uint64_t read_msr(unsigned int msr);
|
||||
void write_msr(unsigned int msr, uint64_t value);
|
||||
|
||||
// Enables and clears the fixed-function performance counters.
|
||||
void configure_perf_ctrs_FF(unsigned int usr, unsigned int os);
|
||||
|
||||
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
|
||||
// start and end are indices into the pfc_configs array.
|
||||
void configure_perf_ctrs_programmable(int start, int end, unsigned int usr, unsigned int os);
|
||||
|
||||
|
||||
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count);
|
||||
void run_warmup_experiment(char* measurement_template);
|
||||
void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count);
|
||||
|
||||
char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter);
|
||||
int64_t get_aggregate_value_100(int64_t* values, size_t length);
|
||||
int cmpInt64(const void *a, const void *b);
|
||||
long long ll_abs(long long val);
|
||||
|
||||
void print_all_measurement_results(int64_t* results[], int n_counters);
|
||||
|
||||
|
||||
#define MAGIC_BYTES_INIT 0x10b513b1C2813F04
|
||||
#define MAGIC_BYTES_CODE 0x20b513b1C2813F04
|
||||
#define MAGIC_BYTES_RSP_ADDRESS 0x30b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_MEM 0x40b513b1C2813F04
|
||||
#define MAGIC_BYTES_PFC 0x50b513b1C2813F04
|
||||
#define MAGIC_BYTES_TEMPLATE_END 0x60b513b1C2813F04
|
||||
|
||||
#define STRINGIFY2(X) #X
|
||||
#define STRINGIFY(X) STRINGIFY2(X)
|
||||
|
||||
int starts_with_magic_bytes(char* c, int64_t magic_bytes);
|
||||
|
||||
// The following functions must not use global variables (or anything that uses RIP-relative addressing)
|
||||
void measurement_template_Intel(void);
|
||||
void measurement_template_Intel_noMem(void);
|
||||
void measurement_template_AMD(void);
|
||||
void measurement_template_AMD_noMem(void);
|
||||
void measurement_FF_template_Intel(void);
|
||||
void measurement_FF_template_Intel_noMem(void);
|
||||
void measurement_FF_template_AMD(void);
|
||||
void measurement_FF_template_AMD_noMem(void);
|
||||
void measurement_RDTSC_template(void);
|
||||
void measurement_RDTSC_template_noMem(void);
|
||||
|
||||
// RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
|
||||
#define SAVE_REGS_FLAGS() \
|
||||
asm volatile( \
|
||||
".intel_syntax noprefix\n" \
|
||||
"push rbx\n" \
|
||||
"push rbp\n" \
|
||||
"push r12\n" \
|
||||
"push r13\n" \
|
||||
"push r14\n" \
|
||||
"push r15\n" \
|
||||
"pushfq\n" \
|
||||
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
|
||||
"mov [r15], rsp\n" \
|
||||
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_MEM)"\n" \
|
||||
"add rsp, 0xfffff\n" \
|
||||
"mov r15, 0xfff\n" /*4 kB alignment*/ \
|
||||
"not r15\n" \
|
||||
"and rsp, r15\n" \
|
||||
".att_syntax noprefix");
|
||||
|
||||
#define RESTORE_REGS_FLAGS() \
|
||||
asm volatile( \
|
||||
".intel_syntax noprefix\n" \
|
||||
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
|
||||
"mov rsp, [r15]\n" \
|
||||
"popfq\n" \
|
||||
"pop r15\n" \
|
||||
"pop r14\n" \
|
||||
"pop r13\n" \
|
||||
"pop r12\n" \
|
||||
"pop rbp\n" \
|
||||
"pop rbx\n" \
|
||||
".att_syntax noprefix");
|
||||
|
||||
#define INITIALIZE_REGS() \
|
||||
asm volatile( \
|
||||
".intel_syntax noprefix\n" \
|
||||
"mov rax, 0\n" \
|
||||
"mov rbx, 0\n" \
|
||||
"mov rcx, 0\n" \
|
||||
"mov rdx, 0\n" \
|
||||
"mov r8, 0\n" \
|
||||
"mov r9, 0\n" \
|
||||
"mov r10, 0\n" \
|
||||
"mov r11, 0\n" \
|
||||
"mov r12, 0\n" \
|
||||
"mov r13, 0\n" \
|
||||
"mov r14, rsp\n" \
|
||||
"add r14, 0x1000\n" \
|
||||
"mov rdi, rsp\n" \
|
||||
"add rdi, 0x2000\n" \
|
||||
"mov rsi, rsp\n" \
|
||||
"add rsi, 0x3000\n" \
|
||||
"mov rbp, rsp\n" \
|
||||
"add rbp, 0x4000\n" \
|
||||
".att_syntax noprefix");
|
||||
|
||||
#endif
|
204
configs/cfg_Broadwell_all.txt
Normal file
204
configs/cfg_Broadwell_all.txt
Normal file
@@ -0,0 +1,204 @@
|
||||
# Performance monitoring events for processors based on the Broadwell microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3DH and 06_47H.
|
||||
# See Table 19-8 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
03.02 LD_BLOCKS.STORE_FORWARD
|
||||
03.08 LD_BLOCKS.NO_SR
|
||||
05.01 MISALIGN_MEM_REF.LOADS
|
||||
05.02 MISALIGN_MEM_REF.STORES
|
||||
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
|
||||
08.01 DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK
|
||||
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
|
||||
08.10 DTLB_LOAD_MISSES.WALK_DURATION
|
||||
08.20 DTLB_LOAD_MISSES.STLB_HIT_4K
|
||||
0D.03.CMSK=1 INT_MISC.RECOVERY_CYCLES
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.10 UOPS_ISSUED.FLAGS_MERGE
|
||||
0E.20 UOPS_ISSUED.SLOW_LEA
|
||||
0E.40 UOPS_ISSUED.SINGLE_MUL
|
||||
14.01 ARITH.FPU_DIV_ACTIVE
|
||||
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
|
||||
24.41 L2_RQSTS.DEMAND_DATA_RD_HIT
|
||||
24.50 L2_RQSTS.L2_PF_HIT
|
||||
24.30 L2_RQSTS.L2_PF_MISS
|
||||
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
|
||||
24.E2 L2_RQSTS.ALL_RFO
|
||||
24.E4 L2_RQSTS.ALL_CODE_RD
|
||||
24.F8 L2_RQSTS.ALL_PF
|
||||
27.50 L2_DEMAND_RQSTS.WB_HIT
|
||||
2E.4F LONGEST_LAT_CACHE.REFERENCE
|
||||
2E.41 LONGEST_LAT_CACHE.MISS
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.01 CPU_CLK_THREAD_UNHALTED.REF_XCLK
|
||||
48.01.CTR=2.CMSK=1 L1D_PEND_MISS.PENDING
|
||||
49.01 DTLB_STORE_MISSES.MISS_CAUSES_A_WALK
|
||||
49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
|
||||
49.10 DTLB_STORE_MISSES.WALK_DURATION
|
||||
49.20 DTLB_STORE_MISSES.STLB_HIT_4K
|
||||
4C.02 LOAD_HIT_PRE.HW_PF
|
||||
4F.10 EPT.WALK_CYCLES
|
||||
51.01 L1D.REPLACEMENT
|
||||
58.04 MOVE_ELIMINATION.INT_NOT_ELIMINATED
|
||||
58.08 MOVE_ELIMINATION.SIMD_NOT_ELIMINATED
|
||||
58.01 MOVE_ELIMINATION.INT_ELIMINATED
|
||||
58.02 MOVE_ELIMINATION.SIMD_ELIMINATED
|
||||
5C.01 CPL_CYCLES.RING0
|
||||
5C.02 CPL_CYCLES.RING123
|
||||
5E.01 RS_EVENTS.EMPTY_CYCLES
|
||||
60.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
|
||||
60.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
|
||||
60.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
|
||||
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
|
||||
63.01 LOCK_CYCLES.SPLIT_LOCK_UC_LOCK_DURATION
|
||||
63.02 LOCK_CYCLES.CACHE_LOCK_DURATION
|
||||
79.02 IDQ.EMPTY
|
||||
79.04 IDQ.MITE_UOPS
|
||||
79.08 IDQ.DSB_UOPS
|
||||
79.10 IDQ.MS_DSB_UOPS
|
||||
79.20 IDQ.MS_MITE_UOPS
|
||||
79.30 IDQ.MS_UOPS
|
||||
79.18.CMSK=1 IDQ.ALL_DSB_CYCLES_ANY_UOPS
|
||||
79.18.CMSK=4 IDQ.ALL_DSB_CYCLES_4_UOPS
|
||||
79.24.CMSK=1 IDQ.ALL_MITE_CYCLES_ANY_UOPS
|
||||
79.24.CMSK=4 IDQ.ALL_MITE_CYCLES_4_UOPS
|
||||
79.3C IDQ.MITE_ALL_UOPS
|
||||
80.02 ICACHE.MISSES
|
||||
85.01 ITLB_MISSES.MISS_CAUSES_A_WALK
|
||||
85.02 ITLB_MISSES.WALK_COMPLETED_4K
|
||||
85.10 ITLB_MISSES.WALK_DURATION
|
||||
85.20 ITLB_MISSES.STLB_HIT_4K
|
||||
87.01 ILD_STALL.LCP
|
||||
88.01 BR_INST_EXEC.COND
|
||||
88.02 BR_INST_EXEC.DIRECT_JMP
|
||||
88.04 BR_INST_EXEC.INDIRECT_JMP_NON_CALL_RET
|
||||
88.08 BR_INST_EXEC.RETURN_NEAR
|
||||
88.10 BR_INST_EXEC.DIRECT_NEAR_CALL
|
||||
88.20 BR_INST_EXEC.INDIRECT_NEAR_CALL
|
||||
88.40 BR_INST_EXEC.NONTAKEN
|
||||
88.80 BR_INST_EXEC.TAKEN
|
||||
88.FF BR_INST_EXEC.ALL_BRANCHES
|
||||
89.01 BR_MISP_EXEC.COND
|
||||
89.04 BR_MISP_EXEC.INDIRECT_JMP_NON_CALL_RET
|
||||
89.08 BR_MISP_EXEC.RETURN_NEAR
|
||||
89.10 BR_MISP_EXEC.DIRECT_NEAR_CALL
|
||||
89.20 BR_MISP_EXEC.INDIRECT_NEAR_CALL
|
||||
89.40 BR_MISP_EXEC.NONTAKEN
|
||||
89.80 BR_MISP_EXEC.TAKEN
|
||||
89.FF BR_MISP_EXEC.ALL_BRANCHES
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.08 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.10 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.20 UOPS_DISPATCHED_PORT.PORT_5
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
||||
A2.01 RESOURCE_STALLS.ANY
|
||||
A2.04 RESOURCE_STALLS.RS
|
||||
A2.08 RESOURCE_STALLS.SB
|
||||
A2.10 RESOURCE_STALLS.ROB
|
||||
A8.01 LSD.UOPS
|
||||
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
|
||||
AE.01 ITLB.ITLB_FLUSH
|
||||
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
|
||||
B0.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
|
||||
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
|
||||
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
B1.02 UOPS_EXECUTED.CORE
|
||||
B7.01.CTR=0.MSR_RSP0=0x10001 OFF_CORE_RESPONSE_0.DMND_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10002 OFF_CORE_RESPONSE_0.DMND_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10004 OFF_CORE_RESPONSE_0.DMND_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10008 OFF_CORE_RESPONSE_0.WB
|
||||
B7.01.CTR=0.MSR_RSP0=0x10010 OFF_CORE_RESPONSE_0.PF_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10020 OFF_CORE_RESPONSE_0.PF_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10040 OFF_CORE_RESPONSE_0.PF_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10080 OFF_CORE_RESPONSE_0.PF_LLC_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10100 OFF_CORE_RESPONSE_0.PF_LLC_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10200 OFF_CORE_RESPONSE_0.PF_LLC_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10400 OFF_CORE_RESPONSE_0.BUS_LOCKS
|
||||
B7.01.CTR=0.MSR_RSP0=0x10800 OFF_CORE_RESPONSE_0.STRM_ST
|
||||
B7.01.CTR=0.MSR_RSP0=0x18000 OFF_CORE_RESPONSE_0.OTHER
|
||||
BB.01.CTR=1.MSR_RSP1=0x10001 OFF_CORE_RESPONSE_1.DMND_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10002 OFF_CORE_RESPONSE_1.DMND_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10004 OFF_CORE_RESPONSE_1.DMND_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10008 OFF_CORE_RESPONSE_1.WB
|
||||
BB.01.CTR=1.MSR_RSP1=0x10010 OFF_CORE_RESPONSE_1.PF_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10020 OFF_CORE_RESPONSE_1.PF_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10040 OFF_CORE_RESPONSE_1.PF_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10080 OFF_CORE_RESPONSE_1.PF_LLC_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10100 OFF_CORE_RESPONSE_1.PF_LLC_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10200 OFF_CORE_RESPONSE_1.PF_LLC_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10400 OFF_CORE_RESPONSE_1.BUS_LOCKS
|
||||
BB.01.CTR=1.MSR_RSP1=0x10800 OFF_CORE_RESPONSE_1.STRM_ST
|
||||
BB.01.CTR=1.MSR_RSP1=0x18000 OFF_CORE_RESPONSE_1.OTHER
|
||||
BC.11 PAGE_WALKER_LOADS.DTLB_L1
|
||||
BC.21 PAGE_WALKER_LOADS.ITLB_L1
|
||||
BC.12 PAGE_WALKER_LOADS.DTLB_L2
|
||||
BC.22 PAGE_WALKER_LOADS.ITLB_L2
|
||||
BC.14 PAGE_WALKER_LOADS.DTLB_L3
|
||||
BC.24 PAGE_WALKER_LOADS.ITLB_L3
|
||||
BC.18 PAGE_WALKER_LOADS.DTLB_MEMORY
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C0.01.CTR=1 INST_RETIRED.PREC_DIST
|
||||
C0.02 INST_RETIRED.X87
|
||||
C1.08 OTHER_ASSISTS.AVX_TO_SSE
|
||||
C1.10 OTHER_ASSISTS.SSE_TO_AVX
|
||||
C1.40 OTHER_ASSISTS.ANY_WB_ASSIST
|
||||
C2.01 UOPS_RETIRED.ALL
|
||||
C2.02 UOPS_RETIRED.RETIRE_SLOTS
|
||||
C3.01 MACHINE_CLEARS.CYCLES
|
||||
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C3.20 MACHINE_CLEARS.MASKMOV
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.CONDITIONAL
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C4.04 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.08 BR_INST_RETIRED.NEAR_RETURN
|
||||
C4.10 BR_INST_RETIRED.NOT_TAKEN
|
||||
C4.20 BR_INST_RETIRED.NEAR_TAKEN
|
||||
C4.40 BR_INST_RETIRED.FAR_BRANCH
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.01 BR_MISP_RETIRED.CONDITIONAL
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
CA.02 FP_ASSIST.X87_OUTPUT
|
||||
CA.04 FP_ASSIST.X87_INPUT
|
||||
CA.08 FP_ASSIST.SIMD_OUTPUT
|
||||
CA.10 FP_ASSIST.SIMD_INPUT
|
||||
CA.1E FP_ASSIST.ANY
|
||||
CC.20 ROB_MISC_EVENTS.LBR_INSERTS
|
||||
CD.01.MSR_3F6H=10 MEM_TRANS_RETIRED.LOAD_LATENCY
|
||||
D0.11 MEM_UOPS_RETIRED.STLB_MISS_LOADS
|
||||
D0.12 MEM_UOPS_RETIRED.STLB_MISS_STORES
|
||||
D0.21 MEM_UOPS_RETIRED.LOCK_LOADS
|
||||
D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS
|
||||
D0.42 MEM_UOPS_RETIRED.SPLIT_STORES
|
||||
D0.81 MEM_UOPS_RETIRED.ALL_LOADS
|
||||
D0.82 MEM_UOPS_RETIRED.ALL_STORES
|
||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||
D1.04 MEM_LOAD_UOPS_RETIRED.L3_HIT
|
||||
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
||||
D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS
|
||||
D1.20 MEM_LOAD_UOPS_RETIRED.L3_MISS
|
||||
D1.40 MEM_LOAD_UOPS_RETIRED.HIT_LFB
|
||||
D2.01 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS
|
||||
D2.02 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT
|
||||
D2.04 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM
|
||||
D2.08 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_NONE
|
||||
D3.01 MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM
|
||||
F0.01 L2_TRANS.DEMAND_DATA_RD
|
||||
F0.02 L2_TRANS.RFO
|
||||
F0.04 L2_TRANS.CODE_RD
|
||||
F0.08 L2_TRANS.ALL_PF
|
||||
F0.10 L2_TRANS.L1D_WB
|
||||
F0.20 L2_TRANS.L2_FILL
|
||||
F0.40 L2_TRANS.L2_WB
|
||||
F0.80 L2_TRANS.ALL_REQUESTS
|
||||
F1.01 L2_LINES_IN.I
|
||||
F1.02 L2_LINES_IN.S
|
||||
F1.04 L2_LINES_IN.E
|
||||
F1.07 L2_LINES_IN.ALL
|
||||
F2.05 L2_LINES_OUT.DEMAND_CLEAN
|
23
configs/cfg_Broadwell_common.txt
Normal file
23
configs/cfg_Broadwell_common.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
# Performance monitoring events for processors based on the Broadwell microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3DH and 06_47H.
|
||||
# See Table 19-8 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
C2.01 UOPS_RETIRED.ALL
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.08 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.10 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.20 UOPS_DISPATCHED_PORT.PORT_5
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||
D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS
|
||||
D1.04 MEM_LOAD_UOPS_RETIRED.L3_HIT
|
||||
D1.20 MEM_LOAD_UOPS_RETIRED.L3_MISS
|
233
configs/cfg_Haswell_all.txt
Normal file
233
configs/cfg_Haswell_all.txt
Normal file
@@ -0,0 +1,233 @@
|
||||
# Performance monitoring events for processors based on the Haswell microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3CH, 06_45H and 06_46H.
|
||||
# See Table 19-10 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
03.02 LD_BLOCKS.STORE_FORWARD
|
||||
03.08 LD_BLOCKS.NO_SR
|
||||
05.01 MISALIGN_MEM_REF.LOADS
|
||||
05.02 MISALIGN_MEM_REF.STORES
|
||||
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
|
||||
08.01 DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK
|
||||
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
|
||||
08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
|
||||
08.0E DTLB_LOAD_MISSES.WALK_COMPLETED
|
||||
08.10 DTLB_LOAD_MISSES.WALK_DURATION
|
||||
08.20 DTLB_LOAD_MISSES.STLB_HIT_4K
|
||||
08.40 DTLB_LOAD_MISSES.STLB_HIT_2M
|
||||
08.60 DTLB_LOAD_MISSES.STLB_HIT
|
||||
08.80 DTLB_LOAD_MISSES.PDE_CACHE_MISS
|
||||
0D.03.CMSK=1 INT_MISC.RECOVERY_CYCLES
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.10 UOPS_ISSUED.FLAGS_MERGE
|
||||
0E.20 UOPS_ISSUED.SLOW_LEA
|
||||
0E.40 UOPS_ISSUED.SINGLE_MUL
|
||||
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
|
||||
24.41 L2_RQSTS.DEMAND_DATA_RD_HIT
|
||||
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
|
||||
24.42 L2_RQSTS.RFO_HIT
|
||||
24.22 L2_RQSTS.RFO_MISS
|
||||
24.E2 L2_RQSTS.ALL_RFO
|
||||
24.44 L2_RQSTS.CODE_RD_HIT
|
||||
24.24 L2_RQSTS.CODE_RD_MISS
|
||||
24.27 L2_RQSTS.ALL_DEMAND_MISS
|
||||
24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
|
||||
24.E4 L2_RQSTS.ALL_CODE_RD
|
||||
24.50 L2_RQSTS.L2_PF_HIT
|
||||
24.30 L2_RQSTS.L2_PF_MISS
|
||||
24.F8 L2_RQSTS.ALL_PF
|
||||
24.3F L2_RQSTS.MISS
|
||||
24.FF L2_RQSTS.REFERENCES
|
||||
27.50 L2_DEMAND_RQSTS.WB_HIT
|
||||
2E.4F LONGEST_LAT_CACHE.REFERENCE
|
||||
2E.41 LONGEST_LAT_CACHE.MISS
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.01 CPU_CLK_THREAD_UNHALTED.REF_XCLK
|
||||
48.01.CTR=2 L1D_PEND_MISS.PENDING
|
||||
49.01 DTLB_STORE_MISSES.MISS_CAUSES_A_WALK
|
||||
49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
|
||||
49.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
|
||||
49.0E DTLB_STORE_MISSES.WALK_COMPLETED
|
||||
49.10 DTLB_STORE_MISSES.WALK_DURATION
|
||||
49.20 DTLB_STORE_MISSES.STLB_HIT_4K
|
||||
49.40 DTLB_STORE_MISSES.STLB_HIT_2M
|
||||
49.60 DTLB_STORE_MISSES.STLB_HIT
|
||||
49.80 DTLB_STORE_MISSES.PDE_CACHE_MISS
|
||||
4C.01 LOAD_HIT_PRE.SW_PF
|
||||
4C.02 LOAD_HIT_PRE.HW_PF
|
||||
51.01 L1D.REPLACEMENT
|
||||
58.04 MOVE_ELIMINATION.INT_NOT_ELIMINATED
|
||||
58.08 MOVE_ELIMINATION.SIMD_NOT_ELIMINATED
|
||||
58.01 MOVE_ELIMINATION.INT_ELIMINATED
|
||||
58.02 MOVE_ELIMINATION.SIMD_ELIMINATED
|
||||
5C.01 CPL_CYCLES.RING0
|
||||
5C.02 CPL_CYCLES.RING123
|
||||
5E.01 RS_EVENTS.EMPTY_CYCLES
|
||||
60.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
|
||||
60.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
|
||||
60.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
|
||||
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
|
||||
63.01 LOCK_CYCLES.SPLIT_LOCK_UC_LOCK_DURATION
|
||||
63.02 LOCK_CYCLES.CACHE_LOCK_DURATION
|
||||
79.02 IDQ.EMPTY
|
||||
79.04 IDQ.MITE_UOPS
|
||||
79.08 IDQ.DSB_UOPS
|
||||
79.10 IDQ.MS_DSB_UOPS
|
||||
79.20 IDQ.MS_MITE_UOPS
|
||||
79.30 IDQ.MS_UOPS
|
||||
79.18.CMSK=1 IDQ.ALL_DSB_CYCLES_ANY_UOPS
|
||||
79.18.CMSK=4 IDQ.ALL_DSB_CYCLES_4_UOPS
|
||||
79.24.CMSK=1 IDQ.ALL_MITE_CYCLES_ANY_UOPS
|
||||
79.24.CMSK=4 IDQ.ALL_MITE_CYCLES_4_UOPS
|
||||
79.3C IDQ.MITE_ALL_UOPS
|
||||
80.02 ICACHE.MISSES
|
||||
85.01 ITLB_MISSES.MISS_CAUSES_A_WALK
|
||||
85.02 ITLB_MISSES.WALK_COMPLETED_4K
|
||||
85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
|
||||
85.0E ITLB_MISSES.WALK_COMPLETED
|
||||
85.10 ITLB_MISSES.WALK_DURATION
|
||||
85.20 ITLB_MISSES.STLB_HIT_4K
|
||||
85.40 ITLB_MISSES.STLB_HIT_2M
|
||||
85.60 ITLB_MISSES.STLB_HIT
|
||||
87.01 ILD_STALL.LCP
|
||||
87.04 ILD_STALL.IQ_FULL
|
||||
88.01 BR_INST_EXEC.COND
|
||||
88.02 BR_INST_EXEC.DIRECT_JMP
|
||||
88.04 BR_INST_EXEC.INDIRECT_JMP_NON_CALL_RET
|
||||
88.08 BR_INST_EXEC.RETURN_NEAR
|
||||
88.10 BR_INST_EXEC.DIRECT_NEAR_CALL
|
||||
88.20 BR_INST_EXEC.INDIRECT_NEAR_CALL
|
||||
88.40 BR_INST_EXEC.NONTAKEN
|
||||
88.80 BR_INST_EXEC.TAKEN
|
||||
88.FF BR_INST_EXEC.ALL_BRANCHES
|
||||
89.01 BR_MISP_EXEC.COND
|
||||
89.04 BR_MISP_EXEC.INDIRECT_JMP_NON_CALL_RET
|
||||
89.08 BR_MISP_EXEC.RETURN_NEAR
|
||||
89.10 BR_MISP_EXEC.DIRECT_NEAR_CALL
|
||||
89.20 BR_MISP_EXEC.INDIRECT_NEAR_CALL
|
||||
89.40 BR_MISP_EXEC.NONTAKEN
|
||||
89.80 BR_MISP_EXEC.TAKEN
|
||||
89.FF BR_MISP_EXEC.ALL_BRANCHES
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
|
||||
A1.01 UOPS_EXECUTED_PORT.PORT_0
|
||||
A1.02 UOPS_EXECUTED_PORT.PORT_1
|
||||
A1.04 UOPS_EXECUTED_PORT.PORT_2
|
||||
A1.08 UOPS_EXECUTED_PORT.PORT_3
|
||||
A1.10 UOPS_EXECUTED_PORT.PORT_4
|
||||
A1.20 UOPS_EXECUTED_PORT.PORT_5
|
||||
A1.40 UOPS_EXECUTED_PORT.PORT_6
|
||||
A1.80 UOPS_EXECUTED_PORT.PORT_7
|
||||
A2.01 RESOURCE_STALLS.ANY
|
||||
A2.04 RESOURCE_STALLS.RS
|
||||
A2.08 RESOURCE_STALLS.SB
|
||||
A2.10 RESOURCE_STALLS.ROB
|
||||
A3.01 CYCLE_ACTIVITY.CYCLES_L2_PENDING
|
||||
A3.02 CYCLE_ACTIVITY.CYCLES_LDM_PENDING
|
||||
A3.05 CYCLE_ACTIVITY.STALLS_L2_PENDING
|
||||
A3.08.CTR=2 CYCLE_ACTIVITY.CYCLES_L1D_PENDING
|
||||
A3.0C.CTR=2 CYCLE_ACTIVITY.STALLS_L1D_PENDING
|
||||
A8.01 LSD.UOPS
|
||||
AE.01 ITLB.ITLB_FLUSH
|
||||
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
|
||||
B0.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
|
||||
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
|
||||
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
|
||||
B1.02 UOPS_EXECUTED.CORE
|
||||
B7.01.CTR=0.MSR_RSP0=0x10001 OFF_CORE_RESPONSE_0.DMND_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10002 OFF_CORE_RESPONSE_0.DMND_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10004 OFF_CORE_RESPONSE_0.DMND_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10008 OFF_CORE_RESPONSE_0.COREWB
|
||||
B7.01.CTR=0.MSR_RSP0=0x10010 OFF_CORE_RESPONSE_0.PF_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10020 OFF_CORE_RESPONSE_0.PF_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10040 OFF_CORE_RESPONSE_0.PF_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10080 OFF_CORE_RESPONSE_0.PF_L3_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10100 OFF_CORE_RESPONSE_0.PF_L3_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10200 OFF_CORE_RESPONSE_0.PF_L3_CODE_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10400 OFF_CORE_RESPONSE_0.SPLIT_LOCK_UC_LOCK
|
||||
B7.01.CTR=0.MSR_RSP0=0x10800 OFF_CORE_RESPONSE_0.STRM_ST
|
||||
B7.01.CTR=0.MSR_RSP0=0x18000 OFF_CORE_RESPONSE_0.OTHER
|
||||
BB.01.CTR=1.MSR_RSP1=0x10001 OFF_CORE_RESPONSE_1.DMND_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10002 OFF_CORE_RESPONSE_1.DMND_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10004 OFF_CORE_RESPONSE_1.DMND_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10008 OFF_CORE_RESPONSE_1.COREWB
|
||||
BB.01.CTR=1.MSR_RSP1=0x10010 OFF_CORE_RESPONSE_1.PF_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10020 OFF_CORE_RESPONSE_1.PF_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10040 OFF_CORE_RESPONSE_1.PF_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10080 OFF_CORE_RESPONSE_1.PF_L3_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10100 OFF_CORE_RESPONSE_1.PF_L3_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10200 OFF_CORE_RESPONSE_1.PF_L3_CODE_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10400 OFF_CORE_RESPONSE_1.SPLIT_LOCK_UC_LOCK
|
||||
BB.01.CTR=1.MSR_RSP1=0x10800 OFF_CORE_RESPONSE_1.STRM_ST
|
||||
BB.01.CTR=1.MSR_RSP1=0x18000 OFF_CORE_RESPONSE_1.OTHER
|
||||
BC.11 PAGE_WALKER_LOADS.DTLB_L1
|
||||
BC.21 PAGE_WALKER_LOADS.ITLB_L1
|
||||
BC.12 PAGE_WALKER_LOADS.DTLB_L2
|
||||
BC.22 PAGE_WALKER_LOADS.ITLB_L2
|
||||
BC.14 PAGE_WALKER_LOADS.DTLB_L3
|
||||
BC.24 PAGE_WALKER_LOADS.ITLB_L3
|
||||
BC.18 PAGE_WALKER_LOADS.DTLB_MEMORY
|
||||
BC.28 PAGE_WALKER_LOADS.ITLB_MEMORY
|
||||
BD.01 TLB_FLUSH.DTLB_THREAD
|
||||
BD.20 TLB_FLUSH.STLB_ANY
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C0.01.CTR=1 INST_RETIRED.PREC_DIST
|
||||
C1.08 OTHER_ASSISTS.AVX_TO_SSE
|
||||
C1.10 OTHER_ASSISTS.SSE_TO_AVX
|
||||
C1.40 OTHER_ASSISTS.ANY_WB_ASSIST
|
||||
C2.01 UOPS_RETIRED.ALL
|
||||
C2.02 UOPS_RETIRED.RETIRE_SLOTS
|
||||
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C3.20 MACHINE_CLEARS.MASKMOV
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.CONDITIONAL
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C4.04 BR_INST_RETIRED.ALL_BRANCHES_PEBS
|
||||
C4.08 BR_INST_RETIRED.NEAR_RETURN
|
||||
C4.10 BR_INST_RETIRED.NOT_TAKEN
|
||||
C4.20 BR_INST_RETIRED.NEAR_TAKEN
|
||||
C4.40 BR_INST_RETIRED.FAR_BRANCH
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.01 BR_MISP_RETIRED.CONDITIONAL
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES_PEBS
|
||||
C5.20 BR_MISP_RETIRED.NEAR_TAKEN
|
||||
CA.02 FP_ASSIST.X87_OUTPUT
|
||||
CA.04 FP_ASSIST.X87_INPUT
|
||||
CA.08 FP_ASSIST.SIMD_OUTPUT
|
||||
CA.10 FP_ASSIST.SIMD_INPUT
|
||||
CA.1E FP_ASSIST.ANY
|
||||
CC.20 ROB_MISC_EVENTS.LBR_INSERTS
|
||||
CD.01.MSR_3F6H=10 MEM_TRANS_RETIRED.LOAD_LATENCY
|
||||
D0.11 MEM_UOPS_RETIRED.STLB_MISS_LOADS
|
||||
D0.12 MEM_UOPS_RETIRED.STLB_MISS_STORES
|
||||
D0.21 MEM_UOPS_RETIRED.LOCK_LOADS
|
||||
D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS
|
||||
D0.42 MEM_UOPS_RETIRED.SPLIT_STORES
|
||||
D0.81 MEM_UOPS_RETIRED.ALL_LOADS
|
||||
D0.82 MEM_UOPS_RETIRED.ALL_STORES
|
||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||
D1.04 MEM_LOAD_UOPS_RETIRED.L3_HIT
|
||||
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
||||
D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS
|
||||
D1.20 MEM_LOAD_UOPS_RETIRED.L3_MISS
|
||||
D1.40 MEM_LOAD_UOPS_RETIRED.HIT_LFB
|
||||
D2.01 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS
|
||||
D2.02 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT
|
||||
D2.04 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM
|
||||
D2.08 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_NONE
|
||||
D3.01 MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM
|
||||
E6.1F BACLEARS.ANY
|
||||
F0.01 L2_TRANS.DEMAND_DATA_RD
|
||||
F0.02 L2_TRANS.RFO
|
||||
F0.04 L2_TRANS.CODE_RD
|
||||
F0.08 L2_TRANS.ALL_PF
|
||||
F0.10 L2_TRANS.L1D_WB
|
||||
F0.20 L2_TRANS.L2_FILL
|
||||
F0.40 L2_TRANS.L2_WB
|
||||
F0.80 L2_TRANS.ALL_REQUESTS
|
||||
F1.01 L2_LINES_IN.I
|
||||
F1.02 L2_LINES_IN.S
|
||||
F1.04 L2_LINES_IN.E
|
||||
F1.07 L2_LINES_IN.ALL
|
||||
F2.05 L2_LINES_OUT.DEMAND_CLEAN
|
||||
F2.06 L2_LINES_OUT.DEMAND_DIRTY
|
22
configs/cfg_Haswell_common.txt
Normal file
22
configs/cfg_Haswell_common.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# Performance monitoring events for processors based on the Haswell microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3CH, 06_45H and 06_46H.
|
||||
# See Table 19-10 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
C2.01 UOPS_RETIRED.ALL
|
||||
A1.01 UOPS_EXECUTED_PORT.PORT_0
|
||||
A1.02 UOPS_EXECUTED_PORT.PORT_1
|
||||
A1.04 UOPS_EXECUTED_PORT.PORT_2
|
||||
A1.08 UOPS_EXECUTED_PORT.PORT_3
|
||||
A1.10 UOPS_EXECUTED_PORT.PORT_4
|
||||
A1.20 UOPS_EXECUTED_PORT.PORT_5
|
||||
A1.40 UOPS_EXECUTED_PORT.PORT_6
|
||||
A1.80 UOPS_EXECUTED_PORT.PORT_7
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||
D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS
|
||||
D1.04 MEM_LOAD_UOPS_RETIRED.L3_HIT
|
||||
D1.20 MEM_LOAD_UOPS_RETIRED.L3_MISS
|
230
configs/cfg_IvyBridge_all.txt
Normal file
230
configs/cfg_IvyBridge_all.txt
Normal file
@@ -0,0 +1,230 @@
|
||||
# Performance monitoring events for processors based on the Ivy Bridge microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3AH.
|
||||
# See Table 19-14 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
03.02 LD_BLOCKS.STORE_FORWARD
|
||||
03.08 LD_BLOCKS.NO_SR
|
||||
05.01 MISALIGN_MEM_REF.LOADS
|
||||
05.02 MISALIGN_MEM_REF.STORES
|
||||
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
|
||||
08.81 DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK
|
||||
08.82 DTLB_LOAD_MISSES.WALK_COMPLETED
|
||||
08.84 DTLB_LOAD_MISSES.WALK_DURATION
|
||||
08.88 DTLB_LOAD_MISSES.LARGE_PAGE_WALK_DURATION
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.10 UOPS_ISSUED.FLAGS_MERGE
|
||||
0E.20 UOPS_ISSUED.SLOW_LEA
|
||||
0E.40 UOPS_ISSUED.SINGLE_MUL
|
||||
10.01 FP_COMP_OPS_EXE.X87
|
||||
10.10 FP_COMP_OPS_EXE.SSE_FP_PACKED_DOUBLE
|
||||
10.20 FP_COMP_OPS_EXE.SSE_FP_SCALAR_SINGLE
|
||||
10.40 FP_COMP_OPS_EXE.SSE_PACKED_SINGLE
|
||||
10.80 FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE
|
||||
11.01 SIMD_FP_256.PACKED_SINGLE
|
||||
11.02 SIMD_FP_256.PACKED_DOUBLE
|
||||
14.01 ARITH.FPU_DIV_ACTIVE
|
||||
24.01 L2_RQSTS.DEMAND_DATA_RD_HIT
|
||||
24.03 L2_RQSTS.ALL_DEMAND_DATA_RD
|
||||
24.04 L2_RQSTS.RFO_HITS
|
||||
24.08 L2_RQSTS.RFO_MISS
|
||||
24.0C L2_RQSTS.ALL_RFO
|
||||
24.10 L2_RQSTS.CODE_RD_HIT
|
||||
24.20 L2_RQSTS.CODE_RD_MISS
|
||||
24.30 L2_RQSTS.ALL_CODE_RD
|
||||
24.40 L2_RQSTS.PF_HIT
|
||||
24.80 L2_RQSTS.PF_MISS
|
||||
24.C0 L2_RQSTS.ALL_PF
|
||||
27.01 L2_STORE_LOCK_RQSTS.MISS
|
||||
27.08 L2_STORE_LOCK_RQSTS.HIT_M
|
||||
27.0F L2_STORE_LOCK_RQSTS.ALL
|
||||
28.01 L2_L1D_WB_RQSTS.MISS
|
||||
28.04 L2_L1D_WB_RQSTS.HIT_E
|
||||
28.08 L2_L1D_WB_RQSTS.HIT_M
|
||||
28.0F L2_L1D_WB_RQSTS.ALL
|
||||
2E.4F LONGEST_LAT_CACHE.REFERENCE
|
||||
2E.41 LONGEST_LAT_CACHE.MISS
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.01 CPU_CLK_THREAD_UNHALTED.REF_XCLK
|
||||
48.01.CTR=2 L1D_PEND_MISS.PENDING
|
||||
49.01 DTLB_STORE_MISSES.MISS_CAUSES_A_WALK
|
||||
49.02 DTLB_STORE_MISSES.WALK_COMPLETED
|
||||
49.04 DTLB_STORE_MISSES.WALK_DURATION
|
||||
49.10 DTLB_STORE_MISSES.STLB_HIT
|
||||
4C.01 LOAD_HIT_PRE.SW_PF
|
||||
4C.02 LOAD_HIT_PRE.HW_PF
|
||||
51.01 L1D.REPLACEMENT
|
||||
58.04 MOVE_ELIMINATION.INT_NOT_ELIMINATED
|
||||
58.08 MOVE_ELIMINATION.SIMD_NOT_ELIMINATED
|
||||
58.01 MOVE_ELIMINATION.INT_ELIMINATED
|
||||
58.02 MOVE_ELIMINATION.SIMD_ELIMINATED
|
||||
5C.01 CPL_CYCLES.RING0
|
||||
5C.02 CPL_CYCLES.RING123
|
||||
5E.01 RS_EVENTS.EMPTY_CYCLES
|
||||
5F.04 DTLB_LOAD_MISSES.STLB_HIT
|
||||
60.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
|
||||
60.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
|
||||
60.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
|
||||
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
|
||||
63.01 LOCK_CYCLES.SPLIT_LOCK_UC_LOCK_DURATION
|
||||
63.02 LOCK_CYCLES.CACHE_LOCK_DURATION
|
||||
79.02 IDQ.EMPTY
|
||||
79.04 IDQ.MITE_UOPS
|
||||
79.08 IDQ.DSB_UOPS
|
||||
79.10 IDQ.MS_DSB_UOPS
|
||||
79.20 IDQ.MS_MITE_UOPS
|
||||
79.30 IDQ.MS_UOPS
|
||||
79.18.CMSK=1 IDQ.ALL_DSB_CYCLES_ANY_UOPS
|
||||
79.18.CMSK=4 IDQ.ALL_DSB_CYCLES_4_UOPS
|
||||
79.24.CMSK=1 IDQ.ALL_MITE_CYCLES_ANY_UOPS
|
||||
79.24.CMSK=4 IDQ.ALL_MITE_CYCLES_4_UOPS
|
||||
79.3C IDQ.MITE_ALL_UOPS
|
||||
80.04 ICACHE.IFETCH_STALL
|
||||
80.02 ICACHE.MISSES
|
||||
85.01 ITLB_MISSES.MISS_CAUSES_A_WALK
|
||||
85.02 ITLB_MISSES.WALK_COMPLETED
|
||||
85.04 ITLB_MISSES.WALK_DURATION
|
||||
85.10 ITLB_MISSES.STLB_HIT
|
||||
87.01 ILD_STALL.LCP
|
||||
87.04 ILD_STALL.IQ_FULL
|
||||
88.01 BR_INST_EXEC.COND
|
||||
88.02 BR_INST_EXEC.DIRECT_JMP
|
||||
88.04 BR_INST_EXEC.INDIRECT_JMP_NON_CALL_RET
|
||||
88.08 BR_INST_EXEC.RETURN_NEAR
|
||||
88.10 BR_INST_EXEC.DIRECT_NEAR_CALL
|
||||
88.20 BR_INST_EXEC.INDIRECT_NEAR_CALL
|
||||
88.40 BR_INST_EXEC.NONTAKEN
|
||||
88.80 BR_INST_EXEC.TAKEN
|
||||
88.FF BR_INST_EXEC.ALL_BRANCHES
|
||||
89.01 BR_MISP_EXEC.COND
|
||||
89.04 BR_MISP_EXEC.INDIRECT_JMP_NON_CALL_RET
|
||||
89.08 BR_MISP_EXEC.RETURN_NEAR
|
||||
89.10 BR_MISP_EXEC.DIRECT_NEAR_CALL
|
||||
89.20 BR_MISP_EXEC.INDIRECT_NEAR_CALL
|
||||
89.40 BR_MISP_EXEC.NONTAKEN
|
||||
89.80 BR_MISP_EXEC.TAKEN
|
||||
89.FF BR_MISP_EXEC.ALL_BRANCHES
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.0C UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.30 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_5
|
||||
A2.01 RESOURCE_STALLS.ANY
|
||||
A2.04 RESOURCE_STALLS.RS
|
||||
A2.08 RESOURCE_STALLS.SB
|
||||
A2.10 RESOURCE_STALLS.ROB
|
||||
A3.01 CYCLE_ACTIVITY.CYCLES_L2_PENDING
|
||||
A3.02 CYCLE_ACTIVITY.CYCLES_LDM_PENDING
|
||||
A3.04 CYCLE_ACTIVITY.CYCLES_NO_EXECUTE
|
||||
A3.05 CYCLE_ACTIVITY.STALLS_L2_PENDING
|
||||
A3.06 CYCLE_ACTIVITY.STALLS_LDM_PENDING
|
||||
A3.08.CTR=2 CYCLE_ACTIVITY.CYCLES_L1D_PENDING
|
||||
A3.0C.CTR=2.CMSK=0x0C CYCLE_ACTIVITY.STALLS_L1D_PENDING
|
||||
A8.01 LSD.UOPS
|
||||
AB.01 DSB2MITE_SWITCHES.COUNT
|
||||
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
|
||||
AC.08 DSB_FILL.EXCEED_DSB_LINES
|
||||
AE.01 ITLB.ITLB_FLUSH
|
||||
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
|
||||
B0.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
|
||||
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
|
||||
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
B1.02 UOPS_EXECUTED.CORE
|
||||
B7.01.CTR=0.MSR_RSP0=0x10001 OFF_CORE_RESPONSE_0.DMND_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10002 OFF_CORE_RESPONSE_0.DMND_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10004 OFF_CORE_RESPONSE_0.DMND_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10008 OFF_CORE_RESPONSE_0.WB
|
||||
B7.01.CTR=0.MSR_RSP0=0x10010 OFF_CORE_RESPONSE_0.PF_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10020 OFF_CORE_RESPONSE_0.PF_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10040 OFF_CORE_RESPONSE_0.PF_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10080 OFF_CORE_RESPONSE_0.PF_LLC_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10100 OFF_CORE_RESPONSE_0.PF_LLC_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10200 OFF_CORE_RESPONSE_0.PF_LLC_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10400 OFF_CORE_RESPONSE_0.BUS_LOCKS
|
||||
B7.01.CTR=0.MSR_RSP0=0x10800 OFF_CORE_RESPONSE_0.STRM_ST
|
||||
B7.01.CTR=0.MSR_RSP0=0x18000 OFF_CORE_RESPONSE_0.OTHER
|
||||
BB.01.CTR=1.MSR_RSP1=0x10001 OFF_CORE_RESPONSE_1.DMND_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10002 OFF_CORE_RESPONSE_1.DMND_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10004 OFF_CORE_RESPONSE_1.DMND_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10008 OFF_CORE_RESPONSE_1.WB
|
||||
BB.01.CTR=1.MSR_RSP1=0x10010 OFF_CORE_RESPONSE_1.PF_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10020 OFF_CORE_RESPONSE_1.PF_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10040 OFF_CORE_RESPONSE_1.PF_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10080 OFF_CORE_RESPONSE_1.PF_LLC_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10100 OFF_CORE_RESPONSE_1.PF_LLC_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10200 OFF_CORE_RESPONSE_1.PF_LLC_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10400 OFF_CORE_RESPONSE_1.BUS_LOCKS
|
||||
BB.01.CTR=1.MSR_RSP1=0x10800 OFF_CORE_RESPONSE_1.STRM_ST
|
||||
BB.01.CTR=1.MSR_RSP1=0x18000 OFF_CORE_RESPONSE_1.OTHER
|
||||
BD.01 TLB_FLUSH.DTLB_THREAD
|
||||
BD.20 TLB_FLUSH.STLB_ANY
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C0.01.CTR=1 INST_RETIRED.PREC_DIST
|
||||
C1.08 OTHER_ASSISTS.AVX_STORE
|
||||
C1.10 OTHER_ASSISTS.AVX_TO_SSE
|
||||
C1.20 OTHER_ASSISTS.SSE_TO_AVX
|
||||
C1.80 OTHER_ASSISTS.WB
|
||||
C2.01 UOPS_RETIRED.ALL
|
||||
C2.02 UOPS_RETIRED.RETIRE_SLOTS
|
||||
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C3.20 MACHINE_CLEARS.MASKMOV
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.CONDITIONAL
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C4.04 BR_INST_RETIRED.ALL_BRANCHES_PEBS
|
||||
C4.08 BR_INST_RETIRED.NEAR_RETURN
|
||||
C4.10 BR_INST_RETIRED.NOT_TAKEN
|
||||
C4.20 BR_INST_RETIRED.NEAR_TAKEN
|
||||
C4.40 BR_INST_RETIRED.FAR_BRANCH
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.01 BR_MISP_RETIRED.CONDITIONAL
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES_PEBS
|
||||
C5.20 BR_MISP_RETIRED.NEAR_TAKEN
|
||||
CA.02 FP_ASSIST.X87_OUTPUT
|
||||
CA.04 FP_ASSIST.X87_INPUT
|
||||
CA.08 FP_ASSIST.SIMD_OUTPUT
|
||||
CA.10 FP_ASSIST.SIMD_INPUT
|
||||
CA.1E FP_ASSIST.ANY
|
||||
CC.20 ROB_MISC_EVENTS.LBR_INSERTS
|
||||
CD.01.CTR=3.MSR_3F6H=10 MEM_TRANS_RETIRED.LOAD_LATENCY
|
||||
CD.02.CTR=3 MEM_TRANS_RETIRED.PRECISE_STORE
|
||||
D0.11 MEM_UOPS_RETIRED.STLB_MISS_LOADS
|
||||
D0.12 MEM_UOPS_RETIRED.STLB_MISS_STORES
|
||||
D0.21 MEM_UOPS_RETIRED.LOCK_LOADS
|
||||
D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS
|
||||
D0.42 MEM_UOPS_RETIRED.SPLIT_STORES
|
||||
D0.81 MEM_UOPS_RETIRED.ALL_LOADS
|
||||
D0.82 MEM_UOPS_RETIRED.ALL_STORES
|
||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||
D1.04 MEM_LOAD_UOPS_RETIRED.LLC_HIT
|
||||
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
||||
D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS
|
||||
D1.20 MEM_LOAD_UOPS_RETIRED.LLC_MISS
|
||||
D1.40 MEM_LOAD_UOPS_RETIRED.HIT_LFB
|
||||
D2.01 MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS
|
||||
D2.02 MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT
|
||||
D2.04 MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM
|
||||
D2.08 MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_NONE
|
||||
D3.01 MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM
|
||||
E6.1F BACLEARS.ANY
|
||||
F0.01 L2_TRANS.DEMAND_DATA_RD
|
||||
F0.02 L2_TRANS.RFO
|
||||
F0.04 L2_TRANS.CODE_RD
|
||||
F0.08 L2_TRANS.ALL_PF
|
||||
F0.10 L2_TRANS.L1D_WB
|
||||
F0.20 L2_TRANS.L2_FILL
|
||||
F0.40 L2_TRANS.L2_WB
|
||||
F0.80 L2_TRANS.ALL_REQUESTS
|
||||
F1.01 L2_LINES_IN.I
|
||||
F1.02 L2_LINES_IN.S
|
||||
F1.04 L2_LINES_IN.E
|
||||
F1.07 L2_LINES_IN.ALL
|
||||
F2.01 L2_LINES_OUT.DEMAND_CLEAN
|
||||
F2.02 L2_LINES_OUT.DEMAND_DIRTY
|
||||
F2.04 L2_LINES_OUT.PF_CLEAN
|
||||
F2.08 L2_LINES_OUT.PF_DIRTY
|
||||
F2.0A L2_LINES_OUT.DIRTY_ALL
|
21
configs/cfg_IvyBridge_common.txt
Normal file
21
configs/cfg_IvyBridge_common.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
# Performance monitoring events for processors based on the Ivy Bridge microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3AH.
|
||||
# See Table 19-14 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
C2.01 UOPS_RETIRED.ALL
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.0C UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.30 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_5
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||
D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS
|
||||
D1.04 MEM_LOAD_UOPS_RETIRED.LLC_HIT
|
||||
D1.20 MEM_LOAD_UOPS_RETIRED.LLC_MISS
|
104
configs/cfg_KnightsLanding_all.txt
Normal file
104
configs/cfg_KnightsLanding_all.txt
Normal file
@@ -0,0 +1,104 @@
|
||||
# Performance monitoring events for processors based on the Knights Landing microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_57H and 06_85H.
|
||||
# See Table 19-7 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
03.01 RECYCLEQ.LD_BLOCK_ST_FORWARD
|
||||
03.02 RECYCLEQ.LD_BLOCK_STD_NOTREADY
|
||||
03.04 RECYCLEQ.ST_SPLITS
|
||||
03.08 RECYCLEQ.LD_SPLITS
|
||||
03.10 RECYCLEQ.LOCK
|
||||
03.20 RECYCLEQ.STA_FULL
|
||||
03.40 RECYCLEQ.ANY_LD
|
||||
03.80 RECYCLEQ.ANY_ST
|
||||
04.01 MEM_UOPS_RETIRED.L1_MISS_LOADS
|
||||
04.02 MEM_UOPS_RETIRED.L2_HIT_LOADS
|
||||
04.04 MEM_UOPS_RETIRED.L2_MISS_LOADS
|
||||
04.08 MEM_UOPS_RETIRED.DTLB_MISS_LOADS
|
||||
04.10 MEM_UOPS_RETIRED.UTLB_MISS_LOADS
|
||||
04.20 MEM_UOPS_RETIRED.HITM
|
||||
04.40 MEM_UOPS_RETIRED.ALL_LOADS
|
||||
04.80 MEM_UOPS_RETIRED.ALL_STORES
|
||||
05.01.EDG PAGE_WALKS.D_SIDE_WALKS
|
||||
05.01 PAGE_WALKS.D_SIDE_CYCLES
|
||||
05.02.EDG PAGE_WALKS.I_SIDE_WALKS
|
||||
05.02 PAGE_WALKS.I_SIDE_CYCLES
|
||||
05.03.EDG PAGE_WALKS.WALKS
|
||||
05.03 PAGE_WALKS.CYCLES
|
||||
2E.41 LONGEST_LAT_CACHE.MISS
|
||||
2E.4F LONGEST_LAT_CACHE.REFERENCE
|
||||
30.00 L2_REQUESTS_REJECT.ALL
|
||||
31.00 CORE_REJECT_L2Q.ALL
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.01 CPU_CLK_UNHALTED.REF
|
||||
3E.04 L2_PREFETCHER.ALLOC_XQ
|
||||
80.01 ICACHE.HIT
|
||||
80.02 ICACHE.MISSES
|
||||
80.03 ICACHE.ACCESSES
|
||||
86.04 FETCH_STALL.ICACHE_FILL_PENDING_CYCLES
|
||||
B7.01.CTR=0.MSR_RSP0=0x10001 OFFCORE_RESPONSE_0.DEMAND_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10002 OFFCORE_RESPONSE_0.DEMAND_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10004 OFFCORE_RESPONSE_0.DEMAND_CODE_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10020 OFFCORE_RESPONSE_0.PF_L2_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10040 OFFCORE_RESPONSE_0.PF_L2_CODE_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10080 OFFCORE_RESPONSE_0.PARTIAL_READS
|
||||
B7.01.CTR=0.MSR_RSP0=0x10100 OFFCORE_RESPONSE_0.PARTIAL_WRITES
|
||||
B7.01.CTR=0.MSR_RSP0=0x10200 OFFCORE_RESPONSE_0.UC_CODE_READS
|
||||
B7.01.CTR=0.MSR_RSP0=0x10400 OFFCORE_RESPONSE_0.BUS_LOCKS
|
||||
B7.01.CTR=0.MSR_RSP0=0x10800 OFFCORE_RESPONSE_0.FULL_STREAMING_STORES
|
||||
B7.01.CTR=0.MSR_RSP0=0x11000 OFFCORE_RESPONSE_0.SW_PREFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x12000 OFFCORE_RESPONSE_0.PF_L1_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x14000 OFFCORE_RESPONSE_0.PARTIAL_STREAMING_STORES
|
||||
B7.01.CTR=0.MSR_RSP0=0x18000 OFFCORE_RESPONSE_0.ANY_REQUEST
|
||||
B7.02.CTR=1.MSR_RSP1=0x10001 OFFCORE_RESPONSE_1.DEMAND_DATA_RD
|
||||
B7.02.CTR=1.MSR_RSP1=0x10002 OFFCORE_RESPONSE_1.DEMAND_RFO
|
||||
B7.02.CTR=1.MSR_RSP1=0x10004 OFFCORE_RESPONSE_1.DEMAND_CODE_RD
|
||||
B7.02.CTR=1.MSR_RSP1=0x10020 OFFCORE_RESPONSE_1.PF_L2_RFO
|
||||
B7.02.CTR=1.MSR_RSP1=0x10040 OFFCORE_RESPONSE_1.PF_L2_CODE_RD
|
||||
B7.02.CTR=1.MSR_RSP1=0x10080 OFFCORE_RESPONSE_1.PARTIAL_READS
|
||||
B7.02.CTR=1.MSR_RSP1=0x10100 OFFCORE_RESPONSE_1.PARTIAL_WRITES
|
||||
B7.02.CTR=1.MSR_RSP1=0x10200 OFFCORE_RESPONSE_1.UC_CODE_READS
|
||||
B7.02.CTR=1.MSR_RSP1=0x10400 OFFCORE_RESPONSE_1.BUS_LOCKS
|
||||
B7.02.CTR=1.MSR_RSP1=0x10800 OFFCORE_RESPONSE_1.FULL_STREAMING_STORES
|
||||
B7.02.CTR=1.MSR_RSP1=0x11000 OFFCORE_RESPONSE_1.SW_PREFETCH
|
||||
B7.02.CTR=1.MSR_RSP1=0x12000 OFFCORE_RESPONSE_1.PF_L1_DATA_RD
|
||||
B7.02.CTR=1.MSR_RSP1=0x14000 OFFCORE_RESPONSE_1.PARTIAL_STREAMING_STORES
|
||||
B7.02.CTR=1.MSR_RSP1=0x18000 OFFCORE_RESPONSE_1.ANY_REQUEST
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C2.01 UOPS_RETIRED.MS
|
||||
C2.10 UOPS_RETIRED.ALL
|
||||
C2.20 UOPS_RETIRED.SCALAR_SIMD
|
||||
C2.40 UOPS_RETIRED.PACKED_SIMD
|
||||
C3.01 MACHINE_CLEARS.SMC
|
||||
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
||||
C3.04 MACHINE_CLEARS.FP_ASSIST
|
||||
C3.08 MACHINE_CLEARS.ALL
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.7E BR_INST_RETIRED.JCC
|
||||
C4.BF BR_INST_RETIRED.FAR_BRANCH
|
||||
C4.EB BR_INST_RETIRED.NON_RETURN_IND
|
||||
C4.F7 BR_INST_RETIRED.RETURN
|
||||
C4.F9 BR_INST_RETIRED.CALL
|
||||
C4.FB BR_INST_RETIRED.IND_CALL
|
||||
C4.FD BR_INST_RETIRED.REL_CALL
|
||||
C4.FE BR_INST_RETIRED.TAKEN_JCC
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.7E BR_MISP_RETIRED.JCC
|
||||
C5.BF BR_MISP_RETIRED.FAR_BRANCH
|
||||
C5.EB BR_MISP_RETIRED.NON_RETURN_IND
|
||||
C5.F7 BR_MISP_RETIRED.RETURN
|
||||
C5.F9 BR_MISP_RETIRED.CALL
|
||||
C5.FB BR_MISP_RETIRED.IND_CALL
|
||||
C5.FD BR_MISP_RETIRED.REL_CALL
|
||||
C5.FE BR_MISP_RETIRED.TAKEN_JCC
|
||||
CA.01 NO_ALLOC_CYCLES.ROB_FULL
|
||||
CA.04 NO_ALLOC_CYCLES.MISPREDICTS
|
||||
CA.20 NO_ALLOC_CYCLES.RAT_STALL
|
||||
CA.90 NO_ALLOC_CYCLES.NOT_DELIVERED
|
||||
CA.7F NO_ALLOC_CYCLES.ALL
|
||||
CB.01 RS_FULL_STALL.MEC
|
||||
CB.1F RS_FULL_STALL.ALL
|
||||
CD.01 CYCLES_DIV_BUSY.ALL
|
||||
E6.01 BACLEARS.ALL
|
||||
E6.08 BACLEARS.RETURN
|
||||
E6.10 BACLEARS.COND
|
||||
E7.01 MS_DECODED.MS_ENTRY
|
10
configs/cfg_KnightsLanding_common.txt
Normal file
10
configs/cfg_KnightsLanding_common.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
# Performance monitoring events for processors based on the Knights Landing microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_57H and 06_85H.
|
||||
# See Table 19-7 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
C2.10 UOPS_RETIRED.ALL
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
04.01 MEM_UOPS_RETIRED.L1_MISS_LOADS
|
||||
04.02 MEM_UOPS_RETIRED.L2_HIT_LOADS
|
||||
04.04 MEM_UOPS_RETIRED.L2_MISS_LOADS
|
262
configs/cfg_Nehalem_all.txt
Normal file
262
configs/cfg_Nehalem_all.txt
Normal file
@@ -0,0 +1,262 @@
|
||||
# Performance monitoring events for processors based on the Nehalem microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_1AH, 06_1EH, 06_1FH, and 06_2EH.
|
||||
# See Table 19-20 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
04.07 SB_DRAIN.ANY
|
||||
06.04 STORE_BLOCKS.AT_RET
|
||||
06.08 STORE_BLOCKS.L1D_BLOCK
|
||||
07.01 PARTIAL_ADDRESS_ALIAS
|
||||
08.01 DTLB_LOAD_MISSES.ANY
|
||||
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED
|
||||
08.10 DTLB_LOAD_MISSES.STLB_HIT
|
||||
08.20 DTLB_LOAD_MISSES.PDE_MISS
|
||||
08.80 DTLB_LOAD_MISSES.LARGE_WALK_COMPLETED
|
||||
0B.01 MEM_INST_RETIRED.LOADS
|
||||
0B.02 MEM_INST_RETIRED.STORES
|
||||
0B.10 MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD
|
||||
0C.01 MEM_STORE_RETIRED.DTLB_MISS
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.01.CMSK=1.INV UOPS_ISSUED.STALLED_CYCLES
|
||||
0E.02 UOPS_ISSUED.FUSED
|
||||
0F.02 MEM_UNCORE_RETIRED.OTHER_CORE_L2_HITM
|
||||
0F.08 MEM_UNCORE_RETIRED.REMOTE_CACHE_LOCAL_HOME_HIT
|
||||
0F.10 MEM_UNCORE_RETIRED.REMOTE_DRAM
|
||||
0F.20 MEM_UNCORE_RETIRED.LOCAL_DRAM
|
||||
10.01 FP_COMP_OPS_EXE.X87
|
||||
10.02 FP_COMP_OPS_EXE.MMX
|
||||
10.04 FP_COMP_OPS_EXE.SSE_FP
|
||||
10.08 FP_COMP_OPS_EXE.SSE2_INTEGER
|
||||
10.10 FP_COMP_OPS_EXE.SSE_FP_PACKED
|
||||
10.20 FP_COMP_OPS_EXE.SSE_FP_SCALAR
|
||||
10.40 FP_COMP_OPS_EXE.SSE_SINGLE_PRECISION
|
||||
10.80 FP_COMP_OPS_EXE.SSE_DOUBLE_PRECISION
|
||||
12.01 SIMD_INT_128.PACKED_MPY
|
||||
12.02 SIMD_INT_128.PACKED_SHIFT
|
||||
12.04 SIMD_INT_128.PACK
|
||||
12.08 SIMD_INT_128.UNPACK
|
||||
12.10 SIMD_INT_128.PACKED_LOGICAL
|
||||
12.20 SIMD_INT_128.PACKED_ARITH
|
||||
12.40 SIMD_INT_128.SHUFFLE_MOVE
|
||||
13.01 LOAD_DISPATCH.RS
|
||||
13.02 LOAD_DISPATCH.RS_DELAYED
|
||||
13.04 LOAD_DISPATCH.MOB
|
||||
13.07 LOAD_DISPATCH.ANY
|
||||
14.01 ARITH.CYCLES_DIV_BUSY
|
||||
14.02 ARITH.MUL
|
||||
17.01 INST_QUEUE_WRITES
|
||||
18.01 INST_DECODED.DEC0
|
||||
19.01 TWO_UOP_INSTS_DECODED
|
||||
1E.01 INST_QUEUE_WRITE_CYCLES
|
||||
20.01 LSD_OVERFLOW
|
||||
24.01 L2_RQSTS.LD_HIT
|
||||
24.02 L2_RQSTS.LD_MISS
|
||||
24.03 L2_RQSTS.LOADS
|
||||
24.04 L2_RQSTS.RFO_HIT
|
||||
24.08 L2_RQSTS.RFO_MISS
|
||||
24.0C L2_RQSTS.RFOS
|
||||
24.10 L2_RQSTS.IFETCH_HIT
|
||||
24.20 L2_RQSTS.IFETCH_MISS
|
||||
24.30 L2_RQSTS.IFETCHES
|
||||
24.40 L2_RQSTS.PREFETCH_HIT
|
||||
24.80 L2_RQSTS.PREFETCH_MISS
|
||||
24.C0 L2_RQSTS.PREFETCHES
|
||||
24.AA L2_RQSTS.MISS
|
||||
24.FF L2_RQSTS.REFERENCES
|
||||
26.01 L2_DATA_RQSTS.DEMAND.I_STATE
|
||||
26.02 L2_DATA_RQSTS.DEMAND.S_STATE
|
||||
26.04 L2_DATA_RQSTS.DEMAND.E_STATE
|
||||
26.08 L2_DATA_RQSTS.DEMAND.M_STATE
|
||||
26.0F L2_DATA_RQSTS.DEMAND.MESI
|
||||
26.10 L2_DATA_RQSTS.PREFETCH.I_STATE
|
||||
26.20 L2_DATA_RQSTS.PREFETCH.S_STATE
|
||||
26.40 L2_DATA_RQSTS.PREFETCH.E_STATE
|
||||
26.80 L2_DATA_RQSTS.PREFETCH.M_STATE
|
||||
26.F0 L2_DATA_RQSTS.PREFETCH.MESI
|
||||
26.FF L2_DATA_RQSTS.ANY
|
||||
27.01 L2_WRITE.RFO.I_STATE
|
||||
27.02 L2_WRITE.RFO.S_STATE
|
||||
27.08 L2_WRITE.RFO.M_STATE
|
||||
27.0E L2_WRITE.RFO.HIT
|
||||
27.0F L2_WRITE.RFO.MESI
|
||||
27.10 L2_WRITE.LOCK.I_STATE
|
||||
27.20 L2_WRITE.LOCK.S_STATE
|
||||
27.40 L2_WRITE.LOCK.E_STATE
|
||||
27.80 L2_WRITE.LOCK.M_STATE
|
||||
27.E0 L2_WRITE.LOCK.HIT
|
||||
27.F0 L2_WRITE.LOCK.MESI
|
||||
28.01 L1D_WB_L2.I_STATE
|
||||
28.02 L1D_WB_L2.S_STATE
|
||||
28.04 L1D_WB_L2.E_STATE
|
||||
28.08 L1D_WB_L2.M_STATE
|
||||
28.0F L1D_WB_L2.MESI
|
||||
2E.4F L3_LAT_CACHE.REFERENCE
|
||||
2E.41 L3_LAT_CACHE.MISS
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.01 CPU_CLK_UNHALTED.REF_P
|
||||
40.01.CTR=0 L1D_CACHE_LD.I_STATE
|
||||
40.02.CTR=0 L1D_CACHE_LD.S_STATE
|
||||
40.04.CTR=0 L1D_CACHE_LD.E_STATE
|
||||
40.08.CTR=0 L1D_CACHE_LD.M_STATE
|
||||
40.0F.CTR=0 L1D_CACHE_LD.MESI
|
||||
41.02.CTR=0 L1D_CACHE_ST.S_STATE
|
||||
41.04.CTR=0 L1D_CACHE_ST.E_STATE
|
||||
41.08.CTR=0 L1D_CACHE_ST.M_STATE
|
||||
42.01.CTR=0 L1D_CACHE_LOCK.HIT
|
||||
42.02.CTR=0 L1D_CACHE_LOCK.S_STATE
|
||||
42.04.CTR=0 L1D_CACHE_LOCK.E_STATE
|
||||
42.08.CTR=0 L1D_CACHE_LOCK.M_STATE
|
||||
43.01.CTR=0 L1D_ALL_REF.ANY
|
||||
43.02.CTR=0 L1D_ALL_REF.CACHEABLE
|
||||
49.01 DTLB_MISSES.ANY
|
||||
49.02 DTLB_MISSES.WALK_COMPLETED
|
||||
49.10 DTLB_MISSES.STLB_HIT
|
||||
49.20 DTLB_MISSES.PDE_MISS
|
||||
49.80 DTLB_MISSES.LARGE_WALK_COMPLETED
|
||||
4C.01 LOAD_HIT_PRE
|
||||
4E.01 L1D_PREFETCH.REQUESTS
|
||||
4E.02 L1D_PREFETCH.MISS
|
||||
4E.04 L1D_PREFETCH.TRIGGERS
|
||||
51.01.CTR=0 L1D.REPL
|
||||
51.02.CTR=0 L1D.M_REPL
|
||||
51.04.CTR=0 L1D.M_EVICT
|
||||
51.08.CTR=0 L1D.M_SNOOP_EVICT
|
||||
52.01 L1D_CACHE_PREFETCH_LOCK_FB_HIT
|
||||
53.01 L1D_CACHE_LOCK_FB_HIT
|
||||
63.01.CTR=0 CACHE_LOCK_CYCLES.L1D_L2
|
||||
63.02.CTR=0 CACHE_LOCK_CYCLES.L1D
|
||||
6C.01 IO_TRANSACTIONS
|
||||
80.01 L1I.HITS
|
||||
80.02 L1I.MISSES
|
||||
80.03 L1I.READS
|
||||
80.04 L1I.CYCLES_STALLED
|
||||
82.01 LARGE_ITLB.HIT
|
||||
85.01 ITLB_MISSES.ANY
|
||||
85.02 ITLB_MISSES.WALK_COMPLETED
|
||||
87.01 ILD_STALL.LCP
|
||||
87.02 ILD_STALL.MRU
|
||||
87.04 ILD_STALL.IQ_FULL
|
||||
87.08 ILD_STALL.REGEN
|
||||
87.0F ILD_STALL.ANY
|
||||
88.01 BR_INST_EXEC.COND
|
||||
88.02 BR_INST_EXEC.DIRECT
|
||||
88.04 BR_INST_EXEC.INDIRECT_NON_CALL
|
||||
88.07 BR_INST_EXEC.NON_CALLS
|
||||
88.08 BR_INST_EXEC.RETURN_NEAR
|
||||
88.10 BR_INST_EXEC.DIRECT_NEAR_CALL
|
||||
88.20 BR_INST_EXEC.INDIRECT_NEAR_CALL
|
||||
88.30 BR_INST_EXEC.NEAR_CALLS
|
||||
88.40 BR_INST_EXEC.TAKEN
|
||||
88.7F BR_INST_EXEC.ANY
|
||||
89.01 BR_MISP_EXEC.COND
|
||||
89.02 BR_MISP_EXEC.DIRECT
|
||||
89.04 BR_MISP_EXEC.INDIRECT_NON_CALL
|
||||
89.07 BR_MISP_EXEC.NON_CALLS
|
||||
89.08 BR_MISP_EXEC.RETURN_NEAR
|
||||
89.10 BR_MISP_EXEC.DIRECT_NEAR_CALL
|
||||
89.20 BR_MISP_EXEC.INDIRECT_NEAR_CALL
|
||||
89.30 BR_MISP_EXEC.NEAR_CALLS
|
||||
89.40 BR_MISP_EXEC.TAKEN
|
||||
89.7F BR_MISP_EXEC.ANY
|
||||
A2.01 RESOURCE_STALLS.ANY
|
||||
A2.02 RESOURCE_STALLS.LOAD
|
||||
A2.04 RESOURCE_STALLS.RS_FULL
|
||||
A2.08 RESOURCE_STALLS.STORE
|
||||
A2.10 RESOURCE_STALLS.ROB_FULL
|
||||
A2.20 RESOURCE_STALLS.FPCW
|
||||
A2.40 RESOURCE_STALLS.MXCSR
|
||||
A2.80 RESOURCE_STALLS.OTHER
|
||||
A6.01 MACRO_INSTS.FUSIONS_DECODED
|
||||
A7.01 BACLEAR_FORCE_IQ
|
||||
A8.01 LSD.UOPS
|
||||
AE.01 ITLB_FLUSH
|
||||
B0.40 OFFCORE_REQUESTS.L1D_WRITEBACK
|
||||
B1.01 UOPS_EXECUTED.PORT0
|
||||
B1.02 UOPS_EXECUTED.PORT1
|
||||
B1.04 UOPS_EXECUTED.PORT2_CORE
|
||||
B1.08 UOPS_EXECUTED.PORT3_CORE
|
||||
B1.10 UOPS_EXECUTED.PORT4_CORE
|
||||
B1.1F UOPS_EXECUTED.CORE_ACTIVE_CYCLES_NO_PORT5
|
||||
B1.20 UOPS_EXECUTED.PORT5
|
||||
B1.3F UOPS_EXECUTED.CORE_ACTIVE_CYCLES
|
||||
B1.40 UOPS_EXECUTED.PORT015
|
||||
B1.80 UOPS_EXECUTED.PORT234
|
||||
B2.01 OFFCORE_REQUESTS_SQ_FULL
|
||||
B8.01 SNOOP_RESPONSE.HIT
|
||||
B8.02 SNOOP_RESPONSE.HITE
|
||||
B8.04 SNOOP_RESPONSE.HITM
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C0.02 INST_RETIRED.X87
|
||||
C0.04 INST_RETIRED.MMX
|
||||
C2.01 UOPS_RETIRED.ANY
|
||||
C2.02 UOPS_RETIRED.RETIRE_SLOTS
|
||||
C2.04 UOPS_RETIRED.MACRO_FUSED
|
||||
C3.01 MACHINE_CLEARS.CYCLES
|
||||
C3.02 MACHINE_CLEARS.MEM_ORDER
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.CONDITIONAL
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.02 BR_MISP_RETIRED.NEAR_CALL
|
||||
C7.01 SSEX_UOPS_RETIRED.PACKED_SINGLE
|
||||
C7.02 SSEX_UOPS_RETIRED.SCALAR_SINGLE
|
||||
C7.04 SSEX_UOPS_RETIRED.PACKED_DOUBLE
|
||||
C7.08 SSEX_UOPS_RETIRED.SCALAR_DOUBLE
|
||||
C7.10 SSEX_UOPS_RETIRED.VECTOR_INTEGER
|
||||
C8.20 ITLB_MISS_RETIRED
|
||||
CB.01 MEM_LOAD_RETIRED.L1D_HIT
|
||||
CB.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
CB.04 MEM_LOAD_RETIRED.L3_UNSHARED_HIT
|
||||
CB.08 MEM_LOAD_RETIRED.OTHER_CORE_L2_HIT_HITM
|
||||
CB.10 MEM_LOAD_RETIRED.L3_MISS
|
||||
CB.40 MEM_LOAD_RETIRED.HIT_LFB
|
||||
CB.80 MEM_LOAD_RETIRED.DTLB_MISS
|
||||
CC.01 FP_MMX_TRANS.TO_FP
|
||||
CC.02 FP_MMX_TRANS.TO_MMX
|
||||
CC.03 FP_MMX_TRANS.ANY
|
||||
D0.01 MACRO_INSTS.DECODED
|
||||
D1.02 UOPS_DECODED.MS
|
||||
D1.04 UOPS_DECODED.ESP_FOLDING
|
||||
D1.08 UOPS_DECODED.ESP_SYNC
|
||||
D2.01 RAT_STALLS.FLAGS
|
||||
D2.02 RAT_STALLS.REGISTERS
|
||||
D2.04 RAT_STALLS.ROB_READ_PORT
|
||||
D2.08 RAT_STALLS.SCOREBOARD
|
||||
D2.0F RAT_STALLS.ANY
|
||||
D4.01 SEG_RENAME_STALLS
|
||||
D5.01 ES_REG_RENAMES
|
||||
DB.01 UOP_UNFUSION
|
||||
E0.01 BR_INST_DECODED
|
||||
E5.01 BPU_MISSED_CALL_RET
|
||||
E6.01 BACLEAR.CLEAR
|
||||
E6.02 BACLEAR.BAD_TARGET
|
||||
E8.01 BPU_CLEARS.EARLY
|
||||
E8.02 BPU_CLEARS.LATE
|
||||
F0.01 L2_TRANSACTIONS.LOAD
|
||||
F0.02 L2_TRANSACTIONS.RFO
|
||||
F0.04 L2_TRANSACTIONS.IFETCH
|
||||
F0.08 L2_TRANSACTIONS.PREFETCH
|
||||
F0.10 L2_TRANSACTIONS.L1D_WB
|
||||
F0.20 L2_TRANSACTIONS.FILL
|
||||
F0.40 L2_TRANSACTIONS.WB
|
||||
F0.80 L2_TRANSACTIONS.ANY
|
||||
F1.02 L2_LINES_IN.S_STATE
|
||||
F1.04 L2_LINES_IN.E_STATE
|
||||
F1.07 L2_LINES_IN.ANY
|
||||
F2.01 L2_LINES_OUT.DEMAND_CLEAN
|
||||
F2.02 L2_LINES_OUT.DEMAND_DIRTY
|
||||
F2.04 L2_LINES_OUT.PREFETCH_CLEAN
|
||||
F2.08 L2_LINES_OUT.PREFETCH_DIRTY
|
||||
F2.0F L2_LINES_OUT.ANY
|
||||
F4.10 SQ_MISC.SPLIT_LOCK
|
||||
F6.01 SQ_FULL_STALL_CYCLES
|
||||
F7.01 FP_ASSIST.ALL
|
||||
F7.02 FP_ASSIST.OUTPUT
|
||||
F7.04 FP_ASSIST.INPUT
|
||||
FD.01 SIMD_INT_64.PACKED_MPY
|
||||
FD.02 SIMD_INT_64.PACKED_SHIFT
|
||||
FD.04 SIMD_INT_64.PACK
|
||||
FD.08 SIMD_INT_64.UNPACK
|
||||
FD.10 SIMD_INT_64.PACKED_LOGICAL
|
||||
FD.20 SIMD_INT_64.PACKED_ARITH
|
||||
FD.40 SIMD_INT_64.SHUFFLE_MOVE
|
19
configs/cfg_Nehalem_common.txt
Normal file
19
configs/cfg_Nehalem_common.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
# Performance monitoring events for processors based on the Nehalem microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_1AH, 06_1EH, 06_1FH, and 06_2EH.
|
||||
# See Table 19-20 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.02 UOPS_ISSUED.FUSED
|
||||
C2.01 UOPS_RETIRED.ANY
|
||||
B1.01 UOPS_EXECUTED.PORT0
|
||||
B1.02 UOPS_EXECUTED.PORT1
|
||||
B1.04 UOPS_EXECUTED.PORT2_CORE
|
||||
B1.08 UOPS_EXECUTED.PORT3_CORE
|
||||
B1.10 UOPS_EXECUTED.PORT4_CORE
|
||||
B1.20 UOPS_EXECUTED.PORT5
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
CB.01 MEM_LOAD_RETIRED.L1D_HIT
|
||||
CB.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
CB.04 MEM_LOAD_RETIRED.L3_UNSHARED_HIT
|
||||
CB.10 MEM_LOAD_RETIRED.L3_MISS
|
243
configs/cfg_SandyBridge_all.txt
Normal file
243
configs/cfg_SandyBridge_all.txt
Normal file
@@ -0,0 +1,243 @@
|
||||
# Performance monitoring events for processors based on the Sandy Bridge microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_2AH and 06_2DH.
|
||||
# See Table 19-16 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
03.01 LD_BLOCKS.DATA_UNKNOWN
|
||||
03.02 LD_BLOCKS.STORE_FORWARD
|
||||
03.08 LD_BLOCKS.NO_SR
|
||||
03.10 LD_BLOCKS.ALL_BLOCK
|
||||
05.01 MISALIGN_MEM_REF.LOADS
|
||||
05.02 MISALIGN_MEM_REF.STORES
|
||||
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
|
||||
07.08 LD_BLOCKS_PARTIAL.ALL_STA_BLOCK
|
||||
08.01 DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK
|
||||
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED
|
||||
08.04 DTLB_LOAD_MISSES.WALK_DURATION
|
||||
08.10 DTLB_LOAD_MISSES.STLB_HIT
|
||||
0D.03.CMSK=1 INT_MISC.RECOVERY_CYCLES
|
||||
0D.40 INT_MISC.RAT_STALL_CYCLES
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
10.01 FP_COMP_OPS_EXE.X87
|
||||
10.10 FP_COMP_OPS_EXE.SSE_FP_PACKED_DOUBLE
|
||||
10.20 FP_COMP_OPS_EXE.SSE_FP_SCALAR_SINGLE
|
||||
10.40 FP_COMP_OPS_EXE.SSE_PACKED
|
||||
10.80 FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE
|
||||
11.01 SIMD_FP_256.PACKED_SINGLE
|
||||
11.02 SIMD_FP_256.PACKED_DOUBLE
|
||||
14.01 ARITH.FPU_DIV_ACTIVE
|
||||
17.01 INSTS_WRITTEN_TO_IQ.INSTS
|
||||
24.01 L2_RQSTS.DEMAND_DATA_RD_HIT
|
||||
24.03 L2_RQSTS.ALL_DEMAND_DATA_RD
|
||||
24.04 L2_RQSTS.RFO_HITS
|
||||
24.08 L2_RQSTS.RFO_MISS
|
||||
24.0C L2_RQSTS.ALL_RFO
|
||||
24.10 L2_RQSTS.CODE_RD_HIT
|
||||
24.20 L2_RQSTS.CODE_RD_MISS
|
||||
24.30 L2_RQSTS.ALL_CODE_RD
|
||||
24.40 L2_RQSTS.PF_HIT
|
||||
24.80 L2_RQSTS.PF_MISS
|
||||
24.C0 L2_RQSTS.ALL_PF
|
||||
27.01 L2_STORE_LOCK_RQSTS.MISS
|
||||
27.04 L2_STORE_LOCK_RQSTS.HIT_E
|
||||
27.08 L2_STORE_LOCK_RQSTS.HIT_M
|
||||
27.0F L2_STORE_LOCK_RQSTS.ALL
|
||||
28.01 L2_L1D_WB_RQSTS.MISS
|
||||
28.02 L2_L1D_WB_RQSTS.HIT_S
|
||||
28.04 L2_L1D_WB_RQSTS.HIT_E
|
||||
28.08 L2_L1D_WB_RQSTS.HIT_M
|
||||
28.0F L2_L1D_WB_RQSTS.ALL
|
||||
2E.4F LONGEST_LAT_CACHE.REFERENCE
|
||||
2E.41 LONGEST_LAT_CACHE.MISS
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.01 CPU_CLK_THREAD_UNHALTED.REF_XCLK
|
||||
48.01.CTR=2 L1D_PEND_MISS.PENDING
|
||||
49.01 DTLB_STORE_MISSES.MISS_CAUSES_A_WALK
|
||||
49.02 DTLB_STORE_MISSES.WALK_COMPLETED
|
||||
49.04 DTLB_STORE_MISSES.WALK_DURATION
|
||||
49.10 DTLB_STORE_MISSES.STLB_HIT
|
||||
4C.01 LOAD_HIT_PRE.SW_PF
|
||||
4C.02 LOAD_HIT_PRE.HW_PF
|
||||
4E.02 HW_PRE_REQ.DL1_MISS
|
||||
51.01 L1D.REPLACEMENT
|
||||
51.02 L1D.ALLOCATED_IN_M
|
||||
51.04 L1D.EVICTION
|
||||
51.08 L1D.ALL_M_REPLACEMENT
|
||||
59.20 PARTIAL_RAT_STALLS.FLAGS_MERGE_UOP
|
||||
59.40 PARTIAL_RAT_STALLS.SLOW_LEA_WINDOW
|
||||
59.80 PARTIAL_RAT_STALLS.MUL_SINGLE_UOP
|
||||
5B.0C RESOURCE_STALLS2.ALL_FL_EMPTY
|
||||
5B.0F RESOURCE_STALLS2.ALL_PRF_CONTROL
|
||||
5B.40 RESOURCE_STALLS2.BOB_FULL
|
||||
5B.4F RESOURCE_STALLS2.OOO_RSRC
|
||||
5C.01 CPL_CYCLES.RING0
|
||||
5C.02 CPL_CYCLES.RING123
|
||||
5E.01 RS_EVENTS.EMPTY_CYCLES
|
||||
60.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
|
||||
60.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
|
||||
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
|
||||
63.01 LOCK_CYCLES.SPLIT_LOCK_UC_LOCK_DURATION
|
||||
63.02 LOCK_CYCLES.CACHE_LOCK_DURATION
|
||||
79.02 IDQ.EMPTY
|
||||
79.04 IDQ.MITE_UOPS
|
||||
79.08 IDQ.DSB_UOPS
|
||||
79.10 IDQ.MS_DSB_UOPS
|
||||
79.20 IDQ.MS_MITE_UOPS
|
||||
79.30 IDQ.MS_UOPS
|
||||
80.02 ICACHE.MISSES
|
||||
85.01 ITLB_MISSES.MISS_CAUSES_A_WALK
|
||||
85.02 ITLB_MISSES.WALK_COMPLETED
|
||||
85.04 ITLB_MISSES.WALK_DURATION
|
||||
85.10 ITLB_MISSES.STLB_HIT
|
||||
87.01 ILD_STALL.LCP
|
||||
87.04 ILD_STALL.IQ_FULL
|
||||
88.41 BR_INST_EXEC.NONTAKEN_CONDITIONAL
|
||||
88.81 BR_INST_EXEC.TAKEN_CONDITIONAL
|
||||
88.82 BR_INST_EXEC.TAKEN_DIRECT_JUMP
|
||||
88.84 BR_INST_EXEC.TAKEN_INDIRECT_JUMP_NON_CALL_RET
|
||||
88.88 BR_INST_EXEC.TAKEN_INDIRECT_NEAR_RETURN
|
||||
88.90 BR_INST_EXEC.TAKEN_DIRECT_NEAR_CALL
|
||||
88.A0 BR_INST_EXEC.TAKEN_INDIRECT_NEAR_CALL
|
||||
88.C1 BR_INST_EXEC.ALL_CONDITIONAL
|
||||
88.C2 BR_INST_EXEC.ALL_DIRECT_JUMP
|
||||
88.C4 BR_INST_EXEC.ALL_INDIRECT_JUMP_NON_CALL_RET
|
||||
88.C8 BR_INST_EXEC.ALL_INDIRECT_NEAR_RETURN
|
||||
88.D0 BR_INST_EXEC.ALL_NEAR_CALL
|
||||
88.FF BR_INST_EXEC.ALL_BRANCHES
|
||||
89.41 BR_MISP_EXEC.NONTAKEN_CONDITIONAL
|
||||
89.81 BR_MISP_EXEC.TAKEN_CONDITIONAL
|
||||
89.84 BR_MISP_EXEC.TAKEN_INDIRECT_JUMP_NON_CALL_RET
|
||||
89.88 BR_MISP_EXEC.TAKEN_RETURN_NEAR
|
||||
89.90 BR_MISP_EXEC.TAKEN_DIRECT_NEAR_CALL
|
||||
89.A0 BR_MISP_EXEC.TAKEN_INDIRECT_NEAR_CALL
|
||||
89.C1 BR_MISP_EXEC.ALL_CONDITIONAL
|
||||
89.C4 BR_MISP_EXEC.ALL_INDIRECT_JUMP_NON_CALL_RET
|
||||
89.D0 BR_MISP_EXEC.ALL_NEAR_CALL
|
||||
89.FF BR_MISP_EXEC.ALL_BRANCHES
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.0C UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.30 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_5
|
||||
A2.01 RESOURCE_STALLS.ANY
|
||||
A2.02 RESOURCE_STALLS.LB
|
||||
A2.04 RESOURCE_STALLS.RS
|
||||
A2.08 RESOURCE_STALLS.SB
|
||||
A2.10 RESOURCE_STALLS.ROB
|
||||
A2.20 RESOURCE_STALLS.FCSW
|
||||
A3.01 CYCLE_ACTIVITY.CYCLES_L2_PENDING
|
||||
A3.02.CTR=2 CYCLE_ACTIVITY.CYCLES_L1D_PENDING
|
||||
A3.04 CYCLE_ACTIVITY.CYCLES_NO_DISPATCH
|
||||
A3.05 CYCLE_ACTIVITY.STALL_CYCLES_L2_PENDING
|
||||
A3.06.CTR=2 CYCLE_ACTIVITY.STALL_CYCLES_L1D_PENDING
|
||||
A8.01 LSD.UOPS
|
||||
AB.01 DSB2MITE_SWITCHES.COUNT
|
||||
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
|
||||
AC.02 DSB_FILL.OTHER_CANCEL
|
||||
AC.08 DSB_FILL.EXCEED_DSB_LINES
|
||||
AE.01 ITLB.ITLB_FLUSH
|
||||
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
|
||||
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
|
||||
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
|
||||
B1.01 UOPS_DISPATCHED.THREAD
|
||||
B1.02 UOPS_DISPATCHED.CORE
|
||||
B2.01 OFFCORE_REQUESTS_BUFFER.SQ_FULL
|
||||
B6.01 AGU_BYPASS_CANCEL.COUNT
|
||||
B7.01.CTR=0.MSR_RSP0=0x10001 OFF_CORE_RESPONSE_0.DMND_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10002 OFF_CORE_RESPONSE_0.DMND_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10004 OFF_CORE_RESPONSE_0.DMND_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10008 OFF_CORE_RESPONSE_0.WB
|
||||
B7.01.CTR=0.MSR_RSP0=0x10010 OFF_CORE_RESPONSE_0.PF_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10020 OFF_CORE_RESPONSE_0.PF_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10040 OFF_CORE_RESPONSE_0.PF_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10080 OFF_CORE_RESPONSE_0.PF_LLC_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10100 OFF_CORE_RESPONSE_0.PF_LLC_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10200 OFF_CORE_RESPONSE_0.PF_LLC_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10400 OFF_CORE_RESPONSE_0.BUS_LOCKS
|
||||
B7.01.CTR=0.MSR_RSP0=0x10800 OFF_CORE_RESPONSE_0.STRM_ST
|
||||
B7.01.CTR=0.MSR_RSP0=0x18000 OFF_CORE_RESPONSE_0.OTHER
|
||||
BB.01.CTR=1.MSR_RSP1=0x10001 OFF_CORE_RESPONSE_1.DMND_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10002 OFF_CORE_RESPONSE_1.DMND_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10004 OFF_CORE_RESPONSE_1.DMND_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10008 OFF_CORE_RESPONSE_1.WB
|
||||
BB.01.CTR=1.MSR_RSP1=0x10010 OFF_CORE_RESPONSE_1.PF_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10020 OFF_CORE_RESPONSE_1.PF_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10040 OFF_CORE_RESPONSE_1.PF_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10080 OFF_CORE_RESPONSE_1.PF_LLC_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10100 OFF_CORE_RESPONSE_1.PF_LLC_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10200 OFF_CORE_RESPONSE_1.PF_LLC_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10400 OFF_CORE_RESPONSE_1.BUS_LOCKS
|
||||
BB.01.CTR=1.MSR_RSP1=0x10800 OFF_CORE_RESPONSE_1.STRM_ST
|
||||
BB.01.CTR=1.MSR_RSP1=0x18000 OFF_CORE_RESPONSE_1.OTHER
|
||||
BD.01 TLB_FLUSH.DTLB_THREAD
|
||||
BD.20 TLB_FLUSH.STLB_ANY
|
||||
BF.05.CMSK=1 L1D_BLOCKS.BANK_CONFLICT_CYCLES
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C0.01.CTR=1 INST_RETIRED.PREC_DIST
|
||||
C1.02 OTHER_ASSISTS.ITLB_MISS_RETIRED
|
||||
C1.08 OTHER_ASSISTS.AVX_STORE
|
||||
C1.10 OTHER_ASSISTS.AVX_TO_SSE
|
||||
C1.20 OTHER_ASSISTS.SSE_TO_AVX
|
||||
C2.01 UOPS_RETIRED.ALL
|
||||
C2.02 UOPS_RETIRED.RETIRE_SLOTS
|
||||
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C3.20 MACHINE_CLEARS.MASKMOV
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.CONDITIONAL
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C4.04 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.08 BR_INST_RETIRED.NEAR_RETURN
|
||||
C4.10 BR_INST_RETIRED.NOT_TAKEN
|
||||
C4.20 BR_INST_RETIRED.NEAR_TAKEN
|
||||
C4.40 BR_INST_RETIRED.FAR_BRANCH
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.01 BR_MISP_RETIRED.CONDITIONAL
|
||||
C5.02 BR_MISP_RETIRED.NEAR_CALL
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.10 BR_MISP_RETIRED.NOT_TAKEN
|
||||
C5.20 BR_MISP_RETIRED.TAKEN
|
||||
CA.02 FP_ASSIST.X87_OUTPUT
|
||||
CA.04 FP_ASSIST.X87_INPUT
|
||||
CA.08 FP_ASSIST.SIMD_OUTPUT
|
||||
CA.10 FP_ASSIST.SIMD_INPUT
|
||||
CA.1E FP_ASSIST.ANY
|
||||
CC.20 ROB_MISC_EVENTS.LBR_INSERTS
|
||||
CD.01.CTR=3.MSR_3F6H=10 MEM_TRANS_RETIRED.LOAD_LATENCY
|
||||
CD.02.CTR=3 MEM_TRANS_RETIRED.PRECISE_STORE
|
||||
D0.11 MEM_UOPS_RETIRED.STLB_MISS_LOADS
|
||||
D0.12 MEM_UOPS_RETIRED.STLB_MISS_STORES
|
||||
D0.21 MEM_UOPS_RETIRED.LOCK_LOADS
|
||||
D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS
|
||||
D0.42 MEM_UOPS_RETIRED.SPLIT_STORES
|
||||
D0.81 MEM_UOPS_RETIRED.ALL_LOADS
|
||||
D0.82 MEM_UOPS_RETIRED.ALL_STORES
|
||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||
D1.04 MEM_LOAD_UOPS_RETIRED.LLC_HIT
|
||||
D1.20 MEM_LOAD_UOPS_RETIRED.LLC_MISS
|
||||
D1.40 MEM_LOAD_UOPS_RETIRED.HIT_LFB
|
||||
D2.01 MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS
|
||||
D2.02 MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT
|
||||
D2.04 MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM
|
||||
D2.08 MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_NONE
|
||||
E6.01 BACLEARS.ANY
|
||||
F0.01 L2_TRANS.DEMAND_DATA_RD
|
||||
F0.02 L2_TRANS.RFO
|
||||
F0.04 L2_TRANS.CODE_RD
|
||||
F0.08 L2_TRANS.ALL_PF
|
||||
F0.10 L2_TRANS.L1D_WB
|
||||
F0.20 L2_TRANS.L2_FILL
|
||||
F0.40 L2_TRANS.L2_WB
|
||||
F0.80 L2_TRANS.ALL_REQUESTS
|
||||
F1.01 L2_LINES_IN.I
|
||||
F1.02 L2_LINES_IN.S
|
||||
F1.04 L2_LINES_IN.E
|
||||
F1.07 L2_LINES_IN.ALL
|
||||
F2.01 L2_LINES_OUT.DEMAND_CLEAN
|
||||
F2.02 L2_LINES_OUT.DEMAND_DIRTY
|
||||
F2.04 L2_LINES_OUT.PF_CLEAN
|
||||
F2.08 L2_LINES_OUT.PF_DIRTY
|
||||
F2.0A L2_LINES_OUT.DIRTY_ALL
|
||||
F4.10 SQ_MISC.SPLIT_LOCK
|
19
configs/cfg_SandyBridge_common.txt
Normal file
19
configs/cfg_SandyBridge_common.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
# Performance monitoring events for processors based on the Sandy Bridge microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_2AH and 06_2DH.
|
||||
# See Table 19-16 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
B1.01 UOPS_DISPATCHED.THREAD
|
||||
C2.01 UOPS_RETIRED.ALL
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.0C UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.30 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_5
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||
D1.04 MEM_LOAD_UOPS_RETIRED.LLC_HIT
|
||||
D1.20 MEM_LOAD_UOPS_RETIRED.LLC_MISS
|
237
configs/cfg_Skylake_all.txt
Normal file
237
configs/cfg_Skylake_all.txt
Normal file
@@ -0,0 +1,237 @@
|
||||
# Performance monitoring events for processors based on the Skylake, Kaby Lake, and Coffee Lake microarchitectures.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_4EH and 06_5EH.
|
||||
# See Table 19-5 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
03.02 LD_BLOCKS.STORE_FORWARD
|
||||
03.08 LD_BLOCKS.NO_SR
|
||||
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
|
||||
08.01 DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK
|
||||
08.0E DTLB_LOAD_MISSES.WALK_COMPLETED
|
||||
08.10 DTLB_LOAD_MISSES.WALK_PENDING
|
||||
08.10.CMSK=1 DTLB_LOAD_MISSES.WALK_ACTIVE
|
||||
08.20 DTLB_LOAD_MISSES.STLB_HIT
|
||||
0D.01 INT_MISC.RECOVERY_CYCLES
|
||||
0D.01.AnyT INT_MISC.RECOVERY_CYCLES_ANY
|
||||
0D.80 INT_MISC.CLEAR_RESTEER_CYCLES
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.01.CMSK=1.INV UOPS_ISSUED.STALL_CYCLES
|
||||
0E.02 UOPS_ISSUED.VECTOR_WIDTH_MISMATCH
|
||||
0E.20 UOPS_ISSUED.SLOW_LEA
|
||||
14.01 ARITH.FPU_DIVIDER_ACTIVE
|
||||
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
|
||||
24.22 L2_RQSTS.RFO_MISS
|
||||
24.24 L2_RQSTS.CODE_RD_MISS
|
||||
24.27 L2_RQSTS.ALL_DEMAND_MISS
|
||||
24.38 L2_RQSTS.PF_MISS
|
||||
24.3F L2_RQSTS.MISS
|
||||
24.41 L2_RQSTS.DEMAND_DATA_RD_HIT
|
||||
24.42 L2_RQSTS.RFO_HIT
|
||||
24.44 L2_RQSTS.CODE_RD_HIT
|
||||
24.D8 L2_RQSTS.PF_HIT
|
||||
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
|
||||
24.E2 L2_RQSTS.ALL_RFO
|
||||
24.E4 L2_RQSTS.ALL_CODE_RD
|
||||
24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
|
||||
24.F8 L2_RQSTS.ALL_PF
|
||||
24.EF L2_RQSTS.REFERENCES
|
||||
2E.4F LONGEST_LAT_CACHE.REFERENCE
|
||||
2E.41 LONGEST_LAT_CACHE.MISS
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.00.AnyT CPU_CLK_UNHALTED.THREAD_P_ANY
|
||||
3C.01 CPU_CLK_THREAD_UNHALTED.REF_XCLK
|
||||
3C.01.AnyT CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY
|
||||
3C.02 CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE
|
||||
48.01 L1D_PEND_MISS.PENDING
|
||||
48.01.CMSK=1 L1D_PEND_MISS.PENDING_CYCLES
|
||||
48.01.CMSK=1.AnyT L1D_PEND_MISS.PENDING_CYCLES_ANY
|
||||
48.02 L1D_PEND_MISS.FB_FULL
|
||||
49.01 DTLB_STORE_MISSES.MISS_CAUSES_A_WALK
|
||||
49.0E DTLB_STORE_MISSES.WALK_COMPLETED
|
||||
49.10 DTLB_STORE_MISSES.WALK_PENDING
|
||||
49.10.CMSK=1 DTLB_STORE_MISSES.WALK_ACTIVE
|
||||
49.20 DTLB_STORE_MISSES.STLB_HIT
|
||||
4C.01 LOAD_HIT_PRE.HW_PF
|
||||
4F.10 EPT.WALK_PENDING
|
||||
51.01 L1D.REPLACEMENT
|
||||
5E.01 RS_EVENTS.EMPTY_CYCLES
|
||||
5E.01.CMSK=1.INV RS_EVENTS.EMPTY_END
|
||||
60.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
|
||||
60.01.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD
|
||||
60.01.CMSK=6 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6
|
||||
60.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
|
||||
60.02.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD
|
||||
60.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
|
||||
60.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
|
||||
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
|
||||
60.08.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
|
||||
60.10 OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD
|
||||
60.10.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD
|
||||
60.10.CMSK=6 OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD_GE_6
|
||||
63.02 LOCK_CYCLES.CACHE_LOCK_DURATION
|
||||
79.04 IDQ.MITE_UOPS
|
||||
79.04.CMSK=1 IDQ.MITE_CYCLES
|
||||
79.08 IDQ.DSB_UOPS
|
||||
79.08.CMSK=1 IDQ.DSB_CYCLES
|
||||
79.10 IDQ.MS_DSB_UOPS
|
||||
79.18.CMSK=1 IDQ.ALL_DSB_CYCLES_ANY_UOPS
|
||||
79.18.CMSK=4 IDQ.ALL_DSB_CYCLES_4_UOPS
|
||||
79.20 IDQ.MS_MITE_UOPS
|
||||
79.24.CMSK=1 IDQ.ALL_MITE_CYCLES_ANY_UOPS
|
||||
79.24.CMSK=4 IDQ.ALL_MITE_CYCLES_4_UOPS
|
||||
79.30 IDQ.MS_UOPS
|
||||
79.30.EDG IDQ.MS_SWITCHES
|
||||
79.30.CMSK=1 IDQ.MS_CYCLES
|
||||
80.04 ICACHE_16B.IFDATA_STALL
|
||||
80.04 ICACHE_64B.IFDATA_STALL
|
||||
83.01 ICACHE_64B.IFTAG_HIT
|
||||
83.02 ICACHE_64B.IFTAG_MISS
|
||||
85.01 ITLB_MISSES.MISS_CAUSES_A_WALK
|
||||
85.0E ITLB_MISSES.WALK_COMPLETED
|
||||
85.10 ITLB_MISSES.WALK_PENDING
|
||||
85.20 ITLB_MISSES.STLB_HIT
|
||||
87.01 ILD_STALL.LCP
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
|
||||
9C.01.CMSK=4 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOP_DELIV.CORE
|
||||
9C.01.CMSK=3 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_1_UOP_DELIV.CORE
|
||||
9C.01.CMSK=2 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_2_UOP_DELIV.CORE
|
||||
9C.01.CMSK=1 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_3_UOP_DELIV.CORE
|
||||
9C.01.INV IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.08 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.10 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.20 UOPS_DISPATCHED_PORT.PORT_5
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
||||
A2.01 RESOURCE_STALLS.ANY
|
||||
A2.08 RESOURCE_STALLS.SB
|
||||
A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_MISS
|
||||
A3.02.CMSK=2 CYCLE_ACTIVITY.CYCLES_L3_MISS
|
||||
A3.04.CMSK=4 CYCLE_ACTIVITY.STALLS_TOTAL
|
||||
A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_MISS
|
||||
A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_L3_MISS
|
||||
A3.08.CMSK=8 CYCLE_ACTIVITY.CYCLES_L1D_MISS
|
||||
A3.0C.CMSK=12 CYCLE_ACTIVITY.STALLS_L1D_MISS
|
||||
A3.10.CMSK=16 CYCLE_ACTIVITY.CYCLES_MEM_ANY
|
||||
A3.14.CMSK=20 CYCLE_ACTIVITY.STALLS_MEM_ANY
|
||||
A6.01 EXE_ACTIVITY.EXE_BOUND_0_PORTS
|
||||
A6.02 EXE_ACTIVITY.1_PORTS_UTIL
|
||||
A6.04 EXE_ACTIVITY.2_PORTS_UTIL
|
||||
A6.08 EXE_ACTIVITY.3_PORTS_UTIL
|
||||
A6.10 EXE_ACTIVITY.4_PORTS_UTIL
|
||||
A6.40 EXE_ACTIVITY.BOUND_ON_STORES
|
||||
A8.01 LSD.UOPS
|
||||
A8.01.CMSK=1 LSD.CYCLES_ACTIVE
|
||||
A8.01.CMSK=4 LSD.CYCLES_4_UOPS
|
||||
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
|
||||
AE.01 ITLB.ITLB_FLUSH
|
||||
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
|
||||
B0.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
|
||||
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
|
||||
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
|
||||
B0.10 OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
|
||||
B0.80 OFFCORE_REQUESTS.ALL_REQUESTS
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
B1.01.CMSK=1.INV UOPS_EXECUTED.STALL_CYCLES
|
||||
B1.01.CMSK=1 UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC
|
||||
B1.01.CMSK=2 UOPS_EXECUTED.CYCLES_GE_2_UOP_EXEC
|
||||
B1.01.CMSK=3 UOPS_EXECUTED.CYCLES_GE_3_UOP_EXEC
|
||||
B1.01.CMSK=4 UOPS_EXECUTED.CYCLES_GE_4_UOP_EXEC
|
||||
B1.02 UOPS_EXECUTED.CORE
|
||||
B1.02.CMSK=1 UOPS_EXECUTED.CORE_CYCLES_GE_1
|
||||
B1.02.CMSK=2 UOPS_EXECUTED.CORE_CYCLES_GE_2
|
||||
B1.02.CMSK=3 UOPS_EXECUTED.CORE_CYCLES_GE_3
|
||||
B1.02.CMSK=4 UOPS_EXECUTED.CORE_CYCLES_GE_4
|
||||
B1.02.CMSK=1.INV UOPS_EXECUTED.CORE_CYCLES_NONE
|
||||
B1.10 UOPS_EXECUTED.X87
|
||||
B2.01 OFF_CORE_REQUEST_BUFFER.SQ_FULL
|
||||
B7.01.CTR=0.MSR_RSP0=0x10001 OFF_CORE_RESPONSE_0.DMND_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10002 OFF_CORE_RESPONSE_0.DMND_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10004 OFF_CORE_RESPONSE_0.DMND_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10008 OFF_CORE_RESPONSE_0.WB
|
||||
B7.01.CTR=0.MSR_RSP0=0x10010 OFF_CORE_RESPONSE_0.PF_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10020 OFF_CORE_RESPONSE_0.PF_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10040 OFF_CORE_RESPONSE_0.PF_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10080 OFF_CORE_RESPONSE_0.PF_LLC_DATA_RD
|
||||
B7.01.CTR=0.MSR_RSP0=0x10100 OFF_CORE_RESPONSE_0.PF_LLC_RFO
|
||||
B7.01.CTR=0.MSR_RSP0=0x10200 OFF_CORE_RESPONSE_0.PF_LLC_IFETCH
|
||||
B7.01.CTR=0.MSR_RSP0=0x10400 OFF_CORE_RESPONSE_0.BUS_LOCKS
|
||||
B7.01.CTR=0.MSR_RSP0=0x10800 OFF_CORE_RESPONSE_0.STRM_ST
|
||||
B7.01.CTR=0.MSR_RSP0=0x18000 OFF_CORE_RESPONSE_0.OTHER
|
||||
BB.01.CTR=1.MSR_RSP1=0x10001 OFF_CORE_RESPONSE_1.DMND_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10002 OFF_CORE_RESPONSE_1.DMND_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10004 OFF_CORE_RESPONSE_1.DMND_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10008 OFF_CORE_RESPONSE_1.WB
|
||||
BB.01.CTR=1.MSR_RSP1=0x10010 OFF_CORE_RESPONSE_1.PF_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10020 OFF_CORE_RESPONSE_1.PF_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10040 OFF_CORE_RESPONSE_1.PF_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10080 OFF_CORE_RESPONSE_1.PF_LLC_DATA_RD
|
||||
BB.01.CTR=1.MSR_RSP1=0x10100 OFF_CORE_RESPONSE_1.PF_LLC_RFO
|
||||
BB.01.CTR=1.MSR_RSP1=0x10200 OFF_CORE_RESPONSE_1.PF_LLC_IFETCH
|
||||
BB.01.CTR=1.MSR_RSP1=0x10400 OFF_CORE_RESPONSE_1.BUS_LOCKS
|
||||
BB.01.CTR=1.MSR_RSP1=0x10800 OFF_CORE_RESPONSE_1.STRM_ST
|
||||
BB.01.CTR=1.MSR_RSP1=0x18000 OFF_CORE_RESPONSE_1.OTHER
|
||||
BD.01 TLB_FLUSH.DTLB_THREAD
|
||||
BD.20 TLB_FLUSH.STLB_ANY
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C0.01.CTR=1 INST_RETIRED.PREC_DIST
|
||||
C0.01.CMSK=10 INST_RETIRED.TOTAL_CYCLES
|
||||
C1.3F OTHER_ASSISTS.ANY
|
||||
C2.01.CMSK=1.INV UOPS_RETIRED.STALL_CYCLES
|
||||
C2.01.CMSK=10.INV UOPS_RETIRED.TOTAL_CYCLES
|
||||
C2.02 UOPS_RETIRED.RETIRE_SLOTS
|
||||
C3.01.CMSK=1.EDG MACHINE_CLEARS.COUNT
|
||||
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.CONDITIONAL
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C4.04 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.08 BR_INST_RETIRED.NEAR_RETURN
|
||||
C4.10 BR_INST_RETIRED.NOT_TAKEN
|
||||
C4.20 BR_INST_RETIRED.NEAR_TAKEN
|
||||
C4.40 BR_INST_RETIRED.FAR_BRANCH
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.01 BR_MISP_RETIRED.CONDITIONAL
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.20 BR_MISP_RETIRED.NEAR_TAKEN
|
||||
C6.01.CTR=0.MSR_PF=0x11 FRONTEND_RETIRED.DSB_MISS
|
||||
C6.01.CTR=0.MSR_PF=0x12 FRONTEND_RETIRED.L1I_MISS
|
||||
C6.01.CTR=0.MSR_PF=0x13 FRONTEND_RETIRED.L2_MISS
|
||||
C6.01.CTR=0.MSR_PF=0x14 FRONTEND_RETIRED.ITLB_MISS
|
||||
C6.01.CTR=0.MSR_PF=0x15 FRONTEND_RETIRED.STLB_MISS
|
||||
C6.01.CTR=0.MSR_PF=0x401016 FRONTEND_RETIRED.LATENCY_GE_16
|
||||
C6.01.CTR=0.MSR_PF=0x100216 FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1
|
||||
C6.01.CTR=0.MSR_PF=0x200216 FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_2
|
||||
C6.01.CTR=0.MSR_PF=0x400216 FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_3
|
||||
C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE
|
||||
C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE
|
||||
C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE
|
||||
C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE
|
||||
C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE
|
||||
C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE
|
||||
CA.1E.CMSK=1 FP_ASSIST.ANY
|
||||
CB.01 HW_INTERRUPTS.RECEIVED
|
||||
CD.01.MSR_3F6H=10 MEM_TRANS_RETIRED.LOAD_LATENCY
|
||||
D0.11 MEM_INST_RETIRED.STLB_MISS_LOADS
|
||||
D0.12 MEM_INST_RETIRED.STLB_MISS_STORES
|
||||
D0.21 MEM_INST_RETIRED.LOCK_LOADS
|
||||
D0.41 MEM_INST_RETIRED.SPLIT_LOADS
|
||||
D0.42 MEM_INST_RETIRED.SPLIT_STORES
|
||||
D0.81 MEM_INST_RETIRED.ALL_LOADS
|
||||
D0.82 MEM_INST_RETIRED.ALL_STORES
|
||||
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
||||
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
D1.04 MEM_LOAD_RETIRED.L3_HIT
|
||||
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
||||
D1.10 MEM_LOAD_RETIRED.L2_MISS
|
||||
D1.20 MEM_LOAD_RETIRED.L3_MISS
|
||||
D1.40 MEM_LOAD_RETIRED.FB_HIT
|
||||
D2.01 MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS
|
||||
D2.02 MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT
|
||||
D2.04 MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM
|
||||
D2.08 MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE
|
||||
E6.01 BACLEARS.ANY
|
||||
F0.40 L2_TRANS.L2_WB
|
||||
F1.07 L2_LINES_IN.ALL
|
22
configs/cfg_Skylake_common.txt
Normal file
22
configs/cfg_Skylake_common.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# Performance monitoring events for processors based on Skylake, Kaby Lake and Coffee Lake microarchitectures.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_4EH and 06_5EH.
|
||||
# See Table 19-5 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.08 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.10 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.20 UOPS_DISPATCHED_PORT.PORT_5
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
||||
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
||||
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
D1.10 MEM_LOAD_RETIRED.L2_MISS
|
||||
D1.04 MEM_LOAD_RETIRED.L3_HIT
|
||||
D1.20 MEM_LOAD_RETIRED.L3_MISS
|
278
configs/cfg_Westmere_all.txt
Normal file
278
configs/cfg_Westmere_all.txt
Normal file
@@ -0,0 +1,278 @@
|
||||
# Performance monitoring events for processors based on the Westmere microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_25H and 06_2CH.
|
||||
# See Table 19-22 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
03.02 LOAD_BLOCK.OVERLAP_STORE
|
||||
04.07 SB_DRAIN.ANY
|
||||
05.02 MISALIGN_MEMORY.STORE
|
||||
06.04 STORE_BLOCKS.AT_RET
|
||||
06.08 STORE_BLOCKS.L1D_BLOCK
|
||||
07.01 PARTIAL_ADDRESS_ALIAS
|
||||
08.01 DTLB_LOAD_MISSES.ANY
|
||||
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED
|
||||
08.04 DTLB_LOAD_MISSES.WALK_CYCLES
|
||||
08.10 DTLB_LOAD_MISSES.STLB_HIT
|
||||
08.20 DTLB_LOAD_MISSES.PDE_MISS
|
||||
0B.01 MEM_INST_RETIRED.LOADS
|
||||
0B.02 MEM_INST_RETIRED.STORES
|
||||
0B.10 MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD
|
||||
0C.01 MEM_STORE_RETIRED.DTLB_MISS
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.01.CMSK=1.INV UOPS_ISSUED.STALLED_CYCLES
|
||||
0E.02 UOPS_ISSUED.FUSED
|
||||
0F.01 MEM_UNCORE_RETIRED.UNKNOWN_SOURCE
|
||||
0F.02 MEM_UNCORE_RETIRED.OTHER_CORE_L2_HIT
|
||||
0F.04 MEM_UNCORE_RETIRED.REMOTE_HITM
|
||||
0F.08 MEM_UNCORE_RETIRED.LOCAL_DRAM_AND_REMOTE_CACHE_HIT
|
||||
0F.10 MEM_UNCORE_RETIRED.REMOTE_DRAM
|
||||
0F.20 MEM_UNCORE_RETIRED.OTHER_LLC_MISS
|
||||
0F.80 MEM_UNCORE_RETIRED.UNCACHEABLE
|
||||
10.01 FP_COMP_OPS_EXE.X87
|
||||
10.02 FP_COMP_OPS_EXE.MMX
|
||||
10.04 FP_COMP_OPS_EXE.SSE_FP
|
||||
10.08 FP_COMP_OPS_EXE.SSE2_INTEGER
|
||||
10.10 FP_COMP_OPS_EXE.SSE_FP_PACKED
|
||||
10.20 FP_COMP_OPS_EXE.SSE_FP_SCALAR
|
||||
10.40 FP_COMP_OPS_EXE.SSE_SINGLE_PRECISION
|
||||
10.80 FP_COMP_OPS_EXE.SSE_DOUBLE_PRECISION
|
||||
12.01 SIMD_INT_128.PACKED_MPY
|
||||
12.02 SIMD_INT_128.PACKED_SHIFT
|
||||
12.04 SIMD_INT_128.PACK
|
||||
12.08 SIMD_INT_128.UNPACK
|
||||
12.10 SIMD_INT_128.PACKED_LOGICAL
|
||||
12.20 SIMD_INT_128.PACKED_ARITH
|
||||
12.40 SIMD_INT_128.SHUFFLE_MOVE
|
||||
13.01 LOAD_DISPATCH.RS
|
||||
13.02 LOAD_DISPATCH.RS_DELAYED
|
||||
13.04 LOAD_DISPATCH.MOB
|
||||
13.07 LOAD_DISPATCH.ANY
|
||||
14.01 ARITH.CYCLES_DIV_BUSY
|
||||
14.02 ARITH.MUL
|
||||
17.01 INST_QUEUE_WRITES
|
||||
18.01 INST_DECODED.DEC0
|
||||
19.01 TWO_UOP_INSTS_DECODED
|
||||
1E.01 INST_QUEUE_WRITE_CYCLES
|
||||
20.01 LSD_OVERFLOW
|
||||
24.01 L2_RQSTS.LD_HIT
|
||||
24.02 L2_RQSTS.LD_MISS
|
||||
24.03 L2_RQSTS.LOADS
|
||||
24.04 L2_RQSTS.RFO_HIT
|
||||
24.08 L2_RQSTS.RFO_MISS
|
||||
24.0C L2_RQSTS.RFOS
|
||||
24.10 L2_RQSTS.IFETCH_HIT
|
||||
24.20 L2_RQSTS.IFETCH_MISS
|
||||
24.30 L2_RQSTS.IFETCHES
|
||||
24.40 L2_RQSTS.PREFETCH_HIT
|
||||
24.80 L2_RQSTS.PREFETCH_MISS
|
||||
24.C0 L2_RQSTS.PREFETCHES
|
||||
24.AA L2_RQSTS.MISS
|
||||
24.FF L2_RQSTS.REFERENCES
|
||||
26.01 L2_DATA_RQSTS.DEMAND.I_STATE
|
||||
26.02 L2_DATA_RQSTS.DEMAND.S_STATE
|
||||
26.04 L2_DATA_RQSTS.DEMAND.E_STATE
|
||||
26.08 L2_DATA_RQSTS.DEMAND.M_STATE
|
||||
26.0F L2_DATA_RQSTS.DEMAND.MESI
|
||||
26.10 L2_DATA_RQSTS.PREFETCH.I_STATE
|
||||
26.20 L2_DATA_RQSTS.PREFETCH.S_STATE
|
||||
26.40 L2_DATA_RQSTS.PREFETCH.E_STATE
|
||||
26.80 L2_DATA_RQSTS.PREFETCH.M_STATE
|
||||
26.F0 L2_DATA_RQSTS.PREFETCH.MESI
|
||||
26.FF L2_DATA_RQSTS.ANY
|
||||
27.01 L2_WRITE.RFO.I_STATE
|
||||
27.02 L2_WRITE.RFO.S_STATE
|
||||
27.08 L2_WRITE.RFO.M_STATE
|
||||
27.0E L2_WRITE.RFO.HIT
|
||||
27.0F L2_WRITE.RFO.MESI
|
||||
27.10 L2_WRITE.LOCK.I_STATE
|
||||
27.20 L2_WRITE.LOCK.S_STATE
|
||||
27.40 L2_WRITE.LOCK.E_STATE
|
||||
27.80 L2_WRITE.LOCK.M_STATE
|
||||
27.E0 L2_WRITE.LOCK.HIT
|
||||
27.F0 L2_WRITE.LOCK.MESI
|
||||
28.01 L1D_WB_L2.I_STATE
|
||||
28.02 L1D_WB_L2.S_STATE
|
||||
28.04 L1D_WB_L2.E_STATE
|
||||
28.08 L1D_WB_L2.M_STATE
|
||||
28.0F L1D_WB_L2.MESI
|
||||
2E.41 L3_LAT_CACHE.MISS
|
||||
2E.4F L3_LAT_CACHE.REFERENCE
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.01 CPU_CLK_UNHALTED.REF_P
|
||||
49.01 DTLB_MISSES.ANY
|
||||
49.02 DTLB_MISSES.WALK_COMPLETED
|
||||
49.04 DTLB_MISSES.WALK_CYCLES
|
||||
49.10 DTLB_MISSES.STLB_HIT
|
||||
49.20 DTLB_MISSES.PDE_MISS
|
||||
49.80 DTLB_MISSES.LARGE_WALK_COMPLETED
|
||||
4C.01.CTR=0 LOAD_HIT_PRE
|
||||
4E.01.CTR=0 L1D_PREFETCH.REQUESTS
|
||||
4E.02.CTR=0 L1D_PREFETCH.MISS
|
||||
4E.04.CTR=0 L1D_PREFETCH.TRIGGERS
|
||||
4F.10 EPT.WALK_CYCLES
|
||||
51.01.CTR=0 L1D.REPL
|
||||
51.02.CTR=0 L1D.M_REPL
|
||||
51.04.CTR=0 L1D.M_EVICT
|
||||
51.08.CTR=0 L1D.M_SNOOP_EVICT
|
||||
52.01 L1D_CACHE_PREFETCH_LOCK_FB_HIT
|
||||
60.01.CTR=0 OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_DATA
|
||||
60.02.CTR=0 OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_CODE
|
||||
60.04.CTR=0 OFFCORE_REQUESTS_OUTSTANDING.DEMAND.RFO
|
||||
60.08.CTR=0 OFFCORE_REQUESTS_OUTSTANDING.ANY.READ
|
||||
63.01.CTR=0 CACHE_LOCK_CYCLES.L1D_L2
|
||||
63.02.CTR=0 CACHE_LOCK_CYCLES.L1D
|
||||
6C.01 IO_TRANSACTIONS
|
||||
80.01 L1I.HITS
|
||||
80.02 L1I.MISSES
|
||||
80.03 L1I.READS
|
||||
80.04 L1I.CYCLES_STALLED
|
||||
82.01 LARGE_ITLB.HIT
|
||||
85.01 ITLB_MISSES.ANY
|
||||
85.02 ITLB_MISSES.WALK_COMPLETED
|
||||
85.04 ITLB_MISSES.WALK_CYCLES
|
||||
85.10 ITLB_MISSES.STLB_HIT
|
||||
85.80 ITLB_MISSES.LARGE_WALK_COMPLETED
|
||||
87.01 ILD_STALL.LCP
|
||||
87.02 ILD_STALL.MRU
|
||||
87.04 ILD_STALL.IQ_FULL
|
||||
87.08 ILD_STALL.REGEN
|
||||
87.0F ILD_STALL.ANY
|
||||
88.01 BR_INST_EXEC.COND
|
||||
88.02 BR_INST_EXEC.DIRECT
|
||||
88.04 BR_INST_EXEC.INDIRECT_NON_CALL
|
||||
88.07 BR_INST_EXEC.NON_CALLS
|
||||
88.08 BR_INST_EXEC.RETURN_NEAR
|
||||
88.10 BR_INST_EXEC.DIRECT_NEAR_CALL
|
||||
88.20 BR_INST_EXEC.INDIRECT_NEAR_CALL
|
||||
88.30 BR_INST_EXEC.NEAR_CALLS
|
||||
88.40 BR_INST_EXEC.TAKEN
|
||||
88.7F BR_INST_EXEC.ANY
|
||||
89.01 BR_MISP_EXEC.COND
|
||||
89.02 BR_MISP_EXEC.DIRECT
|
||||
89.04 BR_MISP_EXEC.INDIRECT_NON_CALL
|
||||
89.07 BR_MISP_EXEC.NON_CALLS
|
||||
89.08 BR_MISP_EXEC.RETURN_NEAR
|
||||
89.10 BR_MISP_EXEC.DIRECT_NEAR_CALL
|
||||
89.20 BR_MISP_EXEC.INDIRECT_NEAR_CALL
|
||||
89.30 BR_MISP_EXEC.NEAR_CALLS
|
||||
89.40 BR_MISP_EXEC.TAKEN
|
||||
89.7F BR_MISP_EXEC.ANY
|
||||
A2.01 RESOURCE_STALLS.ANY
|
||||
A2.02 RESOURCE_STALLS.LOAD
|
||||
A2.04 RESOURCE_STALLS.RS_FULL
|
||||
A2.08 RESOURCE_STALLS.STORE
|
||||
A2.10 RESOURCE_STALLS.ROB_FULL
|
||||
A2.20 RESOURCE_STALLS.FPCW
|
||||
A2.40 RESOURCE_STALLS.MXCSR
|
||||
A2.80 RESOURCE_STALLS.OTHER
|
||||
A6.01 MACRO_INSTS.FUSIONS_DECODED
|
||||
A7.01 BACLEAR_FORCE_IQ
|
||||
A8.01 LSD.UOPS
|
||||
AE.01 ITLB_FLUSH
|
||||
B0.01 OFFCORE_REQUESTS.DEMAND.READ_DATA
|
||||
B0.02 OFFCORE_REQUESTS.DEMAND.READ_CODE
|
||||
B0.04 OFFCORE_REQUESTS.DEMAND.RFO
|
||||
B0.08 OFFCORE_REQUESTS.ANY.READ
|
||||
B0.10 OFFCORE_REQUESTS.ANY.RFO
|
||||
B0.40 OFFCORE_REQUESTS.L1D_WRITEBACK
|
||||
B0.80 OFFCORE_REQUESTS.ANY
|
||||
B1.01 UOPS_EXECUTED.PORT0
|
||||
B1.02 UOPS_EXECUTED.PORT1
|
||||
B1.04 UOPS_EXECUTED.PORT2_CORE
|
||||
B1.08 UOPS_EXECUTED.PORT3_CORE
|
||||
B1.10 UOPS_EXECUTED.PORT4_CORE
|
||||
B1.1F UOPS_EXECUTED.CORE_ACTIVE_CYCLES_NO_PORT5
|
||||
B1.20 UOPS_EXECUTED.PORT5
|
||||
B1.3F UOPS_EXECUTED.CORE_ACTIVE_CYCLES
|
||||
B1.40 UOPS_EXECUTED.PORT015
|
||||
B1.80 UOPS_EXECUTED.PORT234
|
||||
B2.01 OFFCORE_REQUESTS_SQ_FULL
|
||||
B3.01.CTR=0 SNOOPQ_REQUESTS_OUTSTANDING.DATA
|
||||
B3.02.CTR=0 SNOOPQ_REQUESTS_OUTSTANDING.INVALIDATE
|
||||
B3.04.CTR=0 SNOOPQ_REQUESTS_OUTSTANDING.CODE
|
||||
B4.01 SNOOPQ_REQUESTS.CODE
|
||||
B4.02 SNOOPQ_REQUESTS.DATA
|
||||
B4.04 SNOOPQ_REQUESTS.INVALIDATE
|
||||
B8.01 SNOOP_RESPONSE.HIT
|
||||
B8.02 SNOOP_RESPONSE.HITE
|
||||
B8.04 SNOOP_RESPONSE.HITM
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C0.02 INST_RETIRED.X87
|
||||
C0.04 INST_RETIRED.MMX
|
||||
C2.01 UOPS_RETIRED.ANY
|
||||
C2.02 UOPS_RETIRED.RETIRE_SLOTS
|
||||
C2.04 UOPS_RETIRED.MACRO_FUSED
|
||||
C3.01 MACHINE_CLEARS.CYCLES
|
||||
C3.02 MACHINE_CLEARS.MEM_ORDER
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.CONDITIONAL
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.01 BR_MISP_RETIRED.CONDITIONAL
|
||||
C5.02 BR_MISP_RETIRED.NEAR_CALL
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C7.01 SSEX_UOPS_RETIRED.PACKED_SINGLE
|
||||
C7.02 SSEX_UOPS_RETIRED.SCALAR_SINGLE
|
||||
C7.04 SSEX_UOPS_RETIRED.PACKED_DOUBLE
|
||||
C7.08 SSEX_UOPS_RETIRED.SCALAR_DOUBLE
|
||||
C7.10 SSEX_UOPS_RETIRED.VECTOR_INTEGER
|
||||
C8.20 ITLB_MISS_RETIRED
|
||||
CB.01 MEM_LOAD_RETIRED.L1D_HIT
|
||||
CB.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
CB.04 MEM_LOAD_RETIRED.L3_UNSHARED_HIT
|
||||
CB.08 MEM_LOAD_RETIRED.OTHER_CORE_L2_HIT_HITM
|
||||
CB.10 MEM_LOAD_RETIRED.L3_MISS
|
||||
CB.40 MEM_LOAD_RETIRED.HIT_LFB
|
||||
CB.80 MEM_LOAD_RETIRED.DTLB_MISS
|
||||
CC.01 FP_MMX_TRANS.TO_FP
|
||||
CC.02 FP_MMX_TRANS.TO_MMX
|
||||
CC.03 FP_MMX_TRANS.ANY
|
||||
D0.01 MACRO_INSTS.DECODED
|
||||
D1.01 UOPS_DECODED.STALL_CYCLES
|
||||
D1.02 UOPS_DECODED.MS
|
||||
D1.04 UOPS_DECODED.ESP_FOLDING
|
||||
D1.08 UOPS_DECODED.ESP_SYNC
|
||||
D2.01 RAT_STALLS.FLAGS
|
||||
D2.02 RAT_STALLS.REGISTERS
|
||||
D2.04 RAT_STALLS.ROB_READ_PORT
|
||||
D2.08 RAT_STALLS.SCOREBOARD
|
||||
D2.0F RAT_STALLS.ANY
|
||||
D4.01 SEG_RENAME_STALLS
|
||||
D5.01 ES_REG_RENAMES
|
||||
DB.01 UOP_UNFUSION
|
||||
E0.01 BR_INST_DECODED
|
||||
E5.01 BPU_MISSED_CALL_RET
|
||||
E6.01 BACLEAR.CLEAR
|
||||
E6.02 BACLEAR.BAD_TARGET
|
||||
E8.01 BPU_CLEARS.EARLY
|
||||
E8.02 BPU_CLEARS.LATE
|
||||
EC.01 THREAD_ACTIVE
|
||||
F0.01 L2_TRANSACTIONS.LOAD
|
||||
F0.02 L2_TRANSACTIONS.RFO
|
||||
F0.04 L2_TRANSACTIONS.IFETCH
|
||||
F0.08 L2_TRANSACTIONS.PREFETCH
|
||||
F0.10 L2_TRANSACTIONS.L1D_WB
|
||||
F0.20 L2_TRANSACTIONS.FILL
|
||||
F0.40 L2_TRANSACTIONS.WB
|
||||
F0.80 L2_TRANSACTIONS.ANY
|
||||
F1.02 L2_LINES_IN.S_STATE
|
||||
F1.04 L2_LINES_IN.E_STATE
|
||||
F1.07 L2_LINES_IN.ANY
|
||||
F2.01 L2_LINES_OUT.DEMAND_CLEAN
|
||||
F2.02 L2_LINES_OUT.DEMAND_DIRTY
|
||||
F2.04 L2_LINES_OUT.PREFETCH_CLEAN
|
||||
F2.08 L2_LINES_OUT.PREFETCH_DIRTY
|
||||
F2.0F L2_LINES_OUT.ANY
|
||||
F4.04 SQ_MISC.LRU_HINTS
|
||||
F4.10 SQ_MISC.SPLIT_LOCK
|
||||
F6.01 SQ_FULL_STALL_CYCLES
|
||||
F7.01 FP_ASSIST.ALL
|
||||
F7.02 FP_ASSIST.OUTPUT
|
||||
F7.04 FP_ASSIST.INPUT
|
||||
FD.01 SIMD_INT_64.PACKED_MPY
|
||||
FD.02 SIMD_INT_64.PACKED_SHIFT
|
||||
FD.04 SIMD_INT_64.PACK
|
||||
FD.08 SIMD_INT_64.UNPACK
|
||||
FD.10 SIMD_INT_64.PACKED_LOGICAL
|
||||
FD.20 SIMD_INT_64.PACKED_ARITH
|
||||
FD.40 SIMD_INT_64.SHUFFLE_MOVE
|
19
configs/cfg_Westmere_common.txt
Normal file
19
configs/cfg_Westmere_common.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
# Performance monitoring events for processors based on the Westmere microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_25H and 06_2CH.
|
||||
# See Table 19-22 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.02 UOPS_ISSUED.FUSED
|
||||
C2.01 UOPS_RETIRED.ANY
|
||||
B1.01 UOPS_EXECUTED.PORT0
|
||||
B1.02 UOPS_EXECUTED.PORT1
|
||||
B1.04 UOPS_EXECUTED.PORT2_CORE
|
||||
B1.08 UOPS_EXECUTED.PORT3_CORE
|
||||
B1.10 UOPS_EXECUTED.PORT4_CORE
|
||||
B1.20 UOPS_EXECUTED.PORT5
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
CB.01 MEM_LOAD_RETIRED.L1D_HIT
|
||||
CB.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
CB.04 MEM_LOAD_RETIRED.L3_UNSHARED_HIT
|
||||
CB.10 MEM_LOAD_RETIRED.L3_MISS
|
288
configs/cfg_XeonScalable_all.txt
Normal file
288
configs/cfg_XeonScalable_all.txt
Normal file
@@ -0,0 +1,288 @@
|
||||
# Performance monitoring events for the Intel Xeon Processor Scalable Family.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_55H.
|
||||
# See Table 19-3 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
00.01 INST_RETIRED.ANY
|
||||
00.02 CPU_CLK_UNHALTED.THREAD
|
||||
00.02.AnyT CPU_CLK_UNHALTED.THREAD_ANY
|
||||
00.03 CPU_CLK_UNHALTED.REF_TSC
|
||||
03.02 LD_BLOCKS.STORE_FORWARD
|
||||
03.08 LD_BLOCKS.NO_SR
|
||||
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
|
||||
08.01 DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK
|
||||
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
|
||||
08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
|
||||
08.08 DTLB_LOAD_MISSES.WALK_COMPLETED_1G
|
||||
08.0E DTLB_LOAD_MISSES.WALK_COMPLETED
|
||||
08.10 DTLB_LOAD_MISSES.WALK_PENDING
|
||||
08.10.CMSK=1 DTLB_LOAD_MISSES.WALK_ACTIVE
|
||||
08.20 DTLB_LOAD_MISSES.STLB_HIT
|
||||
0D.01 INT_MISC.RECOVERY_CYCLES
|
||||
0D.01.AnyT INT_MISC.RECOVERY_CYCLES_ANY
|
||||
0D.80 INT_MISC.CLEAR_RESTEER_CYCLES
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.01.CMSK=1.INV UOPS_ISSUED.STALL_CYCLES
|
||||
0E.02 UOPS_ISSUED.VECTOR_WIDTH_MISMATCH
|
||||
0E.20 UOPS_ISSUED.SLOW_LEA
|
||||
14.01.CMSK=1 ARITH.DIVIDER_ACTIVE
|
||||
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
|
||||
24.22 L2_RQSTS.RFO_MISS
|
||||
24.24 L2_RQSTS.CODE_RD_MISS
|
||||
24.27 L2_RQSTS.ALL_DEMAND_MISS
|
||||
24.38 L2_RQSTS.PF_MISS
|
||||
24.3F L2_RQSTS.MISS
|
||||
24.41 L2_RQSTS.DEMAND_DATA_RD_HIT
|
||||
24.42 L2_RQSTS.RFO_HIT
|
||||
24.44 L2_RQSTS.CODE_RD_HIT
|
||||
24.D8 L2_RQSTS.PF_HIT
|
||||
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
|
||||
24.E2 L2_RQSTS.ALL_RFO
|
||||
24.E4 L2_RQSTS.ALL_CODE_RD
|
||||
24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
|
||||
24.F8 L2_RQSTS.ALL_PF
|
||||
24.FF L2_RQSTS.REFERENCES
|
||||
28.07 CORE_POWER.LVL0_TURBO_LICENSE
|
||||
28.18 CORE_POWER.LVL1_TURBO_LICENSE
|
||||
28.20 CORE_POWER.LVL2_TURBO_LICENSE
|
||||
28.40 CORE_POWER.THROTTLE
|
||||
2E.41 LONGEST_LAT_CACHE.MISS
|
||||
2E.4F LONGEST_LAT_CACHE.REFERENCE
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.00.AnyT CPU_CLK_UNHALTED.THREAD_P_ANY
|
||||
3C.00.CMSK=1.EDG CPU_CLK_UNHALTED.RING0_TRANS
|
||||
3C.01 CPU_CLK_THREAD_UNHALTED.REF_XCLK
|
||||
3C.01.AnyT CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY
|
||||
3C.01 CPU_CLK_UNHALTED.REF_XCLK
|
||||
3C.01.AnyT CPU_CLK_UNHALTED.REF_XCLK_ANY
|
||||
3C.02 CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE
|
||||
3C.02 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
|
||||
48.01 L1D_PEND_MISS.PENDING
|
||||
48.01.CMSK=1 L1D_PEND_MISS.PENDING_CYCLES
|
||||
48.01.CMSK=1.AnyT L1D_PEND_MISS.PENDING_CYCLES_ANY
|
||||
48.02 L1D_PEND_MISS.FB_FULL
|
||||
49.01 DTLB_STORE_MISSES.MISS_CAUSES_A_WALK
|
||||
49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
|
||||
49.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
|
||||
49.08 DTLB_STORE_MISSES.WALK_COMPLETED_1G
|
||||
49.0E DTLB_STORE_MISSES.WALK_COMPLETED
|
||||
49.10 DTLB_STORE_MISSES.WALK_PENDING
|
||||
49.10.CMSK=1 DTLB_STORE_MISSES.WALK_ACTIVE
|
||||
49.20 DTLB_STORE_MISSES.STLB_HIT
|
||||
4C.01 LOAD_HIT_PRE.SW_PF
|
||||
4F.10 EPT.WALK_PENDING
|
||||
51.01 L1D.REPLACEMENT
|
||||
54.01 TX_MEM.ABORT_CONFLICT
|
||||
54.02 TX_MEM.ABORT_CAPACITY
|
||||
54.04 TX_MEM.ABORT_HLE_STORE_TO_ELIDED_LOCK
|
||||
54.08 TX_MEM.ABORT_HLE_ELISION_BUFFER_NOT_EMPTY
|
||||
54.10 TX_MEM.ABORT_HLE_ELISION_BUFFER_MISMATCH
|
||||
54.20 TX_MEM.ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGN
|
||||
54.40 TX_MEM.HLE_ELISION_BUFFER_FULL
|
||||
5D.01 TX_EXEC.MISC1
|
||||
5D.02 TX_EXEC.MISC2
|
||||
5D.04 TX_EXEC.MISC3
|
||||
5D.08 TX_EXEC.MISC4
|
||||
5D.10 TX_EXEC.MISC5
|
||||
5E.01 RS_EVENTS.EMPTY_CYCLES
|
||||
5E.01.CMSK=1.EDG.INV RS_EVENTS.EMPTY_END
|
||||
60.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
|
||||
60.01.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD
|
||||
60.01.CMSK=6 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6
|
||||
60.02.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
|
||||
60.02.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD
|
||||
60.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
|
||||
60.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
|
||||
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
|
||||
60.08.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
|
||||
60.10 OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD
|
||||
60.10.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD
|
||||
60.10.CMSK=6 OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD_GE_6
|
||||
79.04 IDQ.MITE_UOPS
|
||||
79.04.CMSK=1 IDQ.MITE_CYCLES
|
||||
79.08 IDQ.DSB_UOPS
|
||||
79.08.CMSK=1 IDQ.DSB_CYCLES
|
||||
79.10.CMSK=1 IDQ.MS_DSB_CYCLES
|
||||
79.18.CMSK=4 IDQ.ALL_DSB_CYCLES_4_UOPS
|
||||
79.18.CMSK=1 IDQ.ALL_DSB_CYCLES_ANY_UOPS
|
||||
79.20 IDQ.MS_MITE_UOPS
|
||||
79.24.CMSK=4 IDQ.ALL_MITE_CYCLES_4_UOPS
|
||||
79.24.CMSK=1 IDQ.ALL_MITE_CYCLES_ANY_UOPS
|
||||
79.30.CMSK=1 IDQ.MS_CYCLES
|
||||
79.30.CMSK=1.EDG IDQ.MS_SWITCHES
|
||||
79.30 IDQ.MS_UOPS
|
||||
80.04 ICACHE_16B.IFDATA_STALL
|
||||
83.01 ICACHE_64B.IFTAG_HIT
|
||||
83.02 ICACHE_64B.IFTAG_MISS
|
||||
83.04 ICACHE_64B.IFTAG_STALL
|
||||
85.01 ITLB_MISSES.MISS_CAUSES_A_WALK
|
||||
85.02 ITLB_MISSES.WALK_COMPLETED_4K
|
||||
85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
|
||||
85.08 ITLB_MISSES.WALK_COMPLETED_1G
|
||||
85.0E ITLB_MISSES.WALK_COMPLETED
|
||||
85.10 ITLB_MISSES.WALK_PENDING
|
||||
85.10.CMSK=1 ITLB_MISSES.WALK_ACTIVE
|
||||
85.20 ITLB_MISSES.STLB_HIT
|
||||
87.01 ILD_STALL.LCP
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
|
||||
9C.01.CMSK=4 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE
|
||||
9C.01.CMSK=3 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_1_UOP_DELIV.CORE
|
||||
9C.01.CMSK=2 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_2_UOP_DELIV.CORE
|
||||
9C.01.CMSK=1 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_3_UOP_DELIV.CORE
|
||||
9C.01.CMSK=1.INV IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.08 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.10 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.20 UOPS_DISPATCHED_PORT.PORT_5
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
||||
A2.01 RESOURCE_STALLS.ANY
|
||||
A2.08 RESOURCE_STALLS.SB
|
||||
A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_MISS
|
||||
A3.02.CMSK=2 CYCLE_ACTIVITY.CYCLES_L3_MISS
|
||||
A3.04.CMSK=4 CYCLE_ACTIVITY.STALLS_TOTAL
|
||||
A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_MISS
|
||||
A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_L3_MISS
|
||||
A3.08.CMSK=8 CYCLE_ACTIVITY.CYCLES_L1D_MISS
|
||||
A3.0C.CMSK=12 CYCLE_ACTIVITY.STALLS_L1D_MISS
|
||||
A3.10.CMSK=16 CYCLE_ACTIVITY.CYCLES_MEM_ANY
|
||||
A3.14.CMSK=20 CYCLE_ACTIVITY.STALLS_MEM_ANY
|
||||
A6.01 EXE_ACTIVITY.EXE_BOUND_0_PORTS
|
||||
A6.02 EXE_ACTIVITY.1_PORTS_UTIL
|
||||
A6.04 EXE_ACTIVITY.2_PORTS_UTIL
|
||||
A6.08 EXE_ACTIVITY.3_PORTS_UTIL
|
||||
A6.10 EXE_ACTIVITY.4_PORTS_UTIL
|
||||
A6.40 EXE_ACTIVITY.BOUND_ON_STORES
|
||||
A8.01 LSD.UOPS
|
||||
A8.01.CMSK=1 LSD.CYCLES_ACTIVE
|
||||
A8.01.CMSK=4 LSD.CYCLES_4_UOPS
|
||||
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
|
||||
AE.01 ITLB.ITLB_FLUSH
|
||||
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
|
||||
B0.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
|
||||
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
|
||||
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
|
||||
B0.10 OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
|
||||
B0.80 OFFCORE_REQUESTS.ALL_REQUESTS
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
B1.01.CMSK=1.INV UOPS_EXECUTED.STALL_CYCLES
|
||||
B1.01.CMSK=1 UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC
|
||||
B1.01.CMSK=2 UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC
|
||||
B1.01.CMSK=3 UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC
|
||||
B1.01.CMSK=4 UOPS_EXECUTED.CYCLES_GE_4_UOPS_EXEC
|
||||
B1.02 UOPS_EXECUTED.CORE
|
||||
B1.02.CMSK=1 UOPS_EXECUTED.CORE_CYCLES_GE_1
|
||||
B1.02.CMSK=2 UOPS_EXECUTED.CORE_CYCLES_GE_2
|
||||
B1.02.CMSK=3 UOPS_EXECUTED.CORE_CYCLES_GE_3
|
||||
B1.02.CMSK=4 UOPS_EXECUTED.CORE_CYCLES_GE_4
|
||||
B1.02.CMSK=1.INV UOPS_EXECUTED.CORE_CYCLES_NONE
|
||||
B1.10 UOPS_EXECUTED.X87
|
||||
B2.01 OFFCORE_REQUESTS_BUFFER.SQ_FULL
|
||||
BD.01 TLB_FLUSH.DTLB_THREAD
|
||||
BD.20 TLB_FLUSH.STLB_ANY
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C0.01 INST_RETIRED.PREC_DIST
|
||||
C1.3F OTHER_ASSISTS.ANY
|
||||
C2.01.CMSK=1.INV UOPS_RETIRED.STALL_CYCLES
|
||||
C2.01.CMSK=10.INV UOPS_RETIRED.TOTAL_CYCLES
|
||||
C2.02.CMSK=1.EDG UOPS_RETIRED.RETIRE_SLOTS
|
||||
C3.01 MACHINE_CLEARS.COUNT
|
||||
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.CONDITIONAL
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C4.08 BR_INST_RETIRED.NEAR_RETURN
|
||||
C4.10 BR_INST_RETIRED.NOT_TAKEN
|
||||
C4.20 BR_INST_RETIRED.NEAR_TAKEN
|
||||
C4.40 BR_INST_RETIRED.FAR_BRANCH
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.01 BR_MISP_RETIRED.CONDITIONAL
|
||||
C5.02 BR_MISP_RETIRED.NEAR_CALL
|
||||
C5.20 BR_MISP_RETIRED.NEAR_TAKEN
|
||||
C6.01 FRONTEND_RETIRED.DSB_MISS
|
||||
C6.01 FRONTEND_RETIRED.L1I_MISS
|
||||
C6.01 FRONTEND_RETIRED.L2_MISS
|
||||
C6.01 FRONTEND_RETIRED.ITLB_MISS
|
||||
C6.01 FRONTEND_RETIRED.STLB_MISS
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_2
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_4
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_8
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_16
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_32
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_64
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_128
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_256
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_512
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_2
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_3
|
||||
C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE
|
||||
C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE
|
||||
C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE
|
||||
C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE
|
||||
C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE
|
||||
C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE
|
||||
C7.40 FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE
|
||||
C7.80 FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE
|
||||
C8.01 HLE_RETIRED.START
|
||||
C8.02 HLE_RETIRED.COMMIT
|
||||
C8.04 HLE_RETIRED.ABORTED
|
||||
C8.08 HLE_RETIRED.ABORTED_MEM
|
||||
C8.10 HLE_RETIRED.ABORTED_TIMER
|
||||
C8.20 HLE_RETIRED.ABORTED_UNFRIENDLY
|
||||
C8.40 HLE_RETIRED.ABORTED_MEMTYPE
|
||||
C8.80 HLE_RETIRED.ABORTED_EVENTS
|
||||
C9.01 RTM_RETIRED.START
|
||||
C9.02 RTM_RETIRED.COMMIT
|
||||
C9.04 RTM_RETIRED.ABORTED
|
||||
C9.08 RTM_RETIRED.ABORTED_MEM
|
||||
C9.10 RTM_RETIRED.ABORTED_TIMER
|
||||
C9.20 RTM_RETIRED.ABORTED_UNFRIENDLY
|
||||
C9.40 RTM_RETIRED.ABORTED_MEMTYPE
|
||||
C9.80 RTM_RETIRED.ABORTED_EVENTS
|
||||
CA.1E.CMSK=1 FP_ASSIST.ANY
|
||||
CB.01 HW_INTERRUPTS.RECEIVED
|
||||
CC.20 ROB_MISC_EVENTS.LBR_INSERTS
|
||||
CD.01.MSR_3F6H=4 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4
|
||||
CD.01.MSR_3F6H=8 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8
|
||||
CD.01.MSR_3F6H=16 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16
|
||||
CD.01.MSR_3F6H=32 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32
|
||||
CD.01.MSR_3F6H=64 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64
|
||||
CD.01.MSR_3F6H=128 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128
|
||||
CD.01.MSR_3F6H=256 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256
|
||||
CD.01.MSR_3F6H=512 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512
|
||||
D0.11 MEM_INST_RETIRED.STLB_MISS_LOADS
|
||||
D0.12 MEM_INST_RETIRED.STLB_MISS_STORES
|
||||
D0.21 MEM_INST_RETIRED.LOCK_LOADS
|
||||
D0.41 MEM_INST_RETIRED.SPLIT_LOADS
|
||||
D0.42 MEM_INST_RETIRED.SPLIT_STORES
|
||||
D0.81 MEM_INST_RETIRED.ALL_LOADS
|
||||
D0.82 MEM_INST_RETIRED.ALL_STORES
|
||||
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
||||
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
D1.04 MEM_LOAD_RETIRED.L3_HIT
|
||||
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
||||
D1.10 MEM_LOAD_RETIRED.L2_MISS
|
||||
D1.20 MEM_LOAD_RETIRED.L3_MISS
|
||||
D1.40 MEM_LOAD_RETIRED.FB_HIT
|
||||
D2.01 MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS
|
||||
D2.02 MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT
|
||||
D2.04 MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM
|
||||
D2.08 MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE
|
||||
D3.01 MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM
|
||||
D3.02 MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM
|
||||
D3.04 MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM
|
||||
D3.08 MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD
|
||||
D4.04 MEM_LOAD_MISC_RETIRED.UC
|
||||
E6.01 BACLEARS.ANY
|
||||
F0.40 L2_TRANS.L2_WB
|
||||
F1.1F L2_LINES_IN.ALL
|
||||
F2.01 L2_LINES_OUT.SILENT
|
||||
F2.02 L2_LINES_OUT.NON_SILENT
|
||||
F2.04 L2_LINES_OUT.USELESS_PREF
|
||||
F2.04 L2_LINES_OUT.USELESS_HWPF
|
||||
F4.10 SQ_MISC.SPLIT_LOCK
|
||||
FE.02 IDI_MISC.WB_UPGRADE
|
||||
FE.04 IDI_MISC.WB_DOWNGRADE
|
22
configs/cfg_XeonScalable_common.txt
Normal file
22
configs/cfg_XeonScalable_common.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# Performance monitoring events for the Intel Xeon Processor Scalable Family.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_55H.
|
||||
# See Table 19-3 of Intel's "System Programming Guide" (Jan. 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||
A1.08 UOPS_DISPATCHED_PORT.PORT_3
|
||||
A1.10 UOPS_DISPATCHED_PORT.PORT_4
|
||||
A1.20 UOPS_DISPATCHED_PORT.PORT_5
|
||||
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
||||
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
||||
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
||||
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
D1.10 MEM_LOAD_RETIRED.L2_MISS
|
||||
D1.04 MEM_LOAD_RETIRED.L3_HIT
|
||||
D1.20 MEM_LOAD_RETIRED.L3_MISS
|
143
configs/cfg_Zen_all.txt
Normal file
143
configs/cfg_Zen_all.txt
Normal file
@@ -0,0 +1,143 @@
|
||||
# Performance monitoring events for AMD Family 17h processors.
|
||||
# See Section 2.1.13.3 of AMD's "Preliminary Processor Programming Reference" (Apr. 2017)
|
||||
|
||||
000.01 FpuPipeAssignment.Total0
|
||||
000.02 FpuPipeAssignment.Total1
|
||||
000.04 FpuPipeAssignment.Total2
|
||||
000.08 FpuPipeAssignment.Total3
|
||||
000.10 FpuPipeAssignment.Dual0
|
||||
000.20 FpuPipeAssignment.Dual1
|
||||
000.40 FpuPipeAssignment.Dual2
|
||||
000.80 FpuPipeAssignment.Dual3
|
||||
001.00 FpSchedEmpty
|
||||
002.01 FpRetx87FpOps.AddSubOps
|
||||
002.02 FpRetx87FpOps.MulOps
|
||||
002.04 FpRetx87FpOps.DivSqrROps
|
||||
003.01 FpRetSseAvxOps.SpAddSubFlops
|
||||
003.02 FpRetSseAvxOps.SpMultFlops
|
||||
003.04 FpRetSseAvxOps.SpDivFlops
|
||||
003.08 FpRetSseAvxOps.SpMultAddFlops
|
||||
003.10 FpRetSseAvxOps.DpAddSubFlops
|
||||
003.20 FpRetSseAvxOps.DpMultFlops
|
||||
003.40 FpRetSseAvxOps.DpDivFlops
|
||||
003.80 FpRetSseAvxOps.DpMultAddFlops
|
||||
004.01 FpNumMovElimScalOp.Optimized
|
||||
004.02 FpNumMovElimScalOp.OptPotential
|
||||
004.04 FpNumMovElimScalOp.SseMovOpsElim
|
||||
004.08 FpNumMovElimScalOp.SseMovOps
|
||||
005.01 FpRetiredSerOps.SseBotRet
|
||||
005.02 FpRetiredSerOps.SseCtrlRet
|
||||
005.04 FpRetiredSerOps.X87BotRet
|
||||
005.08 FpRetiredSerOps.X87CtrlRet
|
||||
025.01 LsLocks.BusLock
|
||||
025.02 LsLocks.NonSpecLock
|
||||
025.04 LsLocks.SpecLock
|
||||
025.08 LsLocks.SpecLockMapCommit
|
||||
029.01 LsDispatch.LdDispatch
|
||||
029.02 LsDispatch.StoreDispatch
|
||||
029.04 LsDispatch.LdStDispatch
|
||||
035.00 LsSTLF
|
||||
040.00 LsDcAccesses
|
||||
041.01 LsMabAllocPipe.DataPipe
|
||||
041.02 LsMabAllocPipe.StPipe
|
||||
041.04 LsMabAllocPipe.TlbPipeLate
|
||||
041.08 LsMabAllocPipe.HwPf
|
||||
041.10 LsMabAllocPipe.TlbPipeEarly
|
||||
045.01 LsL1DTlbMiss.TlbReload4KL2Hit
|
||||
045.02 LsL1DTlbMiss.TlbReload32KL2Hit
|
||||
045.04 LsL1DTlbMiss.TlbReload2ML2Hit
|
||||
045.08 LsL1DTlbMiss.TlbReload1GL2Hit
|
||||
045.10 LsL1DTlbMiss.TlbReload4KL2Miss
|
||||
045.20 LsL1DTlbMiss.TlbReload32KL2Miss
|
||||
045.40 LsL1DTlbMiss.TlbReload2ML2Miss
|
||||
045.80 LsL1DTlbMiss.TlbReload1GL2Miss
|
||||
046.01 LsTablewalker.PerfMonTablewalkAllocDside0
|
||||
046.02 LsTablewalker.PerfMonTablewalkAllocDside1
|
||||
046.04 LsTablewalker.PerfMonTablewalkAllocIside0
|
||||
046.08 LsTablewalker.PerfMonTablewalkAllocIside1
|
||||
047.00 LsMisalAccesses
|
||||
04B.01 LsPrefInstrDisp.LoadPrefetchW
|
||||
04B.02 LsPrefInstrDisp.StorePrefetchW
|
||||
04B.04 LsPrefInstrDisp.PrefetchNTA
|
||||
052.01 LsInefSwPref.MabMchCnt
|
||||
052.02 LsInefSwPref.DataPipeSwPfDcHit
|
||||
076.00 LsNotHaltedCyc
|
||||
080.00 IcFw32
|
||||
081.00 IcFw32Miss
|
||||
082.00 IcCacheFillL2
|
||||
083.00 IcCacheFillSys
|
||||
084.00 BpL1TlbMissL2Hit
|
||||
085.00 BpL1TlbMissL2Miss
|
||||
086.00 BpSnpReSync
|
||||
087.01 IcFetchStall.IcStallBackPressure
|
||||
087.02 IcFetchStall.IcStallDqEmpty
|
||||
087.04 IcFetchStall.IcStallAny
|
||||
08A.00 BpL1BTBCorrect
|
||||
08B.00 BpL2BTBCorrect
|
||||
08C.01 IcCacheInval.FillInvalidated
|
||||
08C.02 IcCacheInval.L2InvalidatingProbe
|
||||
099.00 BpTlbRel
|
||||
28A.01 IcOcModeSwitch.IcOcModeSwitch
|
||||
28A.02 IcOcModeSwitch.OcIcModeSwitch
|
||||
0AF.01 DeDisDispatchTokenStalls0.ALSQ1TokenStall
|
||||
0AF.02 DeDisDispatchTokenStalls0.ALSQ2TokenStall
|
||||
0AF.04 DeDisDispatchTokenStalls0.ALSQ3TokenStall
|
||||
0AF.08 DeDisDispatchTokenStalls0.ALSQ3_0_TokenStall
|
||||
0AF.10 DeDisDispatchTokenStalls0.ALUTokenStall
|
||||
0AF.20 DeDisDispatchTokenStalls0.AGSQTokenStall
|
||||
0AF.40 DeDisDispatchTokenStalls0.RetireTokenStall
|
||||
0C0.00 ExRetInstr
|
||||
0C1.00 ExRetCops
|
||||
0C2.00 ExRetBrn
|
||||
0C3.00 ExRetBrnMisp
|
||||
0C4.00 ExRetBrnTkn
|
||||
0C5.00 ExRetBrnTknMisp
|
||||
0C6.00 ExRetBrnFar
|
||||
0C7.00 ExRetBrnResync
|
||||
0C8.00 ExRetNearRet
|
||||
0C9.00 ExRetNearRetMispred
|
||||
0CA.00 ExRetBrnIndMisp
|
||||
0CB.01 ExRetMmxFpInstr.X87Instr
|
||||
0CB.02 ExRetMmxFpInstr.MmxInstr
|
||||
0CB.04 ExRetMmxFpInstr.SseInstr
|
||||
0D1.00 ExRetCond
|
||||
0D2.00 ExRetCondMisp
|
||||
0D3.00 ExDivBusy
|
||||
0D4.00 ExDivCount
|
||||
1CF.01 ExTaggedIbsOps.IbsTaggedOps
|
||||
1CF.02 ExTaggedIbsOps.IbsTaggedOpsRet
|
||||
1CF.04 ExTaggedIbsOps.IbsCountRollover
|
||||
1D0.00 IbsCountRollover
|
||||
060.01 L2RequestG1.OtherRequests
|
||||
060.02 L2RequestG1.L2HwPf
|
||||
060.04 L2RequestG1.PrefetchL2
|
||||
060.08 L2RequestG1.ChangeToX
|
||||
060.10 L2RequestG1.CacheableIcRead
|
||||
060.20 L2RequestG1.LsRdBlkC_S
|
||||
060.40 L2RequestG1.RdBlkX
|
||||
060.80 L2RequestG1.RdBlkL
|
||||
061.01 L2RequestG2.BusLocksResponses
|
||||
061.02 L2RequestG2.BusLocksOriginator
|
||||
061.04 L2RequestG2.SmcInval
|
||||
061.08 L2RequestG2.IcRdSizedNC
|
||||
061.10 L2RequestG2.IcRdSized
|
||||
061.20 L2RequestG2.LsRdSizedNC
|
||||
061.40 L2RequestG2.LsRdSized
|
||||
061.80 L2RequestG2.Group1
|
||||
062.01 L2Latency.L2CyclesWaitingOnFills
|
||||
063.01 L2WbcReq.CLZero
|
||||
063.02 L2WbcReq.LocalIcClr
|
||||
063.04 L2WbcReq.ZeroByteStore
|
||||
063.08 L2WbcReq.I_LineFlush
|
||||
063.10 L2WbcReq.CacheLineFlush
|
||||
063.20 L2WbcReq.WcbClose
|
||||
063.40 L2WbcReq.WcbWrite
|
||||
064.01 L2CacheReqStat.IcFillMiss
|
||||
064.02 L2CacheReqStat.IcFillHitS
|
||||
064.04 L2CacheReqStat.IcFillHitX
|
||||
064.08 L2CacheReqStat.LsRdBlkC
|
||||
064.10 L2CacheReqStat.LsRdBlkX
|
||||
064.20 L2CacheReqStat.LsRdBlkLHitS
|
||||
064.40 L2CacheReqStat.LsRdBlkLHitX
|
||||
064.80 L2CacheReqStat.LsRdBlkCS
|
||||
06D.01 L2FillPending.L2FillBusy
|
13
configs/cfg_Zen_common.txt
Normal file
13
configs/cfg_Zen_common.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
# Performance monitoring events for AMD Family 17h processors.
|
||||
# See Section 2.1.13.3 of AMD's "Preliminary Processor Programming Reference" (Apr. 2017)
|
||||
|
||||
076.00 LsNotHaltedCyc
|
||||
0C0.00 ExRetInstr
|
||||
0C1.00 ExRetCops
|
||||
000.01 FpuPipeAssignment.Total0
|
||||
000.02 FpuPipeAssignment.Total1
|
||||
000.04 FpuPipeAssignment.Total2
|
||||
000.08 FpuPipeAssignment.Total3
|
||||
0C2.00 ExRetBrn
|
||||
0C3.00 ExRetBrnMisp
|
||||
040.00 LsDcAccesses
|
15
disable-HT.sh
Executable file
15
disable-HT.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
# Disables hyper-threading by taking sibling hardware threads offline.
# For every CPU directory cpu1 and up, the second number found in its
# topology/thread_siblings_list is treated as the sibling to switch off.

if [ "$EUID" -ne 0 ]; then
    echo "This script must be run as root" 1>&2
    exit 1
fi

for cpu in /sys/devices/system/cpu/cpu[1-9]*; do
    # The check skips CPU directories that have no sibling list to read.
    if [ -e "$cpu/topology/thread_siblings_list" ]; then
        # Split on any non-digit, so both "1,5" and "1-5" yield "5".
        sibling=$(awk -F '[^0-9]' '{ print $2 }' "$cpu/topology/thread_siblings_list")
        # Quoted test: an empty result means this CPU has no listed sibling.
        if [ -n "$sibling" ]; then
            echo 0 > "/sys/devices/system/cpu/cpu$sibling/online"
        fi
    fi
done
|
10
enable-HT.sh
Executable file
10
enable-HT.sh
Executable file
@@ -0,0 +1,10 @@
|
||||
#!/bin/bash
# Brings every logical CPU from cpu1 upwards back online by writing 1 to
# its sysfs "online" file.

if [[ $EUID -ne 0 ]]; then
    echo "This script must be run as root" >&2
    exit 1
fi

for cpu_dir in /sys/devices/system/cpu/cpu[1-9]*; do
    echo 1 > "${cpu_dir}/online"
done
|
79
kernel-nanoBench.sh
Executable file
79
kernel-nanoBench.sh
Executable file
@@ -0,0 +1,79 @@
|
||||
#!/bin/bash
# Command-line front end for the nanoBench kernel module.
# Each option is translated into a write to the matching file in /sys/nb;
# reading /sys/nb/run at the end prints the benchmark results.

if [ "$EUID" -ne 0 ]; then
    echo "Error: nanoBench requires root privileges" 1>&2
    echo "Try \"sudo ./kernel-nanoBench.sh ...\"" 1>&2
    exit 1
fi

if [ ! -e /sys/nb ]; then
    echo "Error: nanoBench kernel module not loaded" 1>&2
    echo "Load with \"sudo insmod nb.ko\"" 1>&2
    exit 1
fi

# require_value "$@": aborts unless the current option is followed by a value.
# Without this check, "shift 2" fails when only one argument is left and the
# option loop would never terminate.
require_value() {
    if [ "$#" -lt 2 ]; then
        echo "Error: option $1 requires an argument" 1>&2
        exit 1
    fi
}

# assemble <asm text> <temp file basename> <sysfs target>
# Assembles the text (Intel syntax) and writes the raw machine code to the
# target. The temporary files are removed even when assembling fails.
assemble() {
    echo ".intel_syntax noprefix" > "$2.s"
    echo "$1" >> "$2.s"
    as "$2.s" -o "$2.o" && objcopy "$2.o" -O binary "$3"
    local status=$?
    rm -f "$2.s" "$2.o"
    return $status
}

# Reading this file resets all module settings to their defaults.
cat /sys/nb/reset

# The order of the patterns matters: exact names are tested before the
# prefix patterns they would otherwise match (e.g. -no_mem before -n*).
while [ "$1" ]; do
    case "$1" in
        -asm_init)
            require_value "$@"
            assemble "$2" asm-init /sys/nb/init || exit 1
            shift 2
            ;;
        -asm)
            require_value "$@"
            assemble "$2" asm-code /sys/nb/code || exit 1
            shift 2
            ;;
        -code_init)
            require_value "$@"
            cp "$2" /sys/nb/init
            shift 2
            ;;
        -code)
            require_value "$@"
            cp "$2" /sys/nb/code
            shift 2
            ;;
        -config)
            require_value "$@"
            cp "$2" /sys/nb/config
            shift 2
            ;;
        -u*)
            require_value "$@"
            echo "$2" > /sys/nb/unroll_count
            shift 2
            ;;
        -l*)
            require_value "$@"
            echo "$2" > /sys/nb/loop_count
            shift 2
            ;;
        -no_mem)
            echo "1" > /sys/nb/no_mem
            shift
            ;;
        -n*)
            require_value "$@"
            echo "$2" > /sys/nb/n_measurements
            shift 2
            ;;
        -b*)
            echo "1" > /sys/nb/basic_mode
            shift
            ;;
        -v*)
            echo "1" > /sys/nb/verbose
            shift
            ;;
        -w*)
            require_value "$@"
            echo "$2" > /sys/nb/warm_up
            shift 2
            ;;
        -initial*)
            require_value "$@"
            echo "$2" > /sys/nb/initial_warm_up
            shift 2
            ;;
        -min*)
            echo "min" > /sys/nb/agg
            shift
            ;;
        -med*)
            echo "med" > /sys/nb/agg
            shift
            ;;
        -avg*)
            echo "avg" > /sys/nb/agg
            shift
            ;;
        *)
            # Abort: without a shift or exit here the loop would spin forever.
            echo "Invalid option: " "$1" 1>&2
            exit 1
            ;;
    esac
done

cat /sys/nb/run
|
20
kernel/Makefile
Normal file
20
kernel/Makefile
Normal file
@@ -0,0 +1,20 @@
|
||||
# Kbuild makefile for the nanoBench kernel module (nb.ko).
MODULE_NAME = nb

SRC := nb_km.c ../common/nanoBench.c

$(MODULE_NAME)-objs += $(SRC:.c=.o)

obj-m += $(MODULE_NAME).o

# DEBUG makes the pr_debug() calls in the module produce output.
CFLAGS_nb_km.o := -DDEBUG
CFLAGS_nanoBench.o := -DDEBUG

ccflags-y+=-std=gnu99 -Wno-declaration-after-statement

.PHONY: all clean

# $(MAKE) instead of a literal "make" so that flags and the jobserver of the
# calling make are passed on to the kernel build.
all:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

clean:
	rm -f ../common/*.o ../common/*.ur-safe
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
|
||||
|
380
kernel/nb_km.c
Normal file
380
kernel/nb_km.c
Normal file
@@ -0,0 +1,380 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <../arch/x86/include/asm/fpu/api.h>
|
||||
|
||||
#include "../common/nanoBench.h"
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Andreas Abel");
|
||||
|
||||
static ssize_t init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
memcpy(buf, code_init, code_init_length);
|
||||
return code_init_length;
|
||||
}
|
||||
static ssize_t init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
memcpy(code_init, buf, count);
|
||||
code_init_length = count;
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute code_init_attribute =__ATTR(init, 0660, init_show, init_store);
|
||||
|
||||
static ssize_t code_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
memcpy(buf, code, code_length);
|
||||
return code_length;
|
||||
}
|
||||
static ssize_t code_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
memcpy(code, buf, count);
|
||||
code_length = count;
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute code_attribute =__ATTR(code, 0660, code_show, code_store);
|
||||
|
||||
static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
ssize_t count = 0;
|
||||
for (int i=0; i<n_pfc_configs; i++) {
|
||||
if (is_Intel_CPU) {
|
||||
count += sprintf(&(buf[count]), "%02lx.%02lx %s\n", pfc_configs[i].evt_num, pfc_configs[i].umask, pfc_configs[i].description);
|
||||
} else {
|
||||
count += sprintf(&(buf[count]), "%03lx.%02lx %s\n", pfc_configs[i].evt_num, pfc_configs[i].umask, pfc_configs[i].description);
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
static ssize_t config_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
memcpy(pfc_config_file_content, buf, count);
|
||||
pfc_config_file_content[count] = '\0';
|
||||
parse_counter_configs();
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute config_attribute =__ATTR(config, 0660, config_show, config_store);
|
||||
|
||||
static ssize_t unroll_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%ld\n", unroll_count);
|
||||
}
|
||||
static ssize_t unroll_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%ld", &unroll_count);
|
||||
vfree(runtime_code);
|
||||
runtime_code = __vmalloc(PAGE_SIZE + (unroll_count)*PAGE_SIZE*2 + 10000, GFP_KERNEL, PAGE_KERNEL_EXEC);
|
||||
if (!runtime_code) {
|
||||
pr_debug("failed to allocate executable memory\n");
|
||||
}
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute unroll_count_attribute =__ATTR(unroll_count, 0660, unroll_count_show, unroll_count_store);
|
||||
|
||||
static ssize_t loop_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%ld\n", loop_count);
|
||||
}
|
||||
static ssize_t loop_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%ld", &loop_count);
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute loop_count_attribute =__ATTR(loop_count, 0660, loop_count_show, loop_count_store);
|
||||
|
||||
static ssize_t n_measurements_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%ld\n", n_measurements);
|
||||
}
|
||||
static ssize_t n_measurements_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%ld", &n_measurements);
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
kfree(measurement_results[i]);
|
||||
measurement_results[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
|
||||
}
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute n_measurements_attribute =__ATTR(n_measurements, 0660, n_measurements_show, n_measurements_store);
|
||||
|
||||
static ssize_t warm_up_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%ld\n", warm_up_count);
|
||||
}
|
||||
static ssize_t warm_up_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%ld", &warm_up_count);
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute warm_up_attribute =__ATTR(warm_up, 0660, warm_up_show, warm_up_store);
|
||||
|
||||
static ssize_t initial_warm_up_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%ld\n", initial_warm_up_count);
|
||||
}
|
||||
static ssize_t initial_warm_up_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%ld", &initial_warm_up_count);
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute initial_warm_up_attribute =__ATTR(initial_warm_up, 0660, initial_warm_up_show, initial_warm_up_store);
|
||||
|
||||
static ssize_t basic_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%u\n", basic_mode);
|
||||
}
|
||||
static ssize_t basic_mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%u", &basic_mode);
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute basic_mode_attribute =__ATTR(basic_mode, 0660, basic_mode_show, basic_mode_store);
|
||||
|
||||
static ssize_t no_mem_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%u\n", no_mem);
|
||||
}
|
||||
static ssize_t no_mem_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%u", &no_mem);
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute no_mem_attribute =__ATTR(no_mem, 0660, no_mem_show, no_mem_store);
|
||||
|
||||
static ssize_t agg_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%d\n", aggregate_function);
|
||||
}
|
||||
static ssize_t agg_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
if (!strncmp(buf, "min", 3)) {
|
||||
aggregate_function = MIN;
|
||||
} else if (!strncmp(buf, "med", 3)) {
|
||||
aggregate_function = MED;
|
||||
} else {
|
||||
aggregate_function = AVG_20_80;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute agg_attribute =__ATTR(agg, 0660, agg_show, agg_store);
|
||||
|
||||
static ssize_t verbose_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%u\n", verbose);
|
||||
}
|
||||
static ssize_t verbose_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%u", &verbose);
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute verbose_attribute =__ATTR(verbose, 0660, verbose_show, verbose_store);
|
||||
|
||||
static ssize_t clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
code_init_length = 0;
|
||||
code_length = 0;
|
||||
return 0;
|
||||
}
|
||||
static ssize_t clear_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
return 0;
|
||||
}
|
||||
static struct kobj_attribute clear_attribute =__ATTR(clear, 0660, clear_show, clear_store);
|
||||
|
||||
static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
n_measurements = N_MEASUREMENTS_DEFAULT;
|
||||
unroll_count = UNROLL_COUNT_DEFAULT;
|
||||
loop_count = LOOP_COUNT_DEFAULT;
|
||||
warm_up_count = WARM_UP_COUNT_DEFAULT;
|
||||
initial_warm_up_count = INITIAL_WARM_UP_COUNT_DEFAULT;
|
||||
|
||||
no_mem = NO_MEM_DEFAULT;
|
||||
basic_mode = BASIC_MODE_DEFAULT;
|
||||
aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
|
||||
verbose = VERBOSE_DEFAULT;
|
||||
|
||||
code_init_length = 0;
|
||||
code_length = 0;
|
||||
n_pfc_configs = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
static ssize_t reset_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
return 0;
|
||||
}
|
||||
static struct kobj_attribute reset_attribute =__ATTR(reset, 0660, reset_show, reset_store);
|
||||
|
||||
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
kernel_fpu_begin();
|
||||
|
||||
long base_unroll_count = (basic_mode?0:unroll_count);
|
||||
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
|
||||
long base_loop_count = (basic_mode?0:loop_count);
|
||||
long main_loop_count = loop_count;
|
||||
|
||||
char* measurement_template;
|
||||
|
||||
/*********************************
|
||||
* Fixed-function counters.
|
||||
********************************/
|
||||
if (is_AMD_CPU) {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_FF_template_AMD_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_FF_template_AMD;
|
||||
}
|
||||
} else {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_FF_template_Intel_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_FF_template_Intel;
|
||||
}
|
||||
}
|
||||
|
||||
configure_perf_ctrs_FF(0, 1);
|
||||
|
||||
run_warmup_experiment(measurement_template);
|
||||
|
||||
if (is_AMD_CPU) {
|
||||
run_experiment(measurement_template, measurement_results_base, 3, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, 3, main_unroll_count, main_loop_count);
|
||||
|
||||
if (verbose) {
|
||||
pr_debug("\nRDTSC, MPERF, and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||
print_all_measurement_results(measurement_results_base, 3);
|
||||
pr_debug("RDTSC, MPERF, and and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||
print_all_measurement_results(measurement_results, 3);
|
||||
}
|
||||
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "RDTSC", 0);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "MPERF", 1);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "APERF", 2);
|
||||
} else {
|
||||
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
|
||||
|
||||
if (verbose) {
|
||||
pr_debug("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||
print_all_measurement_results(measurement_results_base, 4);
|
||||
pr_debug("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||
print_all_measurement_results(measurement_results, 4);
|
||||
}
|
||||
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "RDTSC", 0);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "Instructions retired", 1);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "Core cycles", 2);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "Reference cycles", 3);
|
||||
}
|
||||
|
||||
/*********************************
|
||||
* Programmable counters.
|
||||
********************************/
|
||||
if (is_AMD_CPU) {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_template_AMD_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_template_AMD;
|
||||
}
|
||||
} else {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_template_Intel_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_template_Intel;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i=0; i<n_pfc_configs; i+=n_programmable_counters) {
|
||||
configure_perf_ctrs_programmable(i, min(i+n_programmable_counters, n_pfc_configs), 0, 1);
|
||||
|
||||
run_experiment(measurement_template, measurement_results_base, n_programmable_counters, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, n_programmable_counters, main_unroll_count, main_loop_count);
|
||||
|
||||
if (verbose) {
|
||||
pr_debug("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||
print_all_measurement_results(measurement_results_base, n_programmable_counters);
|
||||
pr_debug("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||
print_all_measurement_results(measurement_results, n_programmable_counters);
|
||||
}
|
||||
|
||||
for (int c=0; c < n_programmable_counters && i + c < n_pfc_configs; c++) {
|
||||
if (!pfc_configs[i+c].invalid) compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), pfc_configs[i+c].description, c);
|
||||
}
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
|
||||
return strlen(buf);
|
||||
}
|
||||
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
return 0;
|
||||
}
|
||||
static struct kobj_attribute run_attribute =__ATTR(run, 0660, run_show, run_store);
|
||||
|
||||
static struct kobject* nb_kobject;
|
||||
|
||||
static int __init nb_init (void) {
|
||||
pr_debug("Initializing nanoBench kernel module...\n");
|
||||
|
||||
if (check_cpuid()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
code = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
if(!code){
|
||||
printk(KERN_ERR "Could not allocate memory for code\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
code_init = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
if(!code_init){
|
||||
printk(KERN_ERR "Could not allocate memory for code_init\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
pfc_config_file_content = kmalloc(PAGE_SIZE+1, GFP_KERNEL);
|
||||
if(!pfc_config_file_content){
|
||||
printk(KERN_ERR "Could not allocate memory for pfc_config_file_content\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
measurement_results[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
|
||||
measurement_results_base[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
|
||||
if(!measurement_results[i] || !measurement_results_base[i]){
|
||||
printk(KERN_ERR "Could not allocate memory for measurement_results\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
runtime_mem = kmalloc(2*1024*1024, GFP_KERNEL);
|
||||
if(!runtime_mem){
|
||||
printk(KERN_ERR "Could not allocate memory for runtime_mem\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
runtime_code = __vmalloc(PAGE_SIZE + (unroll_count)*PAGE_SIZE*2 + 10000, GFP_KERNEL, PAGE_KERNEL_EXEC);
|
||||
if (!runtime_code) {
|
||||
pr_debug("failed to allocate executable memory\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
nb_kobject = kobject_create_and_add("nb", kernel_kobj->parent);
|
||||
if(!nb_kobject) {
|
||||
pr_debug("failed to create and add nb\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
int error = sysfs_create_file(nb_kobject, &run_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &clear_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &reset_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &code_init_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &code_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &config_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &loop_count_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &unroll_count_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &verbose_attribute.attr);
|
||||
|
||||
if (error) {
|
||||
pr_debug("failed to create file in /sys/nb/\n");
|
||||
return error;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Module exit point: removes /sys/nb and releases all buffers.
 *
 * The sysfs directory is removed first, so that no show/store handler can
 * still be reached through it while the buffers those handlers use are
 * being freed. kfree() and vfree() accept NULL, so no guards are needed.
 */
static void __exit nb_exit (void) {
    kobject_put(nb_kobject);

    kfree(code);
    kfree(code_init);
    kfree(pfc_config_file_content);
    vfree(runtime_code);
    kfree(runtime_mem);

    for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
        kfree(measurement_results[i]);
        kfree(measurement_results_base[i]);
    }
}

module_init(nb_init);
module_exit(nb_exit);
|
73
nanoBench.sh
Executable file
73
nanoBench.sh
Executable file
@@ -0,0 +1,73 @@
|
||||
#!/bin/bash
# Front end for the user-space version of nanoBench.
# Translates -asm/-asm_init into assembled binaries, prepares the system
# (rdpmc access, msr module, watchdogs), runs user/nanoBench with the
# remaining options, and restores the previous system settings afterwards.

if [ "$EUID" -ne 0 ]; then
    echo "Error: nanoBench requires root privileges" 1>&2
    echo "Try \"sudo ./nanoBench.sh ...\"" 1>&2
    exit 1
fi

if ! command -v rdmsr &>/dev/null; then
    echo "Error: nanoBench requires msr-tools" 1>&2
    echo "Install with \"sudo apt install msr-tools\"" 1>&2
    exit 1
fi

# The arguments for user/nanoBench are collected in an array so that values
# containing whitespace (e.g. file names) reach the binary unchanged.
args=()
while [ "$#" -ge 2 ]; do
    if [ "$1" == '-asm' ]; then
        echo ".intel_syntax noprefix" > asm-code.s
        echo "$2" >> asm-code.s
        as asm-code.s -o asm-code.o || exit
        objcopy asm-code.o -O binary asm-code.bin
        args+=(-code asm-code.bin)
        shift 2
    elif [ "$1" == '-asm_init' ]; then
        echo ".intel_syntax noprefix" > asm-init.s
        echo "$2" >> asm-init.s
        as asm-init.s -o asm-init.o || exit
        objcopy asm-init.o -O binary asm-init.bin
        args+=(-code_init asm-init.bin)
        shift 2
    else
        args+=("$1")
        shift
    fi
done
# At most one argument is left at this point; pass it through as well.
args+=("$@")

prev_rdpmc=$(cat /sys/bus/event_source/devices/cpu/rdpmc)
echo 2 > /sys/bus/event_source/devices/cpu/rdpmc || exit

# Status 0 means that the msr module was not loaded before and has been
# loaded now, so it is unloaded again at the end.
modprobe --first-time msr &>/dev/null
msr_prev_loaded=$?

# (Temporarily) disable watchdogs, see https://github.com/obilaniu/libpfc
# Because of the negation, a non-zero status means that the module was loaded
# and has been removed, so it is loaded again at the end.
! modprobe --first-time -r iTCO_wdt &>/dev/null
iTCO_wdt_prev_loaded=$?

! modprobe --first-time -r iTCO_vendor_support &>/dev/null
iTCO_vendor_support_prev_loaded=$?

prev_nmi_watchdog=$(cat /proc/sys/kernel/nmi_watchdog)
echo 0 > /proc/sys/kernel/nmi_watchdog

user/nanoBench "${args[@]}"

rm -f asm-code.*
rm -f asm-init.*

echo "$prev_rdpmc" > /sys/bus/event_source/devices/cpu/rdpmc
echo "$prev_nmi_watchdog" > /proc/sys/kernel/nmi_watchdog

if [[ $msr_prev_loaded == 0 ]]; then
    modprobe -r msr
fi

if [[ $iTCO_wdt_prev_loaded != 0 ]]; then
    modprobe iTCO_wdt &>/dev/null
fi

if [[ $iTCO_vendor_support_prev_loaded != 0 ]]; then
    modprobe iTCO_vendor_support &>/dev/null
fi
|
11
user/Makefile
Normal file
11
user/Makefile
Normal file
@@ -0,0 +1,11 @@
|
||||
# Builds the user-space nanoBench binary. Compilation and linking use make's
# built-in rules; VPATH lets them find nanoBench.c in ../common/.
CC=gcc
CFLAGS=-Wall
VPATH=../common/

# Not files: keep make from skipping these targets if such a file ever exists.
.PHONY: all clean

all: nanoBench

nanoBench: nanoBench_main.o nanoBench.o
nanoBench_main.o: nanoBench_main.c ../common/nanoBench.h
nanoBench.o: ../common/nanoBench.c ../common/nanoBench.h

clean:
	rm -f nanoBench *.o ../common/*.o
|
299
user/nanoBench_main.c
Normal file
299
user/nanoBench_main.c
Normal file
@@ -0,0 +1,299 @@
|
||||
#define _GNU_SOURCE
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <getopt.h>
|
||||
#include <sys/mman.h>
|
||||
#include <fcntl.h>
|
||||
#include <sched.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "../common/nanoBench.h"
|
||||
|
||||
/* Prints the list of supported command-line options to stdout. */
void print_usage() {
    static const char *const usage_lines[] = {
        "\n",
        "nanoBench usage:\n",
        "\n",
        " -code <filename>: Binary file containing the code to be benchmarked.\n",
        " -code_init <filename>: Binary file containing code to be executed once in the beginning\n",
        " -config <filename>: File with performance counter event specifications.\n",
        " -n_measurements <n>: Number of times the measurements are repeated.\n",
        " -unroll_count <n>: Number of copies of the benchmark code inside the inner loop.\n",
        " -loop_count <n>: Number of iterations of the inner loop.\n",
        " -warm_up_count <n>: Number of runs before the first measurement gets recorded.\n",
        " -initial_warm_up_count <n>: Number of runs before any measurement is performed.\n",
        " -avg: Selects the arithmetic mean as the aggregate function.\n",
        " -median: Selects the median as the aggregate function.\n",
        " -min: Selects the minimum as the aggregate function.\n",
        " -basic_mode: Enables basic mode.\n",
        " -no_mem: The code for reading the perf. ctrs. does not make memory accesses.\n",
        " -verbose: Outputs the results of all performance counter readings.\n",
        " -cpu <n>: Pins the measurement thread to CPU n. \n",
        " -usr <n>: If 1, counts events at a privilege level greater than 0.\n",
        " -os <n>: If 1, counts events at a privilege level 0.\n",
    };
    const size_t n_lines = sizeof(usage_lines) / sizeof(usage_lines[0]);

    /* None of the lines contains a conversion specifier, so they are written verbatim. */
    for (size_t line = 0; line < n_lines; line++) {
        fputs(usage_lines[line], stdout);
    }
}
|
||||
|
||||
/*
 * Maps the file `filename` read-only into memory.
 *
 * On success, *content points to the file's bytes and the file size is
 * returned. For an empty file, 0 is returned and *content points to an empty
 * string (mmap() cannot create zero-length mappings). The mapping is never
 * unmapped by this function.
 *
 * If the file cannot be opened, measured, or mapped, an error message is
 * printed to stderr and the process exits with status 1.
 */
size_t mmap_file(char* filename, char** content) {
    static char empty_content[1] = "";

    int fd = open(filename, O_RDONLY);
    /* Seeking to the end yields the file size; -1 covers a failed open as well. */
    off_t len = (fd < 0) ? -1 : lseek(fd, 0, SEEK_END);
    if (len < 0) {
        fprintf(stderr, "Error reading %s\n", filename);
        exit(1);
    }

    if (len == 0) {
        close(fd);
        *content = empty_content;
        return 0;
    }

    void* mapping = mmap(0, (size_t)len, PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapping == MAP_FAILED) {
        fprintf(stderr, "Error reading %s\n", filename);
        exit(1);
    }

    /* The mapping stays valid after the descriptor is closed. */
    close(fd);
    *content = mapping;
    return (size_t)len;
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
/*************************************
|
||||
* Parse command-line options
|
||||
************************************/
|
||||
char* config_file_name = NULL;
|
||||
int usr = 1;
|
||||
int os = 0;
|
||||
|
||||
struct option long_opts[] = {
|
||||
{"code", required_argument, 0, 'c'},
|
||||
{"code_init", required_argument, 0, 'i'},
|
||||
{"config", required_argument, 0, 'f'},
|
||||
{"n_measurements", required_argument, 0, 'n'},
|
||||
{"unroll_count", required_argument, 0, 'u'},
|
||||
{"loop_count", required_argument, 0, 'l'},
|
||||
{"warm_up_count", required_argument, 0, 'w'},
|
||||
{"initial_warm_up_count", required_argument, 0, 'a'},
|
||||
{"avg", no_argument, &aggregate_function, AVG_20_80},
|
||||
{"median", no_argument, &aggregate_function, MED},
|
||||
{"min", no_argument, &aggregate_function, MIN},
|
||||
{"basic_mode", no_argument, &basic_mode, 1},
|
||||
{"no_mem", no_argument, &no_mem, 1},
|
||||
{"verbose", no_argument, &verbose, 1},
|
||||
{"cpu", required_argument, 0, 'p'},
|
||||
{"usr", required_argument, 0, 'r'},
|
||||
{"os", required_argument, 0, 's'},
|
||||
{"help", no_argument, 0, 'h'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
||||
int option = 0;
|
||||
while ((option = getopt_long_only(argc, argv, "", long_opts, NULL)) != -1) {
|
||||
switch (option) {
|
||||
case 0:
|
||||
break;
|
||||
case 'c':
|
||||
code_length = mmap_file(optarg, &code);
|
||||
break;
|
||||
case 'i':
|
||||
code_init_length = mmap_file(optarg, &code_init);
|
||||
break;
|
||||
case 'f': ;
|
||||
config_file_name = optarg;
|
||||
break;
|
||||
case 'n':
|
||||
n_measurements = atol(optarg);
|
||||
break;
|
||||
case 'u':
|
||||
unroll_count = atol(optarg);
|
||||
if (unroll_count <= 0) {
|
||||
fprintf(stderr, "Error: unroll_count must be > 0\n");
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case 'l':
|
||||
loop_count = atol(optarg);
|
||||
break;
|
||||
case 'w':
|
||||
warm_up_count = atol(optarg);
|
||||
break;
|
||||
case 'a':
|
||||
initial_warm_up_count = atol(optarg);
|
||||
break;
|
||||
case 'p':
|
||||
cpu = atol(optarg);
|
||||
break;
|
||||
case 'r':
|
||||
usr = atoi(optarg);
|
||||
break;
|
||||
case 's':
|
||||
os = atoi(optarg);
|
||||
break;
|
||||
default:
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************
|
||||
* Check CPUID and parse config file
|
||||
************************************/
|
||||
if (check_cpuid()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (config_file_name) {
|
||||
char* config_mmap;
|
||||
size_t len = mmap_file(config_file_name, &config_mmap);
|
||||
pfc_config_file_content = calloc(len+1, sizeof(char));
|
||||
memcpy(pfc_config_file_content, config_mmap, len);
|
||||
parse_counter_configs();
|
||||
}
|
||||
|
||||
/*************************************
|
||||
* Pin thread to CPU
|
||||
************************************/
|
||||
if (cpu == -1) {
|
||||
cpu = sched_getcpu();
|
||||
}
|
||||
|
||||
cpu_set_t mask;
|
||||
CPU_ZERO(&mask);
|
||||
CPU_SET(cpu, &mask);
|
||||
|
||||
if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
|
||||
fprintf(stderr, "Error: Could not pin thread to core %d\n", cpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*************************************
|
||||
* Allocate memory
|
||||
************************************/
|
||||
size_t runtime_code_length = code_init_length + (unroll_count)*code_length*2 + 10000;
|
||||
posix_memalign((void**)&runtime_code, sysconf(_SC_PAGESIZE), runtime_code_length);
|
||||
if (!runtime_code) {
|
||||
fprintf(stderr, "Error: Failed to allocate memory for runtime_code\n");
|
||||
return 1;
|
||||
}
|
||||
if (mprotect(runtime_code, runtime_code_length, (PROT_READ | PROT_WRITE |PROT_EXEC))) {
|
||||
fprintf(stderr, "Error: mprotect failed\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
runtime_mem = malloc(2*1024*1024);
|
||||
if(!runtime_mem){
|
||||
fprintf(stderr, "Error: Could not allocate memory for runtime_mem\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
measurement_results[i] = malloc(n_measurements*sizeof(int64_t));
|
||||
measurement_results_base[i] = malloc(n_measurements*sizeof(int64_t));
|
||||
if(!measurement_results[i] || !measurement_results_base[i]){
|
||||
fprintf(stderr, "Error: Could not allocate memory for measurement_results\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************
|
||||
* Fixed-function counters
|
||||
************************************/
|
||||
long base_unroll_count = (basic_mode?0:unroll_count);
|
||||
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
|
||||
long base_loop_count = (basic_mode?0:loop_count);
|
||||
long main_loop_count = loop_count;
|
||||
|
||||
char buf[100];
|
||||
char* measurement_template;
|
||||
|
||||
if (is_AMD_CPU) {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_RDTSC_template_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_RDTSC_template;
|
||||
}
|
||||
} else {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_FF_template_Intel_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_FF_template_Intel;
|
||||
}
|
||||
}
|
||||
|
||||
run_warmup_experiment(measurement_template);
|
||||
|
||||
if (is_AMD_CPU) {
|
||||
run_experiment(measurement_template, measurement_results_base, 1, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, 1, main_unroll_count, main_loop_count);
|
||||
|
||||
if (verbose) {
|
||||
printf("\nRDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||
print_all_measurement_results(measurement_results_base, 1);
|
||||
printf("RDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||
print_all_measurement_results(measurement_results, 1);
|
||||
}
|
||||
|
||||
printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
||||
} else {
|
||||
configure_perf_ctrs_FF(usr, os);
|
||||
|
||||
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
|
||||
|
||||
if (verbose) {
|
||||
printf("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||
print_all_measurement_results(measurement_results_base, 4);
|
||||
printf("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||
print_all_measurement_results(measurement_results, 4);
|
||||
}
|
||||
|
||||
printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
||||
printf("%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1));
|
||||
printf("%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2));
|
||||
printf("%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3));
|
||||
}
|
||||
|
||||
/*************************************
|
||||
* Programmable counters
|
||||
************************************/
|
||||
if (is_AMD_CPU) {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_template_AMD_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_template_AMD;
|
||||
}
|
||||
} else {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_template_Intel_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_template_Intel;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i=0; i<n_pfc_configs; i+=n_programmable_counters) {
|
||||
size_t end = i + n_programmable_counters;
|
||||
if (end > n_pfc_configs) {
|
||||
end = n_pfc_configs;
|
||||
}
|
||||
|
||||
configure_perf_ctrs_programmable(i, end, usr, os);
|
||||
|
||||
run_experiment(measurement_template, measurement_results_base, n_programmable_counters, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, n_programmable_counters, main_unroll_count, main_loop_count);
|
||||
|
||||
if (verbose) {
|
||||
printf("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||
print_all_measurement_results(measurement_results_base, n_programmable_counters);
|
||||
printf("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||
print_all_measurement_results(measurement_results, n_programmable_counters);
|
||||
}
|
||||
|
||||
for (int c=0; c < n_programmable_counters && i + c < n_pfc_configs; c++) {
|
||||
if (!pfc_configs[i+c].invalid) printf("%s", compute_result_str(buf, sizeof(buf), pfc_configs[i+c].description, c));
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************
|
||||
* Cleanup
|
||||
************************************/
|
||||
free(runtime_code);
|
||||
free(runtime_mem);
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
free(measurement_results[i]);
|
||||
free(measurement_results_base[i]);
|
||||
}
|
||||
|
||||
if (pfc_config_file_content) {
|
||||
free(pfc_config_file_content);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Reference in New Issue
Block a user