Files
nanoBench/configs/cfg_Broadwell_all_core.txt
2021-12-08 22:01:44 +01:00

1002 lines
38 KiB
Plaintext

# Based on https://download.01.org/perfmon/BDW/broadwell_core_v26.json
# Applies to processors with family-model in {6-3D, 6-47}
# Cases when loads get true Block-on-Store blocking code preventing store forwarding
03.02 LD_BLOCKS.STORE_FORWARD
# This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.
03.08 LD_BLOCKS.NO_SR
# Speculative cache line split load uops dispatched to L1 cache
05.01 MISALIGN_MEM_REF.LOADS
# Speculative cache line split STA uops dispatched to L1 cache
05.02 MISALIGN_MEM_REF.STORES
# False dependencies in MOB due to partial compare
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
# Load misses in all DTLB levels that cause page walks
08.01 DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK
# Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (4K).
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
# Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (2M/4M).
08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
# Load miss in all TLB levels causes a page walk that completes. (1G)
08.08 DTLB_LOAD_MISSES.WALK_COMPLETED_1G
# Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size.
08.0E DTLB_LOAD_MISSES.WALK_COMPLETED
# Cycles when PMH is busy with page walks
08.10 DTLB_LOAD_MISSES.WALK_DURATION
# Load misses that miss the DTLB and hit the STLB (4K).
08.20 DTLB_LOAD_MISSES.STLB_HIT_4K
# Load misses that miss the DTLB and hit the STLB (2M).
08.40 DTLB_LOAD_MISSES.STLB_HIT_2M
# Load operations that miss the first DTLB level but hit the second and do not cause page walks.
08.60 DTLB_LOAD_MISSES.STLB_HIT
# Core cycles the allocator was stalled due to recovery from earlier clear event for this thread (e.g. misprediction or memory nuke)
0D.03.CMSK=1 INT_MISC.RECOVERY_CYCLES
# Core cycles the allocator was stalled due to recovery from earlier clear event for any thread running on the physical core (e.g. misprediction or memory nuke).
0D.03.CMSK=1.AnyT INT_MISC.RECOVERY_CYCLES_ANY
# Cycles when Resource Allocation Table (RAT) external stall is sent to Instruction Decode Queue (IDQ) for the thread
0D.08 INT_MISC.RAT_STALL_CYCLES
# Uops that Resource Allocation Table (RAT) issues to Reservation Station (RS)
0E.01 UOPS_ISSUED.ANY
# Cycles when Resource Allocation Table (RAT) does not issue Uops to Reservation Station (RS) for the thread
0E.01.CMSK=1.INV UOPS_ISSUED.STALL_CYCLES
# Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch.
0E.10 UOPS_ISSUED.FLAGS_MERGE
# Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not.
0E.20 UOPS_ISSUED.SLOW_LEA
# Number of Multiply packed/scalar single precision uops allocated.
0E.40 UOPS_ISSUED.SINGLE_MUL
# Cycles when divider is busy executing divide operations
14.01 ARITH.FPU_DIV_ACTIVE
# Demand Data Read miss L2, no rejects
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
# RFO requests that miss L2 cache.
24.22 L2_RQSTS.RFO_MISS
# L2 cache misses when fetching instructions.
24.24 L2_RQSTS.CODE_RD_MISS
# Demand requests that miss L2 cache.
24.27 L2_RQSTS.ALL_DEMAND_MISS
# L2 prefetch requests that miss L2 cache
24.30 L2_RQSTS.L2_PF_MISS
# All requests that miss L2 cache.
24.3F L2_RQSTS.MISS
# Demand Data Read requests that hit L2 cache
24.C1 L2_RQSTS.DEMAND_DATA_RD_HIT
# RFO requests that hit L2 cache.
24.C2 L2_RQSTS.RFO_HIT
# L2 cache hits when fetching instructions, code reads.
24.C4 L2_RQSTS.CODE_RD_HIT
# L2 prefetch requests that hit L2 cache
24.D0 L2_RQSTS.L2_PF_HIT
# Demand Data Read requests
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
# RFO requests to L2 cache
24.E2 L2_RQSTS.ALL_RFO
# L2 code requests
24.E4 L2_RQSTS.ALL_CODE_RD
# Demand requests to L2 cache.
24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
# Requests from L2 hardware prefetchers
24.F8 L2_RQSTS.ALL_PF
# All L2 requests.
24.FF L2_RQSTS.REFERENCES
# Not rejected writebacks that hit L2 cache
27.50 L2_DEMAND_RQSTS.WB_HIT
# Core-originated cacheable demand requests missed L3
2E.41 LONGEST_LAT_CACHE.MISS
# Core-originated cacheable demand requests that refer to L3
2E.4F LONGEST_LAT_CACHE.REFERENCE
# Thread cycles when thread is not in halt state
3C.00 CPU_CLK_UNHALTED.THREAD_P
# Core cycles when at least one thread on the physical core is not in halt state.
3C.00.AnyT CPU_CLK_UNHALTED.THREAD_P_ANY
# Reference cycles when the thread is unhalted (counts at 100 MHz rate)
3C.01 CPU_CLK_THREAD_UNHALTED.REF_XCLK
# Reference cycles when the thread is unhalted (counts at 100 MHz rate)
3C.01 CPU_CLK_UNHALTED.REF_XCLK
# Reference cycles when at least one thread on the physical core is unhalted (counts at 100 MHz rate).
3C.01.AnyT CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY
# Reference cycles when at least one thread on the physical core is unhalted (counts at 100 MHz rate).
3C.01.AnyT CPU_CLK_UNHALTED.REF_XCLK_ANY
# Count XClk pulses when this thread is unhalted and the other thread is halted.
3C.02 CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE
# Count XClk pulses when this thread is unhalted and the other thread is halted.
3C.02 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
# Cycles with L1D load Misses outstanding from any thread on physical core.
48.01.CMSK=1.AnyT.CTR=2 L1D_PEND_MISS.PENDING_CYCLES_ANY
# Cycles with L1D load Misses outstanding.
48.01.CMSK=1.CTR=2 L1D_PEND_MISS.PENDING_CYCLES
# L1D miss outstanding duration in cycles
48.01.CTR=2 L1D_PEND_MISS.PENDING
# Cycles a demand request was blocked due to Fill Buffers unavailability.
48.02.CMSK=1 L1D_PEND_MISS.FB_FULL
# Store misses in all DTLB levels that cause page walks
49.01 DTLB_STORE_MISSES.MISS_CAUSES_A_WALK
# Store miss in all TLB levels causes a page walk that completes. (4K)
49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
# Store misses in all DTLB levels that cause completed page walks (2M/4M)
49.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
# Store misses in all DTLB levels that cause completed page walks (1G)
49.08 DTLB_STORE_MISSES.WALK_COMPLETED_1G
# Store misses in all DTLB levels that cause completed page walks.
49.0E DTLB_STORE_MISSES.WALK_COMPLETED
# Cycles when PMH is busy with page walks
49.10 DTLB_STORE_MISSES.WALK_DURATION
# Store misses that miss the DTLB and hit the STLB (4K).
49.20 DTLB_STORE_MISSES.STLB_HIT_4K
# Store misses that miss the DTLB and hit the STLB (2M).
49.40 DTLB_STORE_MISSES.STLB_HIT_2M
# Store operations that miss the first TLB level but hit the second and do not cause page walks.
49.60 DTLB_STORE_MISSES.STLB_HIT
# Not software-prefetch load dispatches that hit FB allocated for software prefetch
4C.01 LOAD_HIT_PRE.SW_PF
# Not software-prefetch load dispatches that hit FB allocated for hardware prefetch
4C.02 LOAD_HIT_PRE.HW_PF
# Cycle count for an Extended Page table walk.
4F.10 EPT.WALK_CYCLES
# L1D data line replacements
51.01 L1D.REPLACEMENT
# Number of times a TSX line had a cache conflict
54.01 TX_MEM.ABORT_CONFLICT
# Number of times a TSX Abort was triggered due to an evicted line caused by a transaction overflow
54.02 TX_MEM.ABORT_CAPACITY_WRITE
# Number of times a TSX Abort was triggered due to a non-release/commit store to lock
54.04 TX_MEM.ABORT_HLE_STORE_TO_ELIDED_LOCK
# Number of times a TSX Abort was triggered due to commit but Lock Buffer not empty
54.08 TX_MEM.ABORT_HLE_ELISION_BUFFER_NOT_EMPTY
# Number of times a TSX Abort was triggered due to release/commit but data and address mismatch
54.10 TX_MEM.ABORT_HLE_ELISION_BUFFER_MISMATCH
# Number of times a TSX Abort was triggered due to attempting an unsupported alignment from Lock Buffer
54.20 TX_MEM.ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT
# Number of times we could not allocate Lock Buffer
54.40 TX_MEM.HLE_ELISION_BUFFER_FULL
# Number of integer Move Elimination candidate uops that were eliminated.
58.01 MOVE_ELIMINATION.INT_ELIMINATED
# Number of SIMD Move Elimination candidate uops that were eliminated.
58.02 MOVE_ELIMINATION.SIMD_ELIMINATED
# Number of integer Move Elimination candidate uops that were not eliminated.
58.04 MOVE_ELIMINATION.INT_NOT_ELIMINATED
# Number of SIMD Move Elimination candidate uops that were not eliminated.
58.08 MOVE_ELIMINATION.SIMD_NOT_ELIMINATED
# Unhalted core cycles when the thread is in ring 0
5C.01 CPL_CYCLES.RING0
# Number of intervals between processor halts while thread is in ring 0
5C.01.CMSK=1.EDG CPL_CYCLES.RING0_TRANS
# Unhalted core cycles when thread is in rings 1, 2, or 3
5C.02 CPL_CYCLES.RING123
# Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution, it may not always cause a transactional abort.
5D.01 TX_EXEC.MISC1
# Counts the number of times a class of instructions (e.g., vzeroupper) that may cause a transactional abort was executed inside a transactional region
5D.02 TX_EXEC.MISC2
# Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded
5D.04 TX_EXEC.MISC3
# Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region.
5D.08 TX_EXEC.MISC4
# Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region.
5D.10 TX_EXEC.MISC5
# Cycles when Reservation Station (RS) is empty for the thread
5E.01 RS_EVENTS.EMPTY_CYCLES
# Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues.
5E.01.CMSK=1.EDG.INV RS_EVENTS.EMPTY_END
# Offcore outstanding Demand Data Read transactions in uncore queue.
60.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
# Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore
60.01.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD
# Cycles with at least 6 offcore outstanding Demand Data Read transactions in uncore queue.
60.01.CMSK=6 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6
# Offcore outstanding code reads transactions in SuperQueue (SQ), queue to uncore, every cycle
60.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
# Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore
60.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
# Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle
60.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
# Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
# Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore
60.08.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
# Cycles when L1 and L2 are locked due to UC or split lock
63.01 LOCK_CYCLES.SPLIT_LOCK_UC_LOCK_DURATION
# Cycles when L1D is locked
63.02 LOCK_CYCLES.CACHE_LOCK_DURATION
# Instruction Decode Queue (IDQ) empty cycles
79.02 IDQ.EMPTY
# Uops delivered to Instruction Decode Queue (IDQ) from MITE path
79.04 IDQ.MITE_UOPS
# Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path
79.04.CMSK=1 IDQ.MITE_CYCLES
# Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path
79.08 IDQ.DSB_UOPS
# Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path
79.08.CMSK=1 IDQ.DSB_CYCLES
# Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.10 IDQ.MS_DSB_UOPS
# Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.10.CMSK=1 IDQ.MS_DSB_CYCLES
# Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequencer (MS) is busy
79.10.CMSK=1.EDG IDQ.MS_DSB_OCCUR
# Cycles Decode Stream Buffer (DSB) is delivering any Uop
79.18.CMSK=1 IDQ.ALL_DSB_CYCLES_ANY_UOPS
# Cycles Decode Stream Buffer (DSB) is delivering 4 Uops
79.18.CMSK=4 IDQ.ALL_DSB_CYCLES_4_UOPS
# Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.20 IDQ.MS_MITE_UOPS
# Cycles MITE is delivering any Uop
79.24.CMSK=1 IDQ.ALL_MITE_CYCLES_ANY_UOPS
# Cycles MITE is delivering 4 Uops
79.24.CMSK=4 IDQ.ALL_MITE_CYCLES_4_UOPS
# Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.30 IDQ.MS_UOPS
# Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.30.CMSK=1 IDQ.MS_CYCLES
# Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.
79.30.CMSK=1.EDG IDQ.MS_SWITCHES
# Uops delivered to Instruction Decode Queue (IDQ) from MITE path
79.3C IDQ.MITE_ALL_UOPS
# Number of Instruction Cache, Streaming Buffer and Victim Cache Reads, both cacheable and noncacheable, including UC fetches
80.01 ICACHE.HIT
# Number of Instruction Cache, Streaming Buffer and Victim Cache Misses. Includes Uncacheable accesses.
80.02 ICACHE.MISSES
# Cycles where a code fetch is stalled due to L1 instruction-cache miss.
80.04 ICACHE.IFDATA_STALL
# Misses at all ITLB levels that cause page walks
85.01 ITLB_MISSES.MISS_CAUSES_A_WALK
# Code miss in all TLB levels causes a page walk that completes. (4K)
85.02 ITLB_MISSES.WALK_COMPLETED_4K
# Code miss in all TLB levels causes a page walk that completes. (2M/4M)
85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
# Code miss in all TLB levels causes a page walk that completes. (1G)
85.08 ITLB_MISSES.WALK_COMPLETED_1G
# Misses in all ITLB levels that cause completed page walks.
85.0E ITLB_MISSES.WALK_COMPLETED
# Cycles when PMH is busy with page walks
85.10 ITLB_MISSES.WALK_DURATION
# Code misses that miss the ITLB and hit the STLB (4K).
85.20 ITLB_MISSES.STLB_HIT_4K
# Code misses that miss the ITLB and hit the STLB (2M).
85.40 ITLB_MISSES.STLB_HIT_2M
# Operations that miss the first ITLB level but hit the second and do not cause any page walks.
85.60 ITLB_MISSES.STLB_HIT
# Stalls caused by changing prefix length of the instruction.
87.01 ILD_STALL.LCP
# Not taken macro-conditional branches
88.41 BR_INST_EXEC.NONTAKEN_CONDITIONAL
# Taken speculative and retired macro-conditional branches
88.81 BR_INST_EXEC.TAKEN_CONDITIONAL
# Taken speculative and retired macro-unconditional branch instructions excluding calls and indirects
88.82 BR_INST_EXEC.TAKEN_DIRECT_JUMP
# Taken speculative and retired indirect branches excluding calls and returns
88.84 BR_INST_EXEC.TAKEN_INDIRECT_JUMP_NON_CALL_RET
# Taken speculative and retired indirect branches with return mnemonic
88.88 BR_INST_EXEC.TAKEN_INDIRECT_NEAR_RETURN
# Taken speculative and retired direct near calls
88.90 BR_INST_EXEC.TAKEN_DIRECT_NEAR_CALL
# Taken speculative and retired indirect calls
88.A0 BR_INST_EXEC.TAKEN_INDIRECT_NEAR_CALL
# Speculative and retired macro-conditional branches
88.C1 BR_INST_EXEC.ALL_CONDITIONAL
# Speculative and retired macro-unconditional branches excluding calls and indirects
88.C2 BR_INST_EXEC.ALL_DIRECT_JMP
# Speculative and retired indirect branches excluding calls and returns
88.C4 BR_INST_EXEC.ALL_INDIRECT_JUMP_NON_CALL_RET
# Speculative and retired indirect return branches.
88.C8 BR_INST_EXEC.ALL_INDIRECT_NEAR_RETURN
# Speculative and retired direct near calls
88.D0 BR_INST_EXEC.ALL_DIRECT_NEAR_CALL
# Speculative and retired branches
88.FF BR_INST_EXEC.ALL_BRANCHES
# Not taken speculative and retired mispredicted macro conditional branches
89.41 BR_MISP_EXEC.NONTAKEN_CONDITIONAL
# Taken speculative and retired mispredicted macro conditional branches
89.81 BR_MISP_EXEC.TAKEN_CONDITIONAL
# Taken speculative and retired mispredicted indirect branches excluding calls and returns
89.84 BR_MISP_EXEC.TAKEN_INDIRECT_JUMP_NON_CALL_RET
# Taken speculative and retired mispredicted indirect branches with return mnemonic
89.88 BR_MISP_EXEC.TAKEN_RETURN_NEAR
# Taken speculative and retired mispredicted indirect calls.
89.A0 BR_MISP_EXEC.TAKEN_INDIRECT_NEAR_CALL
# Speculative and retired mispredicted macro conditional branches
89.C1 BR_MISP_EXEC.ALL_CONDITIONAL
# Mispredicted indirect branches excluding calls and returns
89.C4 BR_MISP_EXEC.ALL_INDIRECT_JUMP_NON_CALL_RET
# Speculative and retired mispredicted branches
89.FF BR_MISP_EXEC.ALL_BRANCHES
# Uops not delivered to Resource Allocation Table (RAT) per thread when backend of the machine is not stalled
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
# Cycles with less than 3 uops delivered by the front end.
9C.01.CMSK=1 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_3_UOP_DELIV.CORE
# Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.
9C.01.CMSK=1.INV IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK
# Cycles with less than 2 uops delivered by the front end.
9C.01.CMSK=2 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_2_UOP_DELIV.CORE
# Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled
9C.01.CMSK=3 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_1_UOP_DELIV.CORE
# Cycles per thread when 4 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled
9C.01.CMSK=4 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE
# Micro-op dispatches cancelled due to insufficient SIMD physical register file read ports
A0.03 UOP_DISPATCHES_CANCELLED.SIMD_PRF
# Cycles per thread when uops are executed in port 0
A1.01 UOPS_DISPATCHED_PORT.PORT_0
# Cycles per thread when uops are executed in port 0
A1.01 UOPS_EXECUTED_PORT.PORT_0
# Cycles per core when uops are executed in port 0.
A1.01.AnyT UOPS_EXECUTED_PORT.PORT_0_CORE
# Cycles per thread when uops are executed in port 1
A1.02 UOPS_DISPATCHED_PORT.PORT_1
# Cycles per thread when uops are executed in port 1
A1.02 UOPS_EXECUTED_PORT.PORT_1
# Cycles per core when uops are executed in port 1.
A1.02.AnyT UOPS_EXECUTED_PORT.PORT_1_CORE
# Cycles per thread when uops are executed in port 2
A1.04 UOPS_DISPATCHED_PORT.PORT_2
# Cycles per thread when uops are executed in port 2
A1.04 UOPS_EXECUTED_PORT.PORT_2
# Cycles per core when uops are dispatched to port 2.
A1.04.AnyT UOPS_EXECUTED_PORT.PORT_2_CORE
# Cycles per thread when uops are executed in port 3
A1.08 UOPS_DISPATCHED_PORT.PORT_3
# Cycles per thread when uops are executed in port 3
A1.08 UOPS_EXECUTED_PORT.PORT_3
# Cycles per core when uops are dispatched to port 3.
A1.08.AnyT UOPS_EXECUTED_PORT.PORT_3_CORE
# Cycles per thread when uops are executed in port 4
A1.10 UOPS_DISPATCHED_PORT.PORT_4
# Cycles per thread when uops are executed in port 4
A1.10 UOPS_EXECUTED_PORT.PORT_4
# Cycles per core when uops are executed in port 4.
A1.10.AnyT UOPS_EXECUTED_PORT.PORT_4_CORE
# Cycles per thread when uops are executed in port 5
A1.20 UOPS_DISPATCHED_PORT.PORT_5
# Cycles per thread when uops are executed in port 5
A1.20 UOPS_EXECUTED_PORT.PORT_5
# Cycles per core when uops are executed in port 5.
A1.20.AnyT UOPS_EXECUTED_PORT.PORT_5_CORE
# Cycles per thread when uops are executed in port 6
A1.40 UOPS_DISPATCHED_PORT.PORT_6
# Cycles per thread when uops are executed in port 6
A1.40 UOPS_EXECUTED_PORT.PORT_6
# Cycles per core when uops are executed in port 6.
A1.40.AnyT UOPS_EXECUTED_PORT.PORT_6_CORE
# Cycles per thread when uops are executed in port 7
A1.80 UOPS_DISPATCHED_PORT.PORT_7
# Cycles per thread when uops are executed in port 7
A1.80 UOPS_EXECUTED_PORT.PORT_7
# Cycles per core when uops are dispatched to port 7.
A1.80.AnyT UOPS_EXECUTED_PORT.PORT_7_CORE
# Resource-related stall cycles
A2.01 RESOURCE_STALLS.ANY
# Cycles stalled due to no eligible RS entry available.
A2.04 RESOURCE_STALLS.RS
# Cycles stalled due to no store buffers available. (not including draining from sync).
A2.08 RESOURCE_STALLS.SB
# Cycles stalled due to re-order buffer full.
A2.10 RESOURCE_STALLS.ROB
# Cycles while L2 cache miss demand load is outstanding.
A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_MISS
# Cycles while L2 cache miss demand load is outstanding.
A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_PENDING
# Cycles while memory subsystem has an outstanding load.
A3.02.CMSK=2 CYCLE_ACTIVITY.CYCLES_LDM_PENDING
# Cycles while memory subsystem has an outstanding load.
A3.02.CMSK=2 CYCLE_ACTIVITY.CYCLES_MEM_ANY
# This event increments by 1 for every cycle where there was no execute for this thread.
A3.04.CMSK=4 CYCLE_ACTIVITY.CYCLES_NO_EXECUTE
# Total execution stalls.
A3.04.CMSK=4 CYCLE_ACTIVITY.STALLS_TOTAL
# Execution stalls while L2 cache miss demand load is outstanding.
A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_MISS
# Execution stalls while L2 cache miss demand load is outstanding.
A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_PENDING
# Execution stalls while memory subsystem has an outstanding load.
A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_LDM_PENDING
# Execution stalls while memory subsystem has an outstanding load.
A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_MEM_ANY
# Cycles while L1 cache miss demand load is outstanding.
A3.08.CMSK=8.CTR=2 CYCLE_ACTIVITY.CYCLES_L1D_MISS
# Cycles while L1 cache miss demand load is outstanding.
A3.08.CMSK=8.CTR=2 CYCLE_ACTIVITY.CYCLES_L1D_PENDING
# Execution stalls while L1 cache miss demand load is outstanding.
A3.0C.CMSK=12.CTR=2 CYCLE_ACTIVITY.STALLS_L1D_MISS
# Execution stalls while L1 cache miss demand load is outstanding.
A3.0C.CMSK=12.CTR=2 CYCLE_ACTIVITY.STALLS_L1D_PENDING
# Number of Uops delivered by the LSD.
A8.01 LSD.UOPS
# Cycles Uops delivered by the LSD, but didn't come from the decoder.
A8.01.CMSK=1 LSD.CYCLES_ACTIVE
# Cycles 4 Uops delivered by the LSD, but didn't come from the decoder.
A8.01.CMSK=4 LSD.CYCLES_4_UOPS
# Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles.
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
# Flushing of the Instruction TLB (ITLB) pages, includes 4k/2M/4M pages.
AE.01 ITLB.ITLB_FLUSH
# Demand Data Read requests sent to uncore
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
# Cacheable and noncacheable code read requests
B0.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
# Demand RFO requests including regular RFOs, locks, ItoM
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
# Demand and prefetch data reads
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
# Any memory transaction that reached the SQ.
B0.80 OFFCORE_REQUESTS.ALL_REQUESTS
# Counts the number of uops to be executed per-thread each cycle.
B1.01 UOPS_EXECUTED.THREAD
# Cycles where at least 1 uop was executed per-thread.
B1.01.CMSK=1 UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC
# Counts number of cycles no uops were dispatched to be executed on this thread.
B1.01.CMSK=1.INV UOPS_EXECUTED.STALL_CYCLES
# Cycles where at least 2 uops were executed per-thread.
B1.01.CMSK=2 UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC
# Cycles where at least 3 uops were executed per-thread.
B1.01.CMSK=3 UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC
# Cycles where at least 4 uops were executed per-thread.
B1.01.CMSK=4 UOPS_EXECUTED.CYCLES_GE_4_UOPS_EXEC
# Number of uops executed on the core.
B1.02 UOPS_EXECUTED.CORE
# Cycles at least 1 micro-op is executed from any thread on physical core.
B1.02.CMSK=1 UOPS_EXECUTED.CORE_CYCLES_GE_1
# Cycles at least 2 micro-ops are executed from any thread on physical core.
B1.02.CMSK=2 UOPS_EXECUTED.CORE_CYCLES_GE_2
# Cycles at least 3 micro-ops are executed from any thread on physical core.
B1.02.CMSK=3 UOPS_EXECUTED.CORE_CYCLES_GE_3
# Cycles at least 4 micro-ops are executed from any thread on physical core.
B1.02.CMSK=4 UOPS_EXECUTED.CORE_CYCLES_GE_4
# Cycles with no micro-ops executed from any thread on physical core.
B1.02.INV UOPS_EXECUTED.CORE_CYCLES_NONE
# Offcore requests buffer cannot take more entries for this thread core.
B2.01 OFFCORE_REQUESTS_BUFFER.SQ_FULL
# Number of DTLB page walker hits in the L1+FB.
BC.11 PAGE_WALKER_LOADS.DTLB_L1
# Number of DTLB page walker hits in the L2.
BC.12 PAGE_WALKER_LOADS.DTLB_L2
# Number of DTLB page walker hits in the L3 + XSNP.
BC.14 PAGE_WALKER_LOADS.DTLB_L3
# Number of DTLB page walker hits in Memory.
BC.18 PAGE_WALKER_LOADS.DTLB_MEMORY
# Number of ITLB page walker hits in the L1+FB.
BC.21 PAGE_WALKER_LOADS.ITLB_L1
# Number of ITLB page walker hits in the L2.
BC.22 PAGE_WALKER_LOADS.ITLB_L2
# Number of ITLB page walker hits in the L3 + XSNP.
BC.24 PAGE_WALKER_LOADS.ITLB_L3
# DTLB flush attempts of the thread-specific entries
BD.01 TLB_FLUSH.DTLB_THREAD
# STLB flush attempts
BD.20 TLB_FLUSH.STLB_ANY
# Number of instructions retired. General Counter - architectural event
C0.00 INST_RETIRED.ANY_P
# Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution
C0.01.CTR=1 INST_RETIRED.PREC_DIST
# FP operations retired. X87 FP operations that have no exceptions:
C0.02 INST_RETIRED.X87
# Number of transitions from AVX-256 to legacy SSE when penalty applicable.
C1.08 OTHER_ASSISTS.AVX_TO_SSE
# Number of transitions from SSE to AVX-256 when penalty applicable.
C1.10 OTHER_ASSISTS.SSE_TO_AVX
# Number of times any microcode assist is invoked by HW upon uop writeback.
C1.40 OTHER_ASSISTS.ANY_WB_ASSIST
# Actually retired uops.
C2.01 UOPS_RETIRED.ALL
# Cycles without actually retired uops.
C2.01.CMSK=1.INV UOPS_RETIRED.STALL_CYCLES
# Cycles with less than 10 actually retired uops.
C2.01.CMSK=10.INV UOPS_RETIRED.TOTAL_CYCLES
# Retirement slots used.
C2.02 UOPS_RETIRED.RETIRE_SLOTS
# Cycles there was a Nuke. Account for both thread-specific and All Thread Nukes.
C3.01 MACHINE_CLEARS.CYCLES
# Number of machine clears (nukes) of any type.
C3.01.CMSK=1.EDG MACHINE_CLEARS.COUNT
# Counts the number of machine clears due to memory order conflicts.
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
# Self-modifying code (SMC) detected.
C3.04 MACHINE_CLEARS.SMC
# This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0.
C3.20 MACHINE_CLEARS.MASKMOV
# All (macro) branch instructions retired.
C4.00 BR_INST_RETIRED.ALL_BRANCHES
# Conditional branch instructions retired.
C4.01 BR_INST_RETIRED.CONDITIONAL
# Direct and indirect near call instructions retired.
C4.02 BR_INST_RETIRED.NEAR_CALL
# Direct and indirect macro near call instructions retired (captured in ring 3).
C4.02 BR_INST_RETIRED.NEAR_CALL_R3
# All (macro) branch instructions retired. (Precise Event - PEBS)
C4.04 BR_INST_RETIRED.ALL_BRANCHES_PEBS
# Return instructions retired.
C4.08 BR_INST_RETIRED.NEAR_RETURN
# Not taken branch instructions retired.
C4.10 BR_INST_RETIRED.NOT_TAKEN
# Taken branch instructions retired.
C4.20 BR_INST_RETIRED.NEAR_TAKEN
# Far branch instructions retired.
C4.40 BR_INST_RETIRED.FAR_BRANCH
# All mispredicted macro branch instructions retired.
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
# Mispredicted conditional branch instructions retired.
C5.01 BR_MISP_RETIRED.CONDITIONAL
# Mispredicted macro branch instructions retired. (Precise Event - PEBS)
C5.04 BR_MISP_RETIRED.ALL_BRANCHES_PEBS
# This event counts the number of mispredicted ret instructions retired. Non PEBS
C5.08 BR_MISP_RETIRED.RET
# Number of near branch instructions retired that were mispredicted and taken.
C5.20 BR_MISP_RETIRED.NEAR_TAKEN
# Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE
# Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE
# Number of SSE/AVX computational scalar floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computation operation. Applies to SSE* and AVX* scalar double and single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.03 FP_ARITH_INST_RETIRED.SCALAR
# Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE
# Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 4 calculations per element.
C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE
# Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 4 calculations per element.
C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE
# Number of SSE/AVX computational double precision floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.15 FP_ARITH_INST_RETIRED.DOUBLE
# Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 8 calculations per element.
C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE
# Number of SSE/AVX computational single precision floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar and packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.2A FP_ARITH_INST_RETIRED.SINGLE
# Number of SSE/AVX computational packed floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* packed double and single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.3C FP_ARITH_INST_RETIRED.PACKED
# Number of times we entered an HLE region; does not count nested transactions
C8.01 HLE_RETIRED.START
# Number of times HLE commit succeeded
C8.02 HLE_RETIRED.COMMIT
# Number of times HLE abort was triggered
C8.04 HLE_RETIRED.ABORTED
# Number of times an HLE execution aborted due to various memory events (e.g., read/write capacity and conflicts).
C8.08 HLE_RETIRED.ABORTED_MISC1
# Number of times an HLE execution aborted due to uncommon conditions
C8.10 HLE_RETIRED.ABORTED_MISC2
# Number of times an HLE execution aborted due to HLE-unfriendly instructions
C8.20 HLE_RETIRED.ABORTED_MISC3
# Number of times an HLE execution aborted due to incompatible memory type
C8.40 HLE_RETIRED.ABORTED_MISC4
# Number of times an HLE execution aborted due to none of the previous 4 categories (e.g. interrupts)
C8.80 HLE_RETIRED.ABORTED_MISC5
# Number of times we entered an RTM region; does not count nested transactions
C9.01 RTM_RETIRED.START
# Number of times RTM commit succeeded
C9.02 RTM_RETIRED.COMMIT
# Number of times RTM abort was triggered
C9.04 RTM_RETIRED.ABORTED
# Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts)
C9.08 RTM_RETIRED.ABORTED_MISC1
# Number of times an RTM execution aborted due to uncommon conditions
C9.10 RTM_RETIRED.ABORTED_MISC2
# Number of times an RTM execution aborted due to HLE-unfriendly instructions
C9.20 RTM_RETIRED.ABORTED_MISC3
# Number of times an RTM execution aborted due to incompatible memory type
C9.40 RTM_RETIRED.ABORTED_MISC4
# Number of times an RTM execution aborted due to none of the previous 4 categories (e.g. interrupt)
C9.80 RTM_RETIRED.ABORTED_MISC5
# Number of X87 assists due to output value.
CA.02 FP_ASSIST.X87_OUTPUT
# Number of X87 assists due to input value.
CA.04 FP_ASSIST.X87_INPUT
# Number of SIMD FP assists due to output values
CA.08 FP_ASSIST.SIMD_OUTPUT
# Number of SIMD FP assists due to input values
CA.10 FP_ASSIST.SIMD_INPUT
# Cycles with any input/output SSE or FP assist
CA.1E.CMSK=1 FP_ASSIST.ANY
# Count cases of saving new LBR
CC.20 ROB_MISC_EVENTS.LBR_INSERTS
# Randomly selected loads with latency value being above 16
CD.01.MSR_3F6H=0x10.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16
# Randomly selected loads with latency value being above 256
CD.01.MSR_3F6H=0x100.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256
# Randomly selected loads with latency value being above 32
CD.01.MSR_3F6H=0x20.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32
# Randomly selected loads with latency value being above 512
CD.01.MSR_3F6H=0x200.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512
# Randomly selected loads with latency value being above 4
CD.01.MSR_3F6H=0x4.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4
# Randomly selected loads with latency value being above 64
CD.01.MSR_3F6H=0x40.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64
# Randomly selected loads with latency value being above 8
CD.01.MSR_3F6H=0x8.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8
# Randomly selected loads with latency value being above 128
CD.01.MSR_3F6H=0x80.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128
# Retired load uops that miss the STLB.
D0.11 MEM_UOPS_RETIRED.STLB_MISS_LOADS
# Retired store uops that miss the STLB.
D0.12 MEM_UOPS_RETIRED.STLB_MISS_STORES
# Retired load uops with locked access.
D0.21 MEM_UOPS_RETIRED.LOCK_LOADS
# Retired load uops that split across a cacheline boundary.
D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS
# Retired store uops that split across a cacheline boundary.
D0.42 MEM_UOPS_RETIRED.SPLIT_STORES
# All retired load uops.
D0.81 MEM_UOPS_RETIRED.ALL_LOADS
# All retired store uops.
D0.82 MEM_UOPS_RETIRED.ALL_STORES
# Retired load uops with L1 cache hits as data sources.
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
# Retired load uops with L2 cache hits as data sources.
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
# Retired load uops which data sources were data hits in L3 without snoops required.
D1.04 MEM_LOAD_UOPS_RETIRED.L3_HIT
# Retired load uops that missed the L1 cache as data source.
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
# Miss in mid-level (L2) cache. Excludes Unknown data-source.
D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS
# Miss in last-level (L3) cache. Excludes Unknown data-source.
D1.20 MEM_LOAD_UOPS_RETIRED.L3_MISS
# Retired load uops that missed L1 but hit a line fill buffer (LFB) due to a preceding miss to the same cache line whose data was not yet ready.
D1.40 MEM_LOAD_UOPS_RETIRED.HIT_LFB
# Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache.
D2.01 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS
# Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache.
D2.02 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT
# Retired load uops which data sources were HitM responses from shared L3.
D2.04 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM
# Retired load uops which data sources were hits in L3 without snoops required.
D2.08 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_NONE
# Data from local DRAM either Snoop not needed or Snoop Miss (RspI)
D3.01 MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM
# Counts the total number when the front end is resteered, mainly when the BPU cannot provide a correct prediction and this is corrected by other branch handling mechanisms at the front end.
E6.1F BACLEARS.ANY
# Demand Data Read requests that access L2 cache
F0.01 L2_TRANS.DEMAND_DATA_RD
# RFO requests that access L2 cache
F0.02 L2_TRANS.RFO
# L2 cache accesses when fetching instructions
F0.04 L2_TRANS.CODE_RD
# L2 or L3 HW prefetches that access L2 cache
F0.08 L2_TRANS.ALL_PF
# L1D writebacks that access L2 cache
F0.10 L2_TRANS.L1D_WB
# L2 fill requests that access L2 cache
F0.20 L2_TRANS.L2_FILL
# L2 writebacks that access L2 cache
F0.40 L2_TRANS.L2_WB
# Transactions accessing L2 pipe
F0.80 L2_TRANS.ALL_REQUESTS
# L2 cache lines in I state filling L2
F1.01 L2_LINES_IN.I
# L2 cache lines in S state filling L2
F1.02 L2_LINES_IN.S
# L2 cache lines in E state filling L2
F1.04 L2_LINES_IN.E
# L2 cache lines filling L2
F1.07 L2_LINES_IN.ALL
# Clean L2 cache lines evicted by demand.
F2.05 L2_LINES_OUT.DEMAND_CLEAN
# Split locks in SQ
F4.10 SQ_MISC.SPLIT_LOCK