mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-07-21 15:11:03 +02:00
930 lines
36 KiB
Plaintext
930 lines
36 KiB
Plaintext
# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/EMR/events/emeraldrapids_core.json (Version: 1.13)
|
|
# Applies to processors with family-model in {6-CF}
|
|
|
|
# False dependencies in MOB due to partial compare on address.
|
|
03.04 LD_BLOCKS.ADDRESS_ALIAS
|
|
|
|
# Loads blocked due to overlapping with a preceding store that cannot be forwarded.
|
|
03.82 LD_BLOCKS.STORE_FORWARD
|
|
|
|
# The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.
|
|
03.88 LD_BLOCKS.NO_SR
|
|
|
|
# Code miss in all TLB levels causes a page walk that completes. (4K)
|
|
11.02 ITLB_MISSES.WALK_COMPLETED_4K
|
|
|
|
# Code miss in all TLB levels causes a page walk that completes. (2M/4M)
|
|
11.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
|
|
|
|
# Code miss in all TLB levels causes a page walk that completes. (All page sizes)
|
|
11.0E ITLB_MISSES.WALK_COMPLETED
|
|
|
|
# Number of page walks outstanding for an outstanding code request in the PMH each cycle.
|
|
11.10 ITLB_MISSES.WALK_PENDING
|
|
|
|
# Cycles when at least one PMH is busy with a page walk for code (instruction fetch) request.
|
|
11.10.CMSK=1 ITLB_MISSES.WALK_ACTIVE
|
|
|
|
# Instruction fetch requests that miss the ITLB and hit the STLB.
|
|
11.20 ITLB_MISSES.STLB_HIT
|
|
|
|
# Page walks completed due to a demand data load to a 4K page.
|
|
12.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
|
|
|
|
# Page walks completed due to a demand data load to a 2M/4M page.
|
|
12.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
|
|
|
|
# Page walks completed due to a demand data load to a 1G page.
|
|
12.08 DTLB_LOAD_MISSES.WALK_COMPLETED_1G
|
|
|
|
# Load miss in all TLB levels causes a page walk that completes. (All page sizes)
|
|
12.0E DTLB_LOAD_MISSES.WALK_COMPLETED
|
|
|
|
# Number of page walks outstanding for a demand load in the PMH each cycle.
|
|
12.10 DTLB_LOAD_MISSES.WALK_PENDING
|
|
|
|
# Cycles when at least one PMH is busy with a page walk for a demand load.
|
|
12.10.CMSK=1 DTLB_LOAD_MISSES.WALK_ACTIVE
|
|
|
|
# Loads that miss the DTLB and hit the STLB.
|
|
12.20 DTLB_LOAD_MISSES.STLB_HIT
|
|
|
|
# Page walks completed due to a demand data store to a 4K page.
|
|
13.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
|
|
|
|
# Page walks completed due to a demand data store to a 2M/4M page.
|
|
13.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
|
|
|
|
# Page walks completed due to a demand data store to a 1G page.
|
|
13.08 DTLB_STORE_MISSES.WALK_COMPLETED_1G
|
|
|
|
# Store misses in all TLB levels causes a page walk that completes. (All page sizes)
|
|
13.0E DTLB_STORE_MISSES.WALK_COMPLETED
|
|
|
|
# Number of page walks outstanding for a store in the PMH each cycle.
|
|
13.10 DTLB_STORE_MISSES.WALK_PENDING
|
|
|
|
# Cycles when at least one PMH is busy with a page walk for a store.
|
|
13.10.CMSK=1 DTLB_STORE_MISSES.WALK_ACTIVE
|
|
|
|
# Stores that miss the DTLB and hit the STLB.
|
|
13.20 DTLB_STORE_MISSES.STLB_HIT
|
|
|
|
# For every cycle, increments by the number of outstanding demand data read requests pending.
|
|
20.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
|
|
|
|
# Cycles where at least 1 outstanding demand data read request is pending.
|
|
20.01.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD
|
|
|
|
# Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle.
|
|
20.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
|
|
|
|
# Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore.
|
|
20.02.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD
|
|
|
|
# OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
|
|
20.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
|
|
|
|
# OFFCORE_REQUESTS_OUTSTANDING.DATA_RD
|
|
20.08 OFFCORE_REQUESTS_OUTSTANDING.DATA_RD
|
|
|
|
# OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
|
|
20.08.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
|
|
|
|
# For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache.
|
|
20.10 OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD
|
|
|
|
# Demand Data Read requests sent to uncore
|
|
21.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
|
|
|
|
# Cacheable and noncacheable code read requests
|
|
21.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
|
|
|
|
# Demand RFO requests including regular RFOs, locks, ItoM
|
|
21.04 OFFCORE_REQUESTS.DEMAND_RFO
|
|
|
|
# Demand and prefetch data reads
|
|
21.08 OFFCORE_REQUESTS.DATA_RD
|
|
|
|
# Counts demand data read requests that miss the L3 cache.
|
|
21.10 OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
|
|
|
|
# OFFCORE_REQUESTS.ALL_REQUESTS
|
|
21.80 OFFCORE_REQUESTS.ALL_REQUESTS
|
|
|
|
# L2 writebacks that access L2 cache
|
|
23.40 L2_TRANS.L2_WB
|
|
|
|
# Demand Data Read miss L2 cache
|
|
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
|
|
|
|
# RFO requests that miss L2 cache
|
|
24.22 L2_RQSTS.RFO_MISS
|
|
|
|
# L2 cache misses when fetching instructions
|
|
24.24 L2_RQSTS.CODE_RD_MISS
|
|
|
|
# Demand requests that miss L2 cache
|
|
24.27 L2_RQSTS.ALL_DEMAND_MISS
|
|
|
|
# SW prefetch requests that miss L2 cache.
|
|
24.28 L2_RQSTS.SWPF_MISS
|
|
|
|
# L2_RQSTS.HWPF_MISS
|
|
24.30 L2_RQSTS.HWPF_MISS
|
|
|
|
# Read requests with true-miss in L2 cache. [This event is alias to L2_RQSTS.MISS]
|
|
24.3F L2_REQUEST.MISS
|
|
|
|
# Read requests with true-miss in L2 cache. [This event is alias to L2_REQUEST.MISS]
|
|
24.3F L2_RQSTS.MISS
|
|
|
|
# Demand Data Read requests that hit L2 cache
|
|
24.C1 L2_RQSTS.DEMAND_DATA_RD_HIT
|
|
|
|
# RFO requests that hit L2 cache
|
|
24.C2 L2_RQSTS.RFO_HIT
|
|
|
|
# L2 cache hits when fetching instructions, code reads.
|
|
24.C4 L2_RQSTS.CODE_RD_HIT
|
|
|
|
# SW prefetch requests that hit L2 cache.
|
|
24.C8 L2_RQSTS.SWPF_HIT
|
|
|
|
# Demand Data Read access L2 cache
|
|
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
|
|
|
|
# RFO requests to L2 cache
|
|
24.E2 L2_RQSTS.ALL_RFO
|
|
|
|
# L2 code requests
|
|
24.E4 L2_RQSTS.ALL_CODE_RD
|
|
|
|
# Demand requests to L2 cache
|
|
24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
|
|
|
|
# L2_RQSTS.ALL_HWPF
|
|
24.F0 L2_RQSTS.ALL_HWPF
|
|
|
|
# All accesses to L2 cache [This event is alias to L2_RQSTS.REFERENCES]
|
|
24.FF L2_REQUEST.ALL
|
|
|
|
# All accesses to L2 cache [This event is alias to L2_REQUEST.ALL]
|
|
24.FF L2_RQSTS.REFERENCES
|
|
|
|
# L2 cache lines filling L2
|
|
25.1F L2_LINES_IN.ALL
|
|
|
|
# Non-modified cache lines that are silently dropped by L2 cache.
|
|
26.01 L2_LINES_OUT.SILENT
|
|
|
|
# Modified cache lines that are evicted by L2 cache when triggered by an L2 cache fill.
|
|
26.02 L2_LINES_OUT.NON_SILENT
|
|
|
|
# Cache lines that have been L2 hardware prefetched but not used by demand accesses
|
|
26.04 L2_LINES_OUT.USELESS_HWPF
|
|
|
|
# Counts bus locks, accounts for cache line split locks and UC locks.
|
|
2C.10 SQ_MISC.BUS_LOCK
|
|
|
|
# Cycles the uncore cannot take further requests
|
|
2D.01.CMSK=1 XQ.FULL_CYCLES
|
|
|
|
# Core-originated cacheable requests that missed L3 (Except hardware prefetches to the L3)
|
|
2E.41 LONGEST_LAT_CACHE.MISS
|
|
|
|
# Core-originated cacheable requests that refer to L3 (Except hardware prefetches to the L3)
|
|
2E.4F LONGEST_LAT_CACHE.REFERENCE
|
|
|
|
# Thread cycles when thread is not in halt state
|
|
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
# Reference cycles when the core is not in halt state.
|
|
3C.01 CPU_CLK_UNHALTED.REF_TSC_P
|
|
|
|
# Core crystal clock cycles when this thread is unhalted and the other thread is halted.
|
|
3C.02 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
|
|
|
|
# Core crystal clock cycles. Cycle counts are evenly distributed between active threads in the Core.
|
|
3C.08 CPU_CLK_UNHALTED.REF_DISTRIBUTED
|
|
|
|
# Number of PREFETCHNTA instructions executed.
|
|
40.01 SW_PREFETCH_ACCESS.NTA
|
|
|
|
# Number of PREFETCHT0 instructions executed.
|
|
40.02 SW_PREFETCH_ACCESS.T0
|
|
|
|
# Number of PREFETCHT1 or PREFETCHT2 instructions executed.
|
|
40.04 SW_PREFETCH_ACCESS.T1_T2
|
|
|
|
# Number of PREFETCHW instructions executed.
|
|
40.08 SW_PREFETCH_ACCESS.PREFETCHW
|
|
|
|
# Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed.
|
|
40.0F SW_PREFETCH_ACCESS.ANY
|
|
|
|
# Completed demand load uops that miss the L1 d-cache.
|
|
43.FD MEM_LOAD_COMPLETED.L1_MISS_ANY
|
|
|
|
# MEM_STORE_RETIRED.L2_HIT
|
|
44.01 MEM_STORE_RETIRED.L2_HIT
|
|
|
|
# Cycles while L1 cache miss demand load is outstanding.
|
|
47.02.CMSK=2 MEMORY_ACTIVITY.CYCLES_L1D_MISS
|
|
|
|
# Execution stalls while L1 cache miss demand load is outstanding.
|
|
47.03.CMSK=3 MEMORY_ACTIVITY.STALLS_L1D_MISS
|
|
|
|
# Execution stalls while L2 cache miss demand cacheable load request is outstanding.
|
|
47.05.CMSK=5 MEMORY_ACTIVITY.STALLS_L2_MISS
|
|
|
|
# Execution stalls while L3 cache miss demand cacheable load request is outstanding.
|
|
47.09.CMSK=9 MEMORY_ACTIVITY.STALLS_L3_MISS
|
|
|
|
# Number of L1D misses that are outstanding
|
|
48.01 L1D_PEND_MISS.PENDING
|
|
|
|
# Cycles with L1D load Misses outstanding.
|
|
48.01.CMSK=1 L1D_PEND_MISS.PENDING_CYCLES
|
|
|
|
# Number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability.
|
|
48.02 L1D_PEND_MISS.FB_FULL
|
|
|
|
# Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability.
|
|
48.02.CMSK=1.EDG L1D_PEND_MISS.FB_FULL_PERIODS
|
|
|
|
# Number of cycles a demand request has waited due to L1D due to lack of L2 resources.
|
|
48.04 L1D_PEND_MISS.L2_STALLS
|
|
|
|
# Counts the number of demand load dispatches that hit L1D fill buffer (FB) allocated for software prefetch.
|
|
4C.01 LOAD_HIT_PREFETCH.SWPF
|
|
|
|
# Counts the number of cache lines replaced in L1 data cache.
|
|
51.01 L1D.REPLACEMENT
|
|
|
|
# L1D.HWPF_MISS
|
|
51.20 L1D.HWPF_MISS
|
|
|
|
# Number of times a transactional abort was signaled due to a data conflict on a transactionally accessed address
|
|
54.01 TX_MEM.ABORT_CONFLICT
|
|
|
|
# Speculatively counts the number of TSX aborts due to a data capacity limitation for transactional writes.
|
|
54.02 TX_MEM.ABORT_CAPACITY_WRITE
|
|
|
|
# Speculatively counts the number of TSX aborts due to a data capacity limitation for transactional reads
|
|
54.80 TX_MEM.ABORT_CAPACITY_READ
|
|
|
|
# Clears due to Unknown Branches.
|
|
60.01 BACLEARS.ANY
|
|
|
|
# DSB-to-MITE switch true penalty cycles.
|
|
61.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
|
|
|
|
# Instruction decoders utilized in a cycle
|
|
75.01 INST_DECODED.DECODERS
|
|
|
|
# UOPS_DECODED.DEC0_UOPS
|
|
76.01 UOPS_DECODED.DEC0_UOPS
|
|
|
|
# Uops delivered to Instruction Decode Queue (IDQ) from MITE path
|
|
79.04 IDQ.MITE_UOPS
|
|
|
|
# Cycles MITE is delivering any Uop
|
|
79.04.CMSK=1 IDQ.MITE_CYCLES_ANY
|
|
|
|
# Cycles MITE is delivering optimal number of Uops
|
|
79.04.CMSK=6 IDQ.MITE_CYCLES_OK
|
|
|
|
# Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path
|
|
79.08 IDQ.DSB_UOPS
|
|
|
|
# Cycles Decode Stream Buffer (DSB) is delivering any Uop
|
|
79.08.CMSK=1 IDQ.DSB_CYCLES_ANY
|
|
|
|
# Cycles DSB is delivering optimal number of Uops
|
|
79.08.CMSK=6 IDQ.DSB_CYCLES_OK
|
|
|
|
# Uops delivered to IDQ while MS is busy
|
|
79.20 IDQ.MS_UOPS
|
|
|
|
# Cycles when uops are being delivered to IDQ while MS is busy
|
|
79.20.CMSK=1 IDQ.MS_CYCLES_ANY
|
|
|
|
# Number of switches from DSB or MITE to the MS
|
|
79.20.CMSK=1.EDG IDQ.MS_SWITCHES
|
|
|
|
# Cycles where a code fetch is stalled due to L1 instruction cache miss.
|
|
80.04 ICACHE_DATA.STALLS
|
|
|
|
# ICACHE_DATA.STALL_PERIODS
|
|
80.04.CMSK=1.EDG ICACHE_DATA.STALL_PERIODS
|
|
|
|
# Cycles where a code fetch is stalled due to L1 instruction cache tag miss.
|
|
83.04 ICACHE_TAG.STALLS
|
|
|
|
# Stalls caused by changing prefix length of the instruction.
|
|
87.01 DECODE.LCP
|
|
|
|
# Cycles the Microcode Sequencer is busy.
|
|
87.02 DECODE.MS_BUSY
|
|
|
|
# Uops not delivered by IDQ when backend of the machine is not stalled [This event is alias to IDQ_UOPS_NOT_DELIVERED.CORE]
|
|
9C.01 IDQ_BUBBLES.CORE
|
|
|
|
# Uops not delivered by IDQ when backend of the machine is not stalled [This event is alias to IDQ_BUBBLES.CORE]
|
|
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
|
|
|
|
# Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK]
|
|
9C.01.CMSK=1.INV IDQ_BUBBLES.CYCLES_FE_WAS_OK
|
|
|
|
# Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK]
|
|
9C.01.CMSK=1.INV IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK
|
|
|
|
# Cycles when no uops are delivered by the IDQ when backend of the machine is not stalled [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE]
|
|
9C.01.CMSK=6 IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE
|
|
|
|
# Cycles when no uops are delivered by the IDQ when backend of the machine is not stalled [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]
|
|
9C.01.CMSK=6 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE
|
|
|
|
# Counts cycles where the pipeline is stalled due to serializing operations.
|
|
A2.02 RESOURCE_STALLS.SCOREBOARD
|
|
|
|
# Cycles stalled due to no store buffers available (not including draining from sync).
|
|
A2.08 RESOURCE_STALLS.SB
|
|
|
|
# Cycles while L2 cache miss demand load is outstanding.
|
|
A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_MISS
|
|
|
|
# Total execution stalls.
|
|
A3.04.CMSK=4 CYCLE_ACTIVITY.STALLS_TOTAL
|
|
|
|
# Execution stalls while L2 cache miss demand load is outstanding.
|
|
A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_MISS
|
|
|
|
# Execution stalls while L3 cache miss demand load is outstanding.
|
|
A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_L3_MISS
|
|
|
|
# Cycles while L1 cache miss demand load is outstanding.
|
|
A3.08.CMSK=8 CYCLE_ACTIVITY.CYCLES_L1D_MISS
|
|
|
|
# Execution stalls while L1 cache miss demand load is outstanding.
|
|
A3.0C.CMSK=12 CYCLE_ACTIVITY.STALLS_L1D_MISS
|
|
|
|
# Cycles while memory subsystem has an outstanding load.
|
|
A3.10.CMSK=16 CYCLE_ACTIVITY.CYCLES_MEM_ANY
|
|
|
|
# TMA slots available for an unhalted logical processor. General counter - architectural event
|
|
A4.01 TOPDOWN.SLOTS_P
|
|
|
|
# TMA slots where no uops were being issued due to lack of back-end resources.
|
|
A4.02 TOPDOWN.BACKEND_BOUND_SLOTS
|
|
|
|
# TMA slots wasted due to incorrect speculations.
|
|
A4.04.CTR=0 TOPDOWN.BAD_SPEC_SLOTS
|
|
|
|
# TMA slots wasted due to incorrect speculation by branch mispredictions
|
|
A4.08.CTR=0 TOPDOWN.BR_MISPREDICT_SLOTS
|
|
|
|
# TOPDOWN.MEMORY_BOUND_SLOTS
|
|
A4.10 TOPDOWN.MEMORY_BOUND_SLOTS
|
|
|
|
# Cycles when Reservation Station (RS) is empty due to a resource in the back-end
|
|
A5.01 RS.EMPTY_RESOURCE
|
|
|
|
# Cycles when Reservation Station (RS) is empty for the thread.
|
|
A5.07 RS.EMPTY
|
|
|
|
# Counts end of periods where the Reservation Station (RS) was empty.
|
|
A5.07.CMSK=1.EDG.INV RS.EMPTY_COUNT
|
|
|
|
# Cycles total of 1 uop is executed on all ports and Reservation Station was not empty.
|
|
A6.02 EXE_ACTIVITY.1_PORTS_UTIL
|
|
|
|
# Cycles total of 2 uops are executed on all ports and Reservation Station was not empty.
|
|
A6.04 EXE_ACTIVITY.2_PORTS_UTIL
|
|
|
|
# Cycles total of 3 uops are executed on all ports and Reservation Station was not empty.
|
|
A6.08 EXE_ACTIVITY.3_PORTS_UTIL
|
|
|
|
# Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty.
|
|
A6.0C EXE_ACTIVITY.2_3_PORTS_UTIL
|
|
|
|
# Cycles total of 4 uops are executed on all ports and Reservation Station was not empty.
|
|
A6.10 EXE_ACTIVITY.4_PORTS_UTIL
|
|
|
|
# Execution stalls while memory subsystem has an outstanding load.
|
|
A6.21.CMSK=5 EXE_ACTIVITY.BOUND_ON_LOADS
|
|
|
|
# Cycles where the Store Buffer was full and no loads caused an execution stall.
|
|
A6.40.CMSK=2 EXE_ACTIVITY.BOUND_ON_STORES
|
|
|
|
# Cycles no uop executed while RS was not empty, the SB was not full and there was no outstanding load.
|
|
A6.80 EXE_ACTIVITY.EXE_BOUND_0_PORTS
|
|
|
|
# Number of Uops delivered by the LSD.
|
|
A8.01 LSD.UOPS
|
|
|
|
# Cycles Uops delivered by the LSD, but didn't come from the decoder.
|
|
A8.01.CMSK=1 LSD.CYCLES_ACTIVE
|
|
|
|
# Cycles optimal number of Uops delivered by the LSD, but did not come from the decoder.
|
|
A8.01.CMSK=6 LSD.CYCLES_OK
|
|
|
|
# Core cycles the allocator was stalled due to recovery from earlier clear event for this thread
|
|
AD.01 INT_MISC.RECOVERY_CYCLES
|
|
|
|
# Clears speculative count
|
|
AD.01.CMSK=1.EDG INT_MISC.CLEARS_COUNT
|
|
|
|
# TMA slots where uops got dropped
|
|
AD.10 INT_MISC.UOP_DROPPING
|
|
|
|
# INT_MISC.MBA_STALLS
|
|
AD.20 INT_MISC.MBA_STALLS
|
|
|
|
# Bubble cycles of BAClear (Unknown Branch).
|
|
AD.40.TakenAlone INT_MISC.UNKNOWN_BRANCH_CYCLES
|
|
|
|
# Counts cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.
|
|
AD.80 INT_MISC.CLEAR_RESTEER_CYCLES
|
|
|
|
# Uops that RAT issues to RS
|
|
AE.01 UOPS_ISSUED.ANY
|
|
|
|
# UOPS_ISSUED.CYCLES
|
|
AE.01.CMSK=1 UOPS_ISSUED.CYCLES
|
|
|
|
# ARITH.FPDIV_ACTIVE
|
|
B0.01.CMSK=1 ARITH.FPDIV_ACTIVE
|
|
|
|
# This event counts the cycles the integer divider is busy.
|
|
B0.08.CMSK=1 ARITH.IDIV_ACTIVE
|
|
|
|
# Cycles when divide unit is busy executing divide or square root operations.
|
|
B0.09.CMSK=1 ARITH.DIV_ACTIVE
|
|
|
|
# Counts the number of uops to be executed per-thread each cycle.
|
|
B1.01 UOPS_EXECUTED.THREAD
|
|
|
|
# Cycles where at least 1 uop was executed per-thread
|
|
B1.01.CMSK=1 UOPS_EXECUTED.CYCLES_GE_1
|
|
|
|
# Counts number of cycles no uops were dispatched to be executed on this thread.
|
|
B1.01.CMSK=1.INV UOPS_EXECUTED.STALLS
|
|
|
|
# Cycles where at least 2 uops were executed per-thread
|
|
B1.01.CMSK=2 UOPS_EXECUTED.CYCLES_GE_2
|
|
|
|
# Cycles where at least 3 uops were executed per-thread
|
|
B1.01.CMSK=3 UOPS_EXECUTED.CYCLES_GE_3
|
|
|
|
# Cycles where at least 4 uops were executed per-thread
|
|
B1.01.CMSK=4 UOPS_EXECUTED.CYCLES_GE_4
|
|
|
|
# Number of uops executed on the core.
|
|
B1.02 UOPS_EXECUTED.CORE
|
|
|
|
# Cycles at least 1 micro-op is executed from any thread on physical core.
|
|
B1.02.CMSK=1 UOPS_EXECUTED.CORE_CYCLES_GE_1
|
|
|
|
# Cycles at least 2 micro-ops are executed from any thread on physical core.
|
|
B1.02.CMSK=2 UOPS_EXECUTED.CORE_CYCLES_GE_2
|
|
|
|
# Cycles at least 3 micro-ops are executed from any thread on physical core.
|
|
B1.02.CMSK=3 UOPS_EXECUTED.CORE_CYCLES_GE_3
|
|
|
|
# Cycles at least 4 micro-ops are executed from any thread on physical core.
|
|
B1.02.CMSK=4 UOPS_EXECUTED.CORE_CYCLES_GE_4
|
|
|
|
# Counts the number of x87 uops dispatched.
|
|
B1.10 UOPS_EXECUTED.X87
|
|
|
|
# Uops executed on port 0
|
|
B2.01 UOPS_DISPATCHED.PORT_0
|
|
|
|
# Uops executed on port 1
|
|
B2.02 UOPS_DISPATCHED.PORT_1
|
|
|
|
# Uops executed on ports 2, 3 and 10
|
|
B2.04 UOPS_DISPATCHED.PORT_2_3_10
|
|
|
|
# Uops executed on ports 4 and 9
|
|
B2.10 UOPS_DISPATCHED.PORT_4_9
|
|
|
|
# Uops executed on ports 5 and 11
|
|
B2.20 UOPS_DISPATCHED.PORT_5_11
|
|
|
|
# Uops executed on port 6
|
|
B2.40 UOPS_DISPATCHED.PORT_6
|
|
|
|
# Uops executed on ports 7 and 8
|
|
B2.80 UOPS_DISPATCHED.PORT_7_8
|
|
|
|
# FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]
|
|
B3.01 FP_ARITH_DISPATCHED.PORT_0
|
|
|
|
# FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]
|
|
B3.01 FP_ARITH_DISPATCHED.V0
|
|
|
|
# FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]
|
|
B3.02 FP_ARITH_DISPATCHED.PORT_1
|
|
|
|
# FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]
|
|
B3.02 FP_ARITH_DISPATCHED.V1
|
|
|
|
# FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]
|
|
B3.04 FP_ARITH_DISPATCHED.PORT_5
|
|
|
|
# FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]
|
|
B3.04 FP_ARITH_DISPATCHED.V2
|
|
|
|
# Counts the cycles where the AMX (Advanced Matrix Extensions) unit is busy performing an operation.
|
|
B7.02 EXE.AMX_BUSY
|
|
|
|
# Number of instructions retired. General Counter - architectural event
|
|
C0.00 INST_RETIRED.ANY_P
|
|
|
|
# Retired NOP instructions.
|
|
C0.02 INST_RETIRED.NOP
|
|
|
|
# Iterations of Repeat string retired instructions.
|
|
C0.08 INST_RETIRED.REP_ITERATION
|
|
|
|
# INST_RETIRED.MACRO_FUSED
|
|
C0.10 INST_RETIRED.MACRO_FUSED
|
|
|
|
# Counts all microcode FP assists.
|
|
C1.02 ASSISTS.FP
|
|
|
|
# ASSISTS.PAGE_FAULT
|
|
C1.08 ASSISTS.PAGE_FAULT
|
|
|
|
# ASSISTS.SSE_AVX_MIX
|
|
C1.10 ASSISTS.SSE_AVX_MIX
|
|
|
|
# Number of occurrences where a microcode assist is invoked by hardware.
|
|
C1.1B ASSISTS.ANY
|
|
|
|
# Retired uops except the last uop of each instruction.
|
|
C2.01 UOPS_RETIRED.HEAVY
|
|
|
|
# Retirement slots used.
|
|
C2.02 UOPS_RETIRED.SLOTS
|
|
|
|
# Cycles with retired uop(s).
|
|
C2.02.CMSK=1 UOPS_RETIRED.CYCLES
|
|
|
|
# Cycles without actually retired uops.
|
|
C2.02.CMSK=1.INV UOPS_RETIRED.STALLS
|
|
|
|
# UOPS_RETIRED.MS
|
|
C2.04.TakenAlone UOPS_RETIRED.MS
|
|
|
|
# Number of machine clears (nukes) of any type.
|
|
C3.01.CMSK=1.EDG MACHINE_CLEARS.COUNT
|
|
|
|
# Number of machine clears due to memory ordering conflicts.
|
|
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
|
|
|
# Self-modifying code (SMC) detected.
|
|
C3.04 MACHINE_CLEARS.SMC
|
|
|
|
# All branch instructions retired.
|
|
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
|
|
|
# Taken conditional branch instructions retired.
|
|
C4.01 BR_INST_RETIRED.COND_TAKEN
|
|
|
|
# Direct and indirect near call instructions retired.
|
|
C4.02 BR_INST_RETIRED.NEAR_CALL
|
|
|
|
# Return instructions retired.
|
|
C4.08 BR_INST_RETIRED.NEAR_RETURN
|
|
|
|
# Not taken branch instructions retired.
|
|
C4.10 BR_INST_RETIRED.COND_NTAKEN
|
|
|
|
# Conditional branch instructions retired.
|
|
C4.11 BR_INST_RETIRED.COND
|
|
|
|
# Taken branch instructions retired.
|
|
C4.20 BR_INST_RETIRED.NEAR_TAKEN
|
|
|
|
# Far branch instructions retired.
|
|
C4.40 BR_INST_RETIRED.FAR_BRANCH
|
|
|
|
# Indirect near branch instructions retired (excluding returns)
|
|
C4.80 BR_INST_RETIRED.INDIRECT
|
|
|
|
# All mispredicted branch instructions retired.
|
|
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
|
|
|
# Number of branch instructions retired that were mispredicted and taken.
|
|
C5.01 BR_MISP_RETIRED.COND_TAKEN
|
|
|
|
# Mispredicted indirect CALL retired.
|
|
C5.02 BR_MISP_RETIRED.INDIRECT_CALL
|
|
|
|
# This event counts the number of mispredicted ret instructions retired. Non PEBS
|
|
C5.08 BR_MISP_RETIRED.RET
|
|
|
|
# Mispredicted non-taken conditional branch instructions retired.
|
|
C5.10 BR_MISP_RETIRED.COND_NTAKEN
|
|
|
|
# Mispredicted conditional branch instructions retired.
|
|
C5.11 BR_MISP_RETIRED.COND
|
|
|
|
# Number of near branch instructions retired that were mispredicted and taken.
|
|
C5.20 BR_MISP_RETIRED.NEAR_TAKEN
|
|
|
|
# Mispredicted near indirect branch instructions retired (excluding returns)
|
|
C5.80 BR_MISP_RETIRED.INDIRECT
|
|
|
|
# Retired instructions that experienced a DSB miss.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.ANY_DSB_MISS
|
|
|
|
# Retired instructions that experienced a critical DSB miss.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.DSB_MISS
|
|
|
|
# Retired instructions that experienced an iTLB true miss.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.ITLB_MISS
|
|
|
|
# Retired instructions that experienced an instruction L1 cache true miss.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.L1I_MISS
|
|
|
|
# Retired instructions that experienced an instruction L2 cache true miss.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.L2_MISS
|
|
|
|
# Retired instructions after front-end starvation of at least 1 cycle
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_1
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_128
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 16 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_16
|
|
|
|
# Retired instructions after front-end starvation of at least 2 cycles
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_2
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_256
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end had at least 1 bubble-slot for a period of 2 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 32 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_32
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_4
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_512
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_64
|
|
|
|
# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 8 cycles which was not interrupted by a back-end stall.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_8
|
|
|
|
# FRONTEND_RETIRED.MS_FLOWS
|
|
C6.01.TakenAlone FRONTEND_RETIRED.MS_FLOWS
|
|
|
|
# Retired instructions that experienced an STLB (2nd level TLB) true miss.
|
|
C6.01.TakenAlone FRONTEND_RETIRED.STLB_MISS
|
|
|
|
# FRONTEND_RETIRED.UNKNOWN_BRANCH
|
|
C6.01.TakenAlone FRONTEND_RETIRED.UNKNOWN_BRANCH
|
|
|
|
# Counts number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
|
|
C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE
|
|
|
|
# Counts number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
|
|
C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE
|
|
|
|
# Number of SSE/AVX computational scalar floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 RANGE SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
|
|
C7.03 FP_ARITH_INST_RETIRED.SCALAR
|
|
|
|
# Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
|
|
C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE
|
|
|
|
# Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
|
|
C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE
|
|
|
|
# Counts number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
|
|
C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE
|
|
|
|
# Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.
|
|
C7.18 FP_ARITH_INST_RETIRED.4_FLOPS
|
|
|
|
# Counts number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
|
|
C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE
|
|
|
|
# Counts number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
|
|
C7.40 FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE
|
|
|
|
# Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.
|
|
C7.60 FP_ARITH_INST_RETIRED.8_FLOPS
|
|
|
|
# Counts number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
|
|
C7.80 FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE
|
|
|
|
# Number of any Vector retired FP arithmetic instructions
|
|
C7.FC FP_ARITH_INST_RETIRED.VECTOR
|
|
|
|
# Number of times an RTM execution started.
|
|
C9.01 RTM_RETIRED.START
|
|
|
|
# Number of times an RTM execution successfully committed
|
|
C9.02 RTM_RETIRED.COMMIT
|
|
|
|
# Number of times an RTM execution aborted.
|
|
C9.04 RTM_RETIRED.ABORTED
|
|
|
|
# Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts)
|
|
C9.08 RTM_RETIRED.ABORTED_MEM
|
|
|
|
# Number of times an RTM execution aborted due to HLE-unfriendly instructions
|
|
C9.20 RTM_RETIRED.ABORTED_UNFRIENDLY
|
|
|
|
# Number of times an RTM execution aborted due to incompatible memory type
|
|
C9.40 RTM_RETIRED.ABORTED_MEMTYPE
|
|
|
|
# Number of times an RTM execution aborted due to none of the previous 3 categories (e.g. interrupt)
|
|
C9.80 RTM_RETIRED.ABORTED_EVENTS
|
|
|
|
# Increments whenever there is an update to the LBR array.
|
|
CC.20 MISC_RETIRED.LBR_INSERTS
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles.
|
|
CD.01.MSR_3F6H=0x10.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles.
|
|
CD.01.MSR_3F6H=0x100.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles.
|
|
CD.01.MSR_3F6H=0x20.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles.
|
|
CD.01.MSR_3F6H=0x200.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles.
|
|
CD.01.MSR_3F6H=0x4.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles.
|
|
CD.01.MSR_3F6H=0x40.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles.
|
|
CD.01.MSR_3F6H=0x400.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_1024
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles.
|
|
CD.01.MSR_3F6H=0x8.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8
|
|
|
|
# Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles.
|
|
CD.01.MSR_3F6H=0x80.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128
|
|
|
|
# Retired memory store access operations. A PDist event for PEBS Store Latency Facility.
|
|
CD.02.CTR=0 MEM_TRANS_RETIRED.STORE_SAMPLE
|
|
|
|
# FP_ARITH_INST_RETIRED2.SCALAR_HALF
|
|
CF.01 FP_ARITH_INST_RETIRED2.SCALAR_HALF
|
|
|
|
# FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF
|
|
CF.02 FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF
|
|
|
|
# Number of all Scalar Half-Precision FP arithmetic instructions retired - regular and complex.
|
|
CF.03 FP_ARITH_INST_RETIRED2.SCALAR
|
|
|
|
# FP_ARITH_INST_RETIRED2.128B_PACKED_HALF
|
|
CF.04 FP_ARITH_INST_RETIRED2.128B_PACKED_HALF
|
|
|
|
# FP_ARITH_INST_RETIRED2.256B_PACKED_HALF
|
|
CF.08 FP_ARITH_INST_RETIRED2.256B_PACKED_HALF
|
|
|
|
# FP_ARITH_INST_RETIRED2.512B_PACKED_HALF
|
|
CF.10 FP_ARITH_INST_RETIRED2.512B_PACKED_HALF
|
|
|
|
# Number of all Vector (also called packed) Half-Precision FP arithmetic instructions retired.
|
|
CF.1C FP_ARITH_INST_RETIRED2.VECTOR
|
|
|
|
# Retired load instructions that miss the STLB.
|
|
D0.11 MEM_INST_RETIRED.STLB_MISS_LOADS
|
|
|
|
# Retired store instructions that miss the STLB.
|
|
D0.12 MEM_INST_RETIRED.STLB_MISS_STORES
|
|
|
|
# Retired load instructions with locked access.
|
|
D0.21 MEM_INST_RETIRED.LOCK_LOADS
|
|
|
|
# Retired load instructions that split across a cacheline boundary.
|
|
D0.41 MEM_INST_RETIRED.SPLIT_LOADS
|
|
|
|
# Retired store instructions that split across a cacheline boundary.
|
|
D0.42 MEM_INST_RETIRED.SPLIT_STORES
|
|
|
|
# Retired load instructions.
|
|
D0.81 MEM_INST_RETIRED.ALL_LOADS
|
|
|
|
# Retired store instructions.
|
|
D0.82 MEM_INST_RETIRED.ALL_STORES
|
|
|
|
# All retired memory instructions.
|
|
D0.83 MEM_INST_RETIRED.ANY
|
|
|
|
# Retired load instructions with L1 cache hits as data sources
|
|
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
|
|
|
# Retired load instructions with L2 cache hits as data sources
|
|
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
|
|
|
# Retired load instructions with L3 cache hits as data sources
|
|
D1.04 MEM_LOAD_RETIRED.L3_HIT
|
|
|
|
# Retired load instructions that missed the L1 cache as data source
|
|
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
|
|
|
# Retired load instructions that missed the L2 cache as data source
|
|
D1.10 MEM_LOAD_RETIRED.L2_MISS
|
|
|
|
# Retired load instructions that missed the L3 cache as data source
|
|
D1.20 MEM_LOAD_RETIRED.L3_MISS
|
|
|
|
# Number of completed demand load requests that missed the L1, but hit the FB(fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.
|
|
D1.40 MEM_LOAD_RETIRED.FB_HIT
|
|
|
|
# Retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache.
|
|
D2.01 MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS
|
|
|
|
# Retired load instructions whose data sources were L3 hit and cross-core snoop hit in on-pkg core cache
|
|
D2.02 MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD
|
|
|
|
# Retired load instructions whose data sources were HitM responses from shared L3
|
|
D2.04 MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD
|
|
|
|
# Retired load instructions whose data sources were hits in L3 without snoops required
|
|
D2.08 MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE
|
|
|
|
# Retired load instructions whose data sources missed L3 but were serviced from local DRAM
|
|
D3.01 MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM
|
|
|
|
# MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM
|
|
D3.02 MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM
|
|
|
|
# MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM
|
|
D3.04 MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM
|
|
|
|
# Retired load instructions whose data sources were forwarded from a remote cache
|
|
D3.08 MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD
|
|
|
|
# Retired instructions with at least 1 uncacheable load or lock.
|
|
D4.04 MEM_LOAD_MISC_RETIRED.UC
|
|
|
|
# LFENCE instructions retired
|
|
E0.20 MISC2_RETIRED.LFENCE
|
|
|
|
# Retired memory uops for any access
|
|
E5.03 MEM_UOP_RETIRED.ANY
|
|
|
|
# integer ADD, SUB, SAD 128-bit vector instructions.
|
|
E7.03 INT_VEC_RETIRED.ADD_128
|
|
|
|
# integer ADD, SUB, SAD 256-bit vector instructions.
|
|
E7.0C INT_VEC_RETIRED.ADD_256
|
|
|
|
# INT_VEC_RETIRED.VNNI_128
|
|
E7.10 INT_VEC_RETIRED.VNNI_128
|
|
|
|
# INT_VEC_RETIRED.128BIT
|
|
E7.13 INT_VEC_RETIRED.128BIT
|
|
|
|
# INT_VEC_RETIRED.VNNI_256
|
|
E7.20 INT_VEC_RETIRED.VNNI_256
|
|
|
|
# INT_VEC_RETIRED.SHUFFLES
|
|
E7.40 INT_VEC_RETIRED.SHUFFLES
|
|
|
|
# INT_VEC_RETIRED.MUL_256
|
|
E7.80 INT_VEC_RETIRED.MUL_256
|
|
|
|
# INT_VEC_RETIRED.256BIT
|
|
E7.AC INT_VEC_RETIRED.256BIT
|
|
|
|
# Cycle counts are evenly distributed between active threads in the Core.
|
|
EC.02 CPU_CLK_UNHALTED.DISTRIBUTED
|
|
|
|
# Core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state.
|
|
EC.10 CPU_CLK_UNHALTED.C01
|
|
|
|
# Core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state.
|
|
EC.20 CPU_CLK_UNHALTED.C02
|
|
|
|
# CPU_CLK_UNHALTED.PAUSE
|
|
EC.40 CPU_CLK_UNHALTED.PAUSE
|
|
|
|
# CPU_CLK_UNHALTED.PAUSE_INST
|
|
EC.40.CMSK=1.EDG CPU_CLK_UNHALTED.PAUSE_INST
|
|
|
|
# Core clocks when the thread is in the C0.1 or C0.2 or running a PAUSE in C0 ACPI state.
|
|
EC.70 CPU_CLK_UNHALTED.C0_WAIT
|