nanoBench/configs/cfg_Broadwell_all_core.txt

# Based on https://download.01.org/perfmon/BDW/broadwell_core_v26.json
# Applies to processors with family-model in {6-3D, 6-47}

# Cases when loads get true Block-on-Store blocking code preventing store forwarding
03.02 LD_BLOCKS.STORE_FORWARD

# This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.
03.08 LD_BLOCKS.NO_SR

# Speculative cache line split load uops dispatched to L1 cache
05.01 MISALIGN_MEM_REF.LOADS

# Speculative cache line split STA uops dispatched to L1 cache
05.02 MISALIGN_MEM_REF.STORES

# False dependencies in MOB due to partial compare
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS

# Load misses in all DTLB levels that cause page walks
08.01 DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK

# Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (4K).
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K

# Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (2M/4M).
08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M

# Load miss in all TLB levels causes a page walk that completes. (1G)
08.08 DTLB_LOAD_MISSES.WALK_COMPLETED_1G

# Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes of any page size.
08.0E DTLB_LOAD_MISSES.WALK_COMPLETED

# Cycles when PMH is busy with page walks
08.10 DTLB_LOAD_MISSES.WALK_DURATION

# Load misses that miss the DTLB and hit the STLB (4K).
08.20 DTLB_LOAD_MISSES.STLB_HIT_4K

# Load misses that miss the DTLB and hit the STLB (2M).
08.40 DTLB_LOAD_MISSES.STLB_HIT_2M

# Load operations that miss the first DTLB level but hit the second and do not cause page walks.
08.60 DTLB_LOAD_MISSES.STLB_HIT

# Core cycles the allocator was stalled due to recovery from earlier clear event for this thread (e.g. misprediction or memory nuke)
0D.03.CMSK=1 INT_MISC.RECOVERY_CYCLES

# Core cycles the allocator was stalled due to recovery from earlier clear event for any thread running on the physical core (e.g. misprediction or memory nuke).
0D.03.CMSK=1.AnyT INT_MISC.RECOVERY_CYCLES_ANY

# Cycles when Resource Allocation Table (RAT) external stall is sent to Instruction Decode Queue (IDQ) for the thread
0D.08 INT_MISC.RAT_STALL_CYCLES

# Uops that Resource Allocation Table (RAT) issues to Reservation Station (RS)
0E.01 UOPS_ISSUED.ANY

# Cycles when Resource Allocation Table (RAT) does not issue Uops to Reservation Station (RS) for the thread
0E.01.CMSK=1.INV UOPS_ISSUED.STALL_CYCLES

# Number of flags-merge uops being allocated. Such uops considered perf sensitive; added by GSR u-arch.
0E.10 UOPS_ISSUED.FLAGS_MERGE

# Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not.
0E.20 UOPS_ISSUED.SLOW_LEA

# Number of Multiply packed/scalar single precision uops allocated.
0E.40 UOPS_ISSUED.SINGLE_MUL

# Cycles when divider is busy executing divide operations
14.01 ARITH.FPU_DIV_ACTIVE

# Demand Data Read miss L2, no rejects
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS

# RFO requests that miss L2 cache.
24.22 L2_RQSTS.RFO_MISS

# L2 cache misses when fetching instructions.
24.24 L2_RQSTS.CODE_RD_MISS

# Demand requests that miss L2 cache.
24.27 L2_RQSTS.ALL_DEMAND_MISS

# L2 prefetch requests that miss L2 cache
24.30 L2_RQSTS.L2_PF_MISS

# All requests that miss L2 cache.
24.3F L2_RQSTS.MISS

# Demand Data Read requests that hit L2 cache
24.C1 L2_RQSTS.DEMAND_DATA_RD_HIT

# RFO requests that hit L2 cache.
24.C2 L2_RQSTS.RFO_HIT

# L2 cache hits when fetching instructions, code reads.
24.C4 L2_RQSTS.CODE_RD_HIT

# L2 prefetch requests that hit L2 cache
24.D0 L2_RQSTS.L2_PF_HIT

# Demand Data Read requests
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD

# RFO requests to L2 cache
24.E2 L2_RQSTS.ALL_RFO

# L2 code requests
24.E4 L2_RQSTS.ALL_CODE_RD

# Demand requests to L2 cache.
24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES

# Requests from L2 hardware prefetchers
24.F8 L2_RQSTS.ALL_PF

# All L2 requests.
24.FF L2_RQSTS.REFERENCES

# Not rejected writebacks that hit L2 cache
27.50 L2_DEMAND_RQSTS.WB_HIT

# Core-originated cacheable demand requests missed L3
2E.41 LONGEST_LAT_CACHE.MISS

# Core-originated cacheable demand requests that refer to L3
2E.4F LONGEST_LAT_CACHE.REFERENCE

# Thread cycles when thread is not in halt state
3C.00 CPU_CLK_UNHALTED.THREAD_P

# Core cycles when at least one thread on the physical core is not in halt state.
3C.00.AnyT CPU_CLK_UNHALTED.THREAD_P_ANY

# Reference cycles when the thread is unhalted (counts at 100 MHz rate)
3C.01 CPU_CLK_THREAD_UNHALTED.REF_XCLK

# Reference cycles when the thread is unhalted (counts at 100 MHz rate)
3C.01 CPU_CLK_UNHALTED.REF_XCLK

# Reference cycles when at least one thread on the physical core is unhalted (counts at 100 MHz rate).
3C.01.AnyT CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY

# Reference cycles when at least one thread on the physical core is unhalted (counts at 100 MHz rate).
3C.01.AnyT CPU_CLK_UNHALTED.REF_XCLK_ANY

# Count XClk pulses when this thread is unhalted and the other thread is halted.
3C.02 CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE

# Count XClk pulses when this thread is unhalted and the other thread is halted.
3C.02 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE

# Cycles with L1D load Misses outstanding from any thread on physical core.
48.01.CMSK=1.AnyT.CTR=2 L1D_PEND_MISS.PENDING_CYCLES_ANY

# Cycles with L1D load Misses outstanding.
48.01.CMSK=1.CTR=2 L1D_PEND_MISS.PENDING_CYCLES

# L1D miss outstanding duration in cycles
48.01.CTR=2 L1D_PEND_MISS.PENDING

# Cycles a demand request was blocked due to Fill Buffers unavailability.
48.02.CMSK=1 L1D_PEND_MISS.FB_FULL

# Store misses in all DTLB levels that cause page walks
49.01 DTLB_STORE_MISSES.MISS_CAUSES_A_WALK

# Store miss in all TLB levels causes a page walk that completes. (4K)
49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K

# Store misses in all DTLB levels that cause completed page walks (2M/4M)
49.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M

# Store misses in all DTLB levels that cause completed page walks (1G)
49.08 DTLB_STORE_MISSES.WALK_COMPLETED_1G

# Store misses in all DTLB levels that cause completed page walks.
49.0E DTLB_STORE_MISSES.WALK_COMPLETED

# Cycles when PMH is busy with page walks
49.10 DTLB_STORE_MISSES.WALK_DURATION

# Store misses that miss the DTLB and hit the STLB (4K).
49.20 DTLB_STORE_MISSES.STLB_HIT_4K

# Store misses that miss the DTLB and hit the STLB (2M).
49.40 DTLB_STORE_MISSES.STLB_HIT_2M

# Store operations that miss the first TLB level but hit the second and do not cause page walks.
49.60 DTLB_STORE_MISSES.STLB_HIT

# Not software-prefetch load dispatches that hit FB allocated for software prefetch
4C.01 LOAD_HIT_PRE.SW_PF

# Not software-prefetch load dispatches that hit FB allocated for hardware prefetch
4C.02 LOAD_HIT_PRE.HW_PF

# Cycle count for an Extended Page table walk.
4F.10 EPT.WALK_CYCLES

# L1D data line replacements
51.01 L1D.REPLACEMENT

# Number of times a TSX line had a cache conflict
54.01 TX_MEM.ABORT_CONFLICT

# Number of times a TSX Abort was triggered due to an evicted line caused by a transaction overflow
54.02 TX_MEM.ABORT_CAPACITY_WRITE

# Number of times a TSX Abort was triggered due to a non-release/commit store to lock
54.04 TX_MEM.ABORT_HLE_STORE_TO_ELIDED_LOCK

# Number of times a TSX Abort was triggered due to commit but Lock Buffer not empty
54.08 TX_MEM.ABORT_HLE_ELISION_BUFFER_NOT_EMPTY

# Number of times a TSX Abort was triggered due to release/commit but data and address mismatch
54.10 TX_MEM.ABORT_HLE_ELISION_BUFFER_MISMATCH

# Number of times a TSX Abort was triggered due to attempting an unsupported alignment from Lock Buffer
54.20 TX_MEM.ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT

# Number of times we could not allocate Lock Buffer
54.40 TX_MEM.HLE_ELISION_BUFFER_FULL

# Number of integer Move Elimination candidate uops that were eliminated.
58.01 MOVE_ELIMINATION.INT_ELIMINATED

# Number of SIMD Move Elimination candidate uops that were eliminated.
58.02 MOVE_ELIMINATION.SIMD_ELIMINATED

# Number of integer Move Elimination candidate uops that were not eliminated.
58.04 MOVE_ELIMINATION.INT_NOT_ELIMINATED

# Number of SIMD Move Elimination candidate uops that were not eliminated.
58.08 MOVE_ELIMINATION.SIMD_NOT_ELIMINATED

# Unhalted core cycles when the thread is in ring 0
5C.01 CPL_CYCLES.RING0

# Number of intervals between processor halts while thread is in ring 0
5C.01.CMSK=1.EDG CPL_CYCLES.RING0_TRANS

# Unhalted core cycles when thread is in rings 1, 2, or 3
5C.02 CPL_CYCLES.RING123

# Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution, it may not always cause a transactional abort.
5D.01 TX_EXEC.MISC1

# Counts the number of times a class of instructions (e.g., vzeroupper) that may cause a transactional abort was executed inside a transactional region
5D.02 TX_EXEC.MISC2

# Counts the number of times an instruction execution caused the transactional nest count supported to be exceeded
5D.04 TX_EXEC.MISC3

# Counts the number of times a XBEGIN instruction was executed inside an HLE transactional region.
5D.08 TX_EXEC.MISC4

# Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region.
5D.10 TX_EXEC.MISC5

# Cycles when Reservation Station (RS) is empty for the thread
5E.01 RS_EVENTS.EMPTY_CYCLES

# Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues.
5E.01.CMSK=1.EDG.INV RS_EVENTS.EMPTY_END

# Offcore outstanding Demand Data Read transactions in uncore queue.
60.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD

# Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore
60.01.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD

# Cycles with at least 6 offcore outstanding Demand Data Read transactions in uncore queue.
60.01.CMSK=6 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6

# Offcore outstanding code reads transactions in SuperQueue (SQ), queue to uncore, every cycle
60.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD

# Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore
60.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO

# Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle
60.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO

# Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD

# Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore
60.08.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD

# Cycles when L1 and L2 are locked due to UC or split lock
63.01 LOCK_CYCLES.SPLIT_LOCK_UC_LOCK_DURATION

# Cycles when L1D is locked
63.02 LOCK_CYCLES.CACHE_LOCK_DURATION

# Instruction Decode Queue (IDQ) empty cycles
79.02 IDQ.EMPTY

# Uops delivered to Instruction Decode Queue (IDQ) from MITE path
79.04 IDQ.MITE_UOPS

# Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path
79.04.CMSK=1 IDQ.MITE_CYCLES

# Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path
79.08 IDQ.DSB_UOPS

# Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path
79.08.CMSK=1 IDQ.DSB_CYCLES

# Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.10 IDQ.MS_DSB_UOPS

# Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.10.CMSK=1 IDQ.MS_DSB_CYCLES

# Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequencer (MS) is busy
79.10.CMSK=1.EDG IDQ.MS_DSB_OCCUR

# Cycles Decode Stream Buffer (DSB) is delivering any Uop
79.18.CMSK=1 IDQ.ALL_DSB_CYCLES_ANY_UOPS

# Cycles Decode Stream Buffer (DSB) is delivering 4 Uops
79.18.CMSK=4 IDQ.ALL_DSB_CYCLES_4_UOPS

# Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.20 IDQ.MS_MITE_UOPS

# Cycles MITE is delivering any Uop
79.24.CMSK=1 IDQ.ALL_MITE_CYCLES_ANY_UOPS

# Cycles MITE is delivering 4 Uops
79.24.CMSK=4 IDQ.ALL_MITE_CYCLES_4_UOPS

# Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.30 IDQ.MS_UOPS

# Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
79.30.CMSK=1 IDQ.MS_CYCLES

# Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.
79.30.CMSK=1.EDG IDQ.MS_SWITCHES

# Uops delivered to Instruction Decode Queue (IDQ) from MITE path
79.3C IDQ.MITE_ALL_UOPS

# Number of Instruction Cache, Streaming Buffer and Victim Cache Reads, both cacheable and noncacheable, including UC fetches
80.01 ICACHE.HIT

# Number of Instruction Cache, Streaming Buffer and Victim Cache Misses. Includes Uncacheable accesses.
80.02 ICACHE.MISSES

# Cycles where a code fetch is stalled due to L1 instruction-cache miss.
80.04 ICACHE.IFDATA_STALL

# Misses at all ITLB levels that cause page walks
85.01 ITLB_MISSES.MISS_CAUSES_A_WALK

# Code miss in all TLB levels causes a page walk that completes. (4K)
85.02 ITLB_MISSES.WALK_COMPLETED_4K

# Code miss in all TLB levels causes a page walk that completes. (2M/4M)
85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M

# Code miss in all TLB levels causes a page walk that completes. (1G)
85.08 ITLB_MISSES.WALK_COMPLETED_1G

# Misses in all ITLB levels that cause completed page walks.
85.0E ITLB_MISSES.WALK_COMPLETED

# Cycles when PMH is busy with page walks
85.10 ITLB_MISSES.WALK_DURATION

# Code misses that miss the ITLB and hit the STLB (4K).
85.20 ITLB_MISSES.STLB_HIT_4K

# Code misses that miss the ITLB and hit the STLB (2M).
85.40 ITLB_MISSES.STLB_HIT_2M

# Operations that miss the first ITLB level but hit the second and do not cause any page walks.
85.60 ITLB_MISSES.STLB_HIT

# Stalls caused by changing prefix length of the instruction.
87.01 ILD_STALL.LCP

# Not taken macro-conditional branches
88.41 BR_INST_EXEC.NONTAKEN_CONDITIONAL

# Taken speculative and retired macro-conditional branches
88.81 BR_INST_EXEC.TAKEN_CONDITIONAL

# Taken speculative and retired macro-unconditional branch instructions excluding calls and indirects
88.82 BR_INST_EXEC.TAKEN_DIRECT_JUMP

# Taken speculative and retired indirect branches excluding calls and returns
88.84 BR_INST_EXEC.TAKEN_INDIRECT_JUMP_NON_CALL_RET

# Taken speculative and retired indirect branches with return mnemonic
88.88 BR_INST_EXEC.TAKEN_INDIRECT_NEAR_RETURN

# Taken speculative and retired direct near calls
88.90 BR_INST_EXEC.TAKEN_DIRECT_NEAR_CALL

# Taken speculative and retired indirect calls
88.A0 BR_INST_EXEC.TAKEN_INDIRECT_NEAR_CALL

# Speculative and retired macro-conditional branches
88.C1 BR_INST_EXEC.ALL_CONDITIONAL

# Speculative and retired macro-unconditional branches excluding calls and indirects
88.C2 BR_INST_EXEC.ALL_DIRECT_JMP

# Speculative and retired indirect branches excluding calls and returns
88.C4 BR_INST_EXEC.ALL_INDIRECT_JUMP_NON_CALL_RET

# Speculative and retired indirect return branches.
88.C8 BR_INST_EXEC.ALL_INDIRECT_NEAR_RETURN

# Speculative and retired direct near calls
88.D0 BR_INST_EXEC.ALL_DIRECT_NEAR_CALL

# Speculative and retired branches
88.FF BR_INST_EXEC.ALL_BRANCHES

# Not taken speculative and retired mispredicted macro conditional branches
89.41 BR_MISP_EXEC.NONTAKEN_CONDITIONAL

# Taken speculative and retired mispredicted macro conditional branches
89.81 BR_MISP_EXEC.TAKEN_CONDITIONAL

# Taken speculative and retired mispredicted indirect branches excluding calls and returns
89.84 BR_MISP_EXEC.TAKEN_INDIRECT_JUMP_NON_CALL_RET

# Taken speculative and retired mispredicted indirect branches with return mnemonic
89.88 BR_MISP_EXEC.TAKEN_RETURN_NEAR

# Taken speculative and retired mispredicted indirect calls.
89.A0 BR_MISP_EXEC.TAKEN_INDIRECT_NEAR_CALL

# Speculative and retired mispredicted macro conditional branches
89.C1 BR_MISP_EXEC.ALL_CONDITIONAL

# Mispredicted indirect branches excluding calls and returns
89.C4 BR_MISP_EXEC.ALL_INDIRECT_JUMP_NON_CALL_RET

# Speculative and retired mispredicted branches
89.FF BR_MISP_EXEC.ALL_BRANCHES

# Uops not delivered to Resource Allocation Table (RAT) per thread when backend of the machine is not stalled
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE

# Cycles with 3 or fewer uops delivered by the front end.
9C.01.CMSK=1 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_3_UOP_DELIV.CORE

# Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.
9C.01.CMSK=1.INV IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK

# Cycles with 2 or fewer uops delivered by the front end.
9C.01.CMSK=2 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_2_UOP_DELIV.CORE

# Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled
9C.01.CMSK=3 IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_1_UOP_DELIV.CORE

# Cycles per thread when 4 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled
9C.01.CMSK=4 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE

# Micro-op dispatches cancelled due to insufficient SIMD physical register file read ports
A0.03 UOP_DISPATCHES_CANCELLED.SIMD_PRF

# Cycles per thread when uops are executed in port 0
A1.01 UOPS_DISPATCHED_PORT.PORT_0

# Cycles per thread when uops are executed in port 0
A1.01 UOPS_EXECUTED_PORT.PORT_0

# Cycles per core when uops are executed in port 0.
A1.01.AnyT UOPS_EXECUTED_PORT.PORT_0_CORE

# Cycles per thread when uops are executed in port 1
A1.02 UOPS_DISPATCHED_PORT.PORT_1

# Cycles per thread when uops are executed in port 1
A1.02 UOPS_EXECUTED_PORT.PORT_1

# Cycles per core when uops are executed in port 1.
A1.02.AnyT UOPS_EXECUTED_PORT.PORT_1_CORE

# Cycles per thread when uops are executed in port 2
A1.04 UOPS_DISPATCHED_PORT.PORT_2

# Cycles per thread when uops are executed in port 2
A1.04 UOPS_EXECUTED_PORT.PORT_2

# Cycles per core when uops are dispatched to port 2.
A1.04.AnyT UOPS_EXECUTED_PORT.PORT_2_CORE

# Cycles per thread when uops are executed in port 3
A1.08 UOPS_DISPATCHED_PORT.PORT_3

# Cycles per thread when uops are executed in port 3
A1.08 UOPS_EXECUTED_PORT.PORT_3

# Cycles per core when uops are dispatched to port 3.
A1.08.AnyT UOPS_EXECUTED_PORT.PORT_3_CORE

# Cycles per thread when uops are executed in port 4
A1.10 UOPS_DISPATCHED_PORT.PORT_4

# Cycles per thread when uops are executed in port 4
A1.10 UOPS_EXECUTED_PORT.PORT_4

# Cycles per core when uops are executed in port 4.
A1.10.AnyT UOPS_EXECUTED_PORT.PORT_4_CORE

# Cycles per thread when uops are executed in port 5
A1.20 UOPS_DISPATCHED_PORT.PORT_5

# Cycles per thread when uops are executed in port 5
A1.20 UOPS_EXECUTED_PORT.PORT_5

# Cycles per core when uops are executed in port 5.
A1.20.AnyT UOPS_EXECUTED_PORT.PORT_5_CORE

# Cycles per thread when uops are executed in port 6
A1.40 UOPS_DISPATCHED_PORT.PORT_6

# Cycles per thread when uops are executed in port 6
A1.40 UOPS_EXECUTED_PORT.PORT_6

# Cycles per core when uops are executed in port 6.
A1.40.AnyT UOPS_EXECUTED_PORT.PORT_6_CORE

# Cycles per thread when uops are executed in port 7
A1.80 UOPS_DISPATCHED_PORT.PORT_7

# Cycles per thread when uops are executed in port 7
A1.80 UOPS_EXECUTED_PORT.PORT_7

# Cycles per core when uops are dispatched to port 7.
A1.80.AnyT UOPS_EXECUTED_PORT.PORT_7_CORE

# Resource-related stall cycles
A2.01 RESOURCE_STALLS.ANY

# Cycles stalled due to no eligible RS entry available.
A2.04 RESOURCE_STALLS.RS

# Cycles stalled due to no store buffers available. (not including draining from sync).
A2.08 RESOURCE_STALLS.SB

# Cycles stalled due to re-order buffer full.
A2.10 RESOURCE_STALLS.ROB

# Cycles while L2 cache miss demand load is outstanding.
A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_MISS

# Cycles while L2 cache miss demand load is outstanding.
A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_PENDING

# Cycles while memory subsystem has an outstanding load.
A3.02.CMSK=2 CYCLE_ACTIVITY.CYCLES_LDM_PENDING

# Cycles while memory subsystem has an outstanding load.
A3.02.CMSK=2 CYCLE_ACTIVITY.CYCLES_MEM_ANY

# This event increments by 1 for every cycle where there was no execute for this thread.
A3.04.CMSK=4 CYCLE_ACTIVITY.CYCLES_NO_EXECUTE

# Total execution stalls.
A3.04.CMSK=4 CYCLE_ACTIVITY.STALLS_TOTAL

# Execution stalls while L2 cache miss demand load is outstanding.
A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_MISS

# Execution stalls while L2 cache miss demand load is outstanding.
A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_PENDING

# Execution stalls while memory subsystem has an outstanding load.
A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_LDM_PENDING

# Execution stalls while memory subsystem has an outstanding load.
A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_MEM_ANY

# Cycles while L1 cache miss demand load is outstanding.
A3.08.CMSK=8.CTR=2 CYCLE_ACTIVITY.CYCLES_L1D_MISS

# Cycles while L1 cache miss demand load is outstanding.
A3.08.CMSK=8.CTR=2 CYCLE_ACTIVITY.CYCLES_L1D_PENDING

# Execution stalls while L1 cache miss demand load is outstanding.
A3.0C.CMSK=12.CTR=2 CYCLE_ACTIVITY.STALLS_L1D_MISS

# Execution stalls while L1 cache miss demand load is outstanding.
A3.0C.CMSK=12.CTR=2 CYCLE_ACTIVITY.STALLS_L1D_PENDING

# Number of Uops delivered by the LSD.
A8.01 LSD.UOPS

# Cycles Uops delivered by the LSD, but didn't come from the decoder.
A8.01.CMSK=1 LSD.CYCLES_ACTIVE

# Cycles 4 Uops delivered by the LSD, but didn't come from the decoder.
A8.01.CMSK=4 LSD.CYCLES_4_UOPS

# Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles.
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES

# Flushing of the Instruction TLB (ITLB) pages, includes 4k/2M/4M pages.
AE.01 ITLB.ITLB_FLUSH

# Demand Data Read requests sent to uncore
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD

# Cacheable and noncacheable code read requests
B0.02 OFFCORE_REQUESTS.DEMAND_CODE_RD

# Demand RFO requests including regular RFOs, locks, ItoM
B0.04 OFFCORE_REQUESTS.DEMAND_RFO

# Demand and prefetch data reads
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD

# Any memory transaction that reached the SQ.
B0.80 OFFCORE_REQUESTS.ALL_REQUESTS

# Counts the number of uops to be executed per-thread each cycle.
B1.01 UOPS_EXECUTED.THREAD

# Cycles where at least 1 uop was executed per-thread.
B1.01.CMSK=1 UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC

# Counts number of cycles no uops were dispatched to be executed on this thread.
B1.01.CMSK=1.INV UOPS_EXECUTED.STALL_CYCLES

# Cycles where at least 2 uops were executed per-thread.
B1.01.CMSK=2 UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC

# Cycles where at least 3 uops were executed per-thread.
B1.01.CMSK=3 UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC

# Cycles where at least 4 uops were executed per-thread.
B1.01.CMSK=4 UOPS_EXECUTED.CYCLES_GE_4_UOPS_EXEC

# Number of uops executed on the core.
B1.02 UOPS_EXECUTED.CORE

# Cycles at least 1 micro-op is executed from any thread on physical core.
B1.02.CMSK=1 UOPS_EXECUTED.CORE_CYCLES_GE_1

# Cycles at least 2 micro-ops are executed from any thread on physical core.
B1.02.CMSK=2 UOPS_EXECUTED.CORE_CYCLES_GE_2

# Cycles at least 3 micro-ops are executed from any thread on physical core.
B1.02.CMSK=3 UOPS_EXECUTED.CORE_CYCLES_GE_3

# Cycles at least 4 micro-ops are executed from any thread on physical core.
B1.02.CMSK=4 UOPS_EXECUTED.CORE_CYCLES_GE_4

# Cycles with no micro-ops executed from any thread on physical core.
B1.02.INV UOPS_EXECUTED.CORE_CYCLES_NONE

# Offcore requests buffer cannot take more entries for this thread core.
B2.01 OFFCORE_REQUESTS_BUFFER.SQ_FULL

# Number of DTLB page walker hits in the L1+FB.
BC.11 PAGE_WALKER_LOADS.DTLB_L1

# Number of DTLB page walker hits in the L2.
BC.12 PAGE_WALKER_LOADS.DTLB_L2

# Number of DTLB page walker hits in the L3 + XSNP.
BC.14 PAGE_WALKER_LOADS.DTLB_L3

# Number of DTLB page walker hits in Memory.
BC.18 PAGE_WALKER_LOADS.DTLB_MEMORY

# Number of ITLB page walker hits in the L1+FB.
BC.21 PAGE_WALKER_LOADS.ITLB_L1

# Number of ITLB page walker hits in the L2.
BC.22 PAGE_WALKER_LOADS.ITLB_L2

# Number of ITLB page walker hits in the L3 + XSNP.
BC.24 PAGE_WALKER_LOADS.ITLB_L3

# DTLB flush attempts of the thread-specific entries
BD.01 TLB_FLUSH.DTLB_THREAD

# STLB flush attempts
BD.20 TLB_FLUSH.STLB_ANY

# Number of instructions retired. General Counter - architectural event
C0.00 INST_RETIRED.ANY_P

# Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution
C0.01.CTR=1 INST_RETIRED.PREC_DIST

# FP operations retired. X87 FP operations that have no exceptions.
C0.02 INST_RETIRED.X87

# Number of transitions from AVX-256 to legacy SSE when penalty applicable.
C1.08 OTHER_ASSISTS.AVX_TO_SSE

# Number of transitions from SSE to AVX-256 when penalty applicable.
C1.10 OTHER_ASSISTS.SSE_TO_AVX

# Number of times any microcode assist is invoked by HW upon uop writeback.
C1.40 OTHER_ASSISTS.ANY_WB_ASSIST

# Actually retired uops.
C2.01 UOPS_RETIRED.ALL

# Cycles without actually retired uops.
C2.01.CMSK=1.INV UOPS_RETIRED.STALL_CYCLES

# Cycles with less than 10 actually retired uops.
C2.01.CMSK=10.INV UOPS_RETIRED.TOTAL_CYCLES

# Retirement slots used.
C2.02 UOPS_RETIRED.RETIRE_SLOTS

# Cycles there was a Nuke. Account for both thread-specific and All Thread Nukes.
C3.01 MACHINE_CLEARS.CYCLES

# Number of machine clears (nukes) of any type.
C3.01.CMSK=1.EDG MACHINE_CLEARS.COUNT

# Counts the number of machine clears due to memory order conflicts.
C3.02 MACHINE_CLEARS.MEMORY_ORDERING

# Self-modifying code (SMC) detected.
C3.04 MACHINE_CLEARS.SMC

# This event counts the number of executed Intel AVX masked load operations that refer to an illegal address range with the mask bits set to 0.
C3.20 MACHINE_CLEARS.MASKMOV

# All (macro) branch instructions retired.
C4.00 BR_INST_RETIRED.ALL_BRANCHES

# Conditional branch instructions retired.
C4.01 BR_INST_RETIRED.CONDITIONAL

# Direct and indirect near call instructions retired.
C4.02 BR_INST_RETIRED.NEAR_CALL

# Direct and indirect macro near call instructions retired (captured in ring 3).
C4.02 BR_INST_RETIRED.NEAR_CALL_R3

# All (macro) branch instructions retired. (Precise Event - PEBS)
C4.04 BR_INST_RETIRED.ALL_BRANCHES_PEBS

# Return instructions retired.
C4.08 BR_INST_RETIRED.NEAR_RETURN

# Not taken branch instructions retired.
C4.10 BR_INST_RETIRED.NOT_TAKEN

# Taken branch instructions retired.
C4.20 BR_INST_RETIRED.NEAR_TAKEN

# Far branch instructions retired.
C4.40 BR_INST_RETIRED.FAR_BRANCH

# All mispredicted macro branch instructions retired.
C5.00 BR_MISP_RETIRED.ALL_BRANCHES

# Mispredicted conditional branch instructions retired.
C5.01 BR_MISP_RETIRED.CONDITIONAL

# Mispredicted macro branch instructions retired. (Precise Event - PEBS)
C5.04 BR_MISP_RETIRED.ALL_BRANCHES_PEBS

# This event counts the number of mispredicted ret instructions retired. Non PEBS
C5.08 BR_MISP_RETIRED.RET

# Number of near branch instructions retired that were mispredicted and taken.
C5.20 BR_MISP_RETIRED.NEAR_TAKEN

# Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE

# Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE

# Number of SSE/AVX computational scalar floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computation operation.   Applies to SSE* and AVX* scalar double and single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.03 FP_ARITH_INST_RETIRED.SCALAR

# Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 2 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE

# Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 4 calculations per element.
C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE

# Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 4 calculations per element.
C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE

# Number of SSE/AVX computational double precision floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.15 FP_ARITH_INST_RETIRED.DOUBLE

# Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 8 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 8 calculations per element.
C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE

# Number of SSE/AVX computational single precision floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar and packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.2A FP_ARITH_INST_RETIRED.SINGLE

# Number of SSE/AVX computational packed floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* packed double and single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
C7.3C FP_ARITH_INST_RETIRED.PACKED

# Number of times we entered an HLE region; does not count nested transactions
C8.01 HLE_RETIRED.START

# Number of times HLE commit succeeded
C8.02 HLE_RETIRED.COMMIT

# Number of times HLE abort was triggered
C8.04 HLE_RETIRED.ABORTED

# Number of times an HLE execution aborted due to various memory events (e.g., read/write capacity and conflicts).
C8.08 HLE_RETIRED.ABORTED_MISC1

# Number of times an HLE execution aborted due to uncommon conditions
C8.10 HLE_RETIRED.ABORTED_MISC2

# Number of times an HLE execution aborted due to HLE-unfriendly instructions
C8.20 HLE_RETIRED.ABORTED_MISC3

# Number of times an HLE execution aborted due to incompatible memory type
C8.40 HLE_RETIRED.ABORTED_MISC4

# Number of times an HLE execution aborted due to none of the previous 4 categories (e.g. interrupts)
C8.80 HLE_RETIRED.ABORTED_MISC5

# Number of times we entered an RTM region; does not count nested transactions
C9.01 RTM_RETIRED.START

# Number of times RTM commit succeeded
C9.02 RTM_RETIRED.COMMIT

# Number of times RTM abort was triggered
C9.04 RTM_RETIRED.ABORTED

# Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts)
C9.08 RTM_RETIRED.ABORTED_MISC1

# Number of times an RTM execution aborted due to uncommon conditions
C9.10 RTM_RETIRED.ABORTED_MISC2

# Number of times an RTM execution aborted due to HLE-unfriendly instructions
C9.20 RTM_RETIRED.ABORTED_MISC3

# Number of times an RTM execution aborted due to incompatible memory type
C9.40 RTM_RETIRED.ABORTED_MISC4

# Number of times an RTM execution aborted due to none of the previous 4 categories (e.g. interrupts)
C9.80 RTM_RETIRED.ABORTED_MISC5

# Number of X87 assists due to output value.
CA.02 FP_ASSIST.X87_OUTPUT

# Number of X87 assists due to input value.
CA.04 FP_ASSIST.X87_INPUT

# Number of SIMD FP assists due to output values
CA.08 FP_ASSIST.SIMD_OUTPUT

# Number of SIMD FP assists due to input values
CA.10 FP_ASSIST.SIMD_INPUT

# Cycles with any input/output SSE or FP assist
CA.1E.CMSK=1 FP_ASSIST.ANY

# Counts cases of saving a new LBR record
CC.20 ROB_MISC_EVENTS.LBR_INSERTS

# Randomly selected loads with latency value being above 16
CD.01.MSR_3F6H=0x10.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16

# Randomly selected loads with latency value being above 256
CD.01.MSR_3F6H=0x100.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256

# Randomly selected loads with latency value being above 32
CD.01.MSR_3F6H=0x20.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32

# Randomly selected loads with latency value being above 512
CD.01.MSR_3F6H=0x200.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512

# Randomly selected loads with latency value being above 4
CD.01.MSR_3F6H=0x4.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4

# Randomly selected loads with latency value being above 64
CD.01.MSR_3F6H=0x40.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64

# Randomly selected loads with latency value being above 8
CD.01.MSR_3F6H=0x8.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8

# Randomly selected loads with latency value being above 128
CD.01.MSR_3F6H=0x80.CTR=3.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128

# Retired load uops that miss the STLB.
D0.11 MEM_UOPS_RETIRED.STLB_MISS_LOADS

# Retired store uops that miss the STLB.
D0.12 MEM_UOPS_RETIRED.STLB_MISS_STORES

# Retired load uops with locked access.
D0.21 MEM_UOPS_RETIRED.LOCK_LOADS

# Retired load uops that split across a cacheline boundary.
D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS

# Retired store uops that split across a cacheline boundary.
D0.42 MEM_UOPS_RETIRED.SPLIT_STORES

# All retired load uops.
D0.81 MEM_UOPS_RETIRED.ALL_LOADS

# All retired store uops.
D0.82 MEM_UOPS_RETIRED.ALL_STORES

# Retired load uops with L1 cache hits as data sources.
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT

# Retired load uops with L2 cache hits as data sources.
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT

# Retired load uops whose data sources were hits in L3 without snoops required.
D1.04 MEM_LOAD_UOPS_RETIRED.L3_HIT

# Retired load uops that missed in the L1 cache.
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS

# Miss in mid-level (L2) cache. Excludes Unknown data-source.
D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS

# Miss in last-level (L3) cache. Excludes Unknown data-source.
D1.20 MEM_LOAD_UOPS_RETIRED.L3_MISS

# Retired load uops that missed L1 but hit the line fill buffer (LFB) due to a preceding miss to the same cache line whose data was not yet ready.
D1.40 MEM_LOAD_UOPS_RETIRED.HIT_LFB

# Retired load uops whose data source was an L3 hit where the cross-core snoop missed in the on-package core cache.
D2.01 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS

# Retired load uops whose data source was an L3 hit where the cross-core snoop hit in the on-package core cache.
D2.02 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT

# Retired load uops whose data sources were HitM responses from the shared L3.
D2.04 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM

# Retired load uops whose data sources were hits in L3 without snoops required.
D2.08 MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_NONE

# Data from local DRAM; either snoop not needed or snoop miss (RspI)
D3.01 MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM

# Counts the total number of times the front end is resteered, mainly when the BPU cannot provide a correct prediction and this is corrected by other branch handling mechanisms at the front end.
E6.1F BACLEARS.ANY

# Demand Data Read requests that access L2 cache
F0.01 L2_TRANS.DEMAND_DATA_RD

# RFO requests that access L2 cache
F0.02 L2_TRANS.RFO

# L2 cache accesses when fetching instructions
F0.04 L2_TRANS.CODE_RD

# L2 or L3 HW prefetches that access L2 cache
F0.08 L2_TRANS.ALL_PF

# L1D writebacks that access L2 cache
F0.10 L2_TRANS.L1D_WB

# L2 fill requests that access L2 cache
F0.20 L2_TRANS.L2_FILL

# L2 writebacks that access L2 cache
F0.40 L2_TRANS.L2_WB

# Transactions accessing L2 pipe
F0.80 L2_TRANS.ALL_REQUESTS

# L2 cache lines in I state filling L2
F1.01 L2_LINES_IN.I

# L2 cache lines in S state filling L2
F1.02 L2_LINES_IN.S

# L2 cache lines in E state filling L2
F1.04 L2_LINES_IN.E

# L2 cache lines filling L2
F1.07 L2_LINES_IN.ALL

# Clean L2 cache lines evicted by demand.
F2.05 L2_LINES_OUT.DEMAND_CLEAN

# Split locks in SQ
F4.10 SQ_MISC.SPLIT_LOCK