configs for EMR, MTL, and ARL

2025-07-21 15:11:03 +02:00 · 2025-05-18 16:22:49 +02:00
parent c7bce4bebc
commit 08e351b7fb
16 changed files with 4188 additions and 5 deletions
--- a/configs/cfg_ArrowLakeE_Skymont_all_core.txt
+++ b/configs/cfg_ArrowLakeE_Skymont_all_core.txt
@@ -0,0 +1,485 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/ARL/events/arrowlake_skymont_core.json (Version: 1.09)
+# Applies to processors with family-model in {6-C5, 6-C6}
+
+# Fixed Counter: Counts the number of retirement slots not consumed due to front end stalls.
+00.06.CTR=37 TOPDOWN_FE_BOUND.ALL
+
+# Fixed Counter: Counts the number of consumed retirement slots.
+00.07.CTR=38 TOPDOWN_RETIRING.ALL
+
+# Counts the number of occurrences a retired load gets blocked because its address exactly matches an older store whose data is not ready (a.k.a. unknown).  unready_fwd
+03.01 LD_BLOCKS.DATA_UNKNOWN
+
+# Counts the number of occurrences a retired load gets blocked because its address partially overlaps with an older store (size mismatch) - unknown_sta/bad_forward
+03.02 LD_BLOCKS.STORE_FORWARD
+
+# Counts the number of retired loads that are blocked because they initially appear to be store forward blocked, but are subsequently shown not to be blocked based on the 4K alias check.
+03.04 LD_BLOCKS.ADDRESS_ALIAS
+
+# Counts the number of cycles that uops are blocked due to store buffer full
+04.01 MEM_SCHEDULER_BLOCK.ST_BUF
+
+# Counts the number of cycles that uops are blocked due to load buffer full
+04.02 MEM_SCHEDULER_BLOCK.LD_BUF
+
+# Counts the number of cycles that uops are blocked due to RSV full
+04.04 MEM_SCHEDULER_BLOCK.RSV
+
+# Counts the number of cycles that uops are blocked for any of the following reasons:  load buffer, store buffer or RSV full.
+04.07 MEM_SCHEDULER_BLOCK.ALL
+
+# Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a DL1 miss.
+05.01 LD_HEAD.L1_MISS
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DL1 miss.
+05.81 LD_HEAD.L1_MISS_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to request buffers full or lock in progress.
+05.82 LD_HEAD.WCB_FULL_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a store address match.
+05.84 LD_HEAD.ST_ADDR_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.
+05.90 LD_HEAD.DTLB_MISS_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a pagewalk.
+05.A0 LD_HEAD.PGWALK_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases.
+05.C0 LD_HEAD.OTHER_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.
+05.F4 LD_HEAD.L1_BOUND_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to any number of reasons, including an L1 miss, WCB full, pagewalk, store address block or store data block, on a load that retires.
+05.FF LD_HEAD.ANY_AT_RET
+
+# Counts the number of page walks initiated by a demand load that missed the first and second level TLBs.
+08.01 DTLB_LOAD_MISSES.MISS_CAUSED_WALK
+
+# Counts the number of page walks completed due to load DTLB misses to a 4K page.
+08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
+
+# Counts the number of page walks completed due to load DTLB misses to a 2M or 4M page.
+08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
+
+# Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle.
+08.10 DTLB_LOAD_MISSES.WALK_PENDING
+
+# Counts the number of first level TLB misses but second level hits due to a demand load that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.
+08.20 DTLB_LOAD_MISSES.STLB_HIT
+
+# When 4-uops are requested and only 2-uops are delivered, the event counts 2.  Uops_issued correlates to the number of ROB entries.  If uop takes 2 ROB slots it counts as 2 uops_issued.
+0E.00 UOPS_ISSUED.ANY
+
+# Counts misaligned loads that are 4K page splits.
+13.02 MISALIGN_MEM_REF.LOAD_PAGE_SPLIT
+
+# Counts misaligned stores that are 4K page splits.
+13.04 MISALIGN_MEM_REF.STORE_PAGE_SPLIT
+
+# Counts the number of demand and prefetch transactions that the External Queue (XQ) rejects due to a full or near full condition.
+30.00 L2_REJECT_XQ.ANY
+
+# Counts the number of requests that were not accepted into the L2Q because the L2Q is FULL.
+31.00 CORE_REJECT_L2Q.ANY
+
+# Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.
+34.01 MEM_BOUND_STALLS_LOAD.L2_HIT
+
+# Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC.
+34.06 MEM_BOUND_STALLS_LOAD.LLC_HIT
+
+# Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches.
+34.78 MEM_BOUND_STALLS_LOAD.LLC_MISS
+
+# Counts the number of cycles the core is stalled due to a demand load which missed in the L2 cache.
+34.7E MEM_BOUND_STALLS_LOAD.L2_MISS
+
+# Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.
+34.7F MEM_BOUND_STALLS_LOAD.ALL
+
+# Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.
+35.01 MEM_BOUND_STALLS_IFETCH.L2_HIT
+
+# Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which hit in the LLC.
+35.06 MEM_BOUND_STALLS_IFETCH.LLC_HIT
+
+# Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which missed in the L2 cache.
+35.7E MEM_BOUND_STALLS_IFETCH.L2_MISS
+
+# Counts the number of cycles the core is stalled due to an instruction cache or TLB miss.
+35.7F MEM_BOUND_STALLS_IFETCH.ALL
+
+# Counts the number of unhalted core clock cycles [This event is alias to CPU_CLK_UNHALTED.THREAD_P]
+3C.00 CPU_CLK_UNHALTED.CORE_P
+
+# Counts the number of unhalted core clock cycles [This event is alias to CPU_CLK_UNHALTED.CORE_P]
+3C.00 CPU_CLK_UNHALTED.THREAD_P
+
+# Counts the number of unhalted reference clock cycles
+3C.01 CPU_CLK_UNHALTED.REF_TSC_P
+
+# Counts the number of page walks initiated by a store that missed the first and second level TLBs.
+49.01 DTLB_STORE_MISSES.MISS_CAUSED_WALK
+
+# Counts the number of page walks completed due to store DTLB misses to a 4K page.
+49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
+
+# Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle.
+49.10 DTLB_STORE_MISSES.WALK_PENDING
+
+# Counts the number of first level TLB misses but second level hits due to stores that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.
+49.20 DTLB_STORE_MISSES.STLB_HIT
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to ms
+71.01 TOPDOWN_FE_BOUND.CISC
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear
+71.02 TOPDOWN_FE_BOUND.BRANCH_DETECT
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to predecode wrong
+71.04 TOPDOWN_FE_BOUND.PREDECODE
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stall
+71.08 TOPDOWN_FE_BOUND.DECODE
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to itlb miss
+71.10 TOPDOWN_FE_BOUND.ITLB_MISS
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to an icache miss
+71.20 TOPDOWN_FE_BOUND.ICACHE
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to BTClear
+71.40 TOPDOWN_FE_BOUND.BRANCH_RESTEER
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.
+71.72 TOPDOWN_FE_BOUND.FRONTEND_LATENCY
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend that do not categorize into any other common frontend stall
+71.80 TOPDOWN_FE_BOUND.OTHER
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.
+71.8D TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH
+
+# Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.
+73.00 TOPDOWN_BAD_SPECULATION.ALL_P
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).
+73.01 TOPDOWN_BAD_SPECULATION.NUKE
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to Fast Nukes such as Memory Ordering Machine clears and MRN nukes
+73.02 TOPDOWN_BAD_SPECULATION.FASTNUKE
+
+# Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.
+73.03 TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to Branch Mispredict
+73.04 TOPDOWN_BAD_SPECULATION.MISPREDICT
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to certain allocation restrictions
+74.01 TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop).  This could be caused by RSV full or load/store buffer block.
+74.02 TOPDOWN_BE_BOUND.MEM_SCHEDULER
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to IEC and FPC RAT stalls - which can be due to the FIQ and IEC reservation station stall (integer, FP and SIMD scheduler not being able to accept another uop).
+74.08 TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb
+74.10 TOPDOWN_BE_BOUND.SERIALIZATION
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to mrbl stall.  A 'marble' refers to a physical register file entry, also known as the physical destination (PDST).
+74.20 TOPDOWN_BE_BOUND.REGISTER
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to ROB full
+74.40 TOPDOWN_BE_BOUND.REORDER_BUFFER
+
+# Counts the number of issue slots where no uop could issue due to an IQ scoreboard that stalls allocation until a specified older uop retires or (in the case of jump scoreboard) executes. Commonly executed instructions with IQ scoreboards include LFENCE and MFENCE.
+75.01 SERIALIZATION.IQ_JEU_SCB
+
+# Counts the number of issue slots not consumed by the backend due to a micro-sequencer (MS) scoreboard, which stalls the front-end from issuing from the UROM until a specified older uop retires.
+75.02 SERIALIZATION.NON_C01_MS_SCB
+
+# Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.
+75.04 SERIALIZATION.C01_MS_SCB
+
+# Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump and the instruction cache registers bytes are present.
+80.01 ICACHE.HIT
+
+# Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump and the instruction cache registers bytes are not present.
+80.02 ICACHE.MISSES
+
+# Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.
+80.03 ICACHE.ACCESSES
+
+# Counts the number of page walks initiated by an instruction fetch that missed the first and second level TLBs.
+85.01 ITLB_MISSES.MISS_CAUSED_WALK
+
+# Counts the number of page walks completed due to instruction fetch misses to a 4K page.
+85.02 ITLB_MISSES.WALK_COMPLETED_4K
+
+# Counts the number of page walks completed due to instruction fetch misses to a 2M or 4M page.
+85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
+
+# Counts the number of page walks completed due to instruction fetch misses to any page size.
+85.0E ITLB_MISSES.WALK_COMPLETED
+
+# Counts the number of page walks outstanding for iside in PMH every cycle.
+85.10 ITLB_MISSES.WALK_PENDING
+
+# Counts the number of first level TLB misses but second level hits due to an instruction fetch that did not start a page walk. Accounts for all page sizes. Will result in an ITLB write from STLB.
+85.20 ITLB_MISSES.STLB_HIT
+
+# Counts the number of retirement slots not consumed due to front end stalls
+9C.01 TOPDOWN_FE_BOUND.ALL_P
+
+# Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]
+A4.02 TOPDOWN_BE_BOUND.ALL
+
+# Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]
+A4.02 TOPDOWN_BE_BOUND.ALL_P
+
+# Counts the number of uops executed on floating point and vector integer store data port.
+B2.01 FP_VINT_UOPS_EXECUTED.STD
+
+# Counts the number of uops executed on floating point and vector integer port 0, 1, 2, 3.
+B2.1E FP_VINT_UOPS_EXECUTED.PRIMARY
+
+# Counts the number of uops executed on a load port.
+B3.01 INT_UOPS_EXECUTED.LD
+
+# Counts the number of uops executed on a Store address port.
+B3.02 INT_UOPS_EXECUTED.STA
+
+# Counts the number of uops executed on an integer store data and jump port.
+B3.04 INT_UOPS_EXECUTED.STD_JMP
+
+# Counts the number of uops executed on integer ports 0, 1, 2, 3.
+B3.78 INT_UOPS_EXECUTED.PRIMARY
+
+# Counts the number of uops executed on secondary integer ports 0,1,2,3.
+B3.80 INT_UOPS_EXECUTED.2ND
+
+# Counts the number of instructions retired
+C0.00 INST_RETIRED.ANY_P
+
+# Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS).  This includes uops from flows due to complex instructions, faults, assists, and inserted flows.
+C2.01 UOPS_RETIRED.MS
+
+# Counts the number of consumed retirement slots.
+C2.02 TOPDOWN_RETIRING.ALL_P
+
+# Counts the number of floating point divide uops retired (x87 and sse, including x87 sqrt)
+C2.08 UOPS_RETIRED.FPDIV
+
+# Counts the number of integer divide uops retired
+C2.10 UOPS_RETIRED.IDIV
+
+# Counts the number of x87 uops retired, includes those in ms flows
+C2.20 UOPS_RETIRED.X87
+
+# Counts all machine clears for any reason including, but not limited to memory ordering, SMC, and FP assist.
+C3.00 MACHINE_CLEARS.ANY
+
+# Counts the number of memory ordering machine clears triggered due to a snoop from an external agent. Does not count internally generated machine clears such as those due to disambiguations.
+C3.02 MACHINE_CLEARS.MEMORY_ORDERING
+
+# Counts the number of floating point operations retired that required microcode assist.
+C3.04 MACHINE_CLEARS.FP_ASSIST
+
+# Counts the number of memory ordering machine clears triggered due to an internal load passing an older store within the same CPU.
+C3.08 MACHINE_CLEARS.DISAMBIGUATION
+
+# Counts the number of nukes due to memory renaming
+C3.10 MACHINE_CLEARS.MRN_NUKE
+
+# Counts the number of times that the machine clears due to a page fault.  Covers both I-Side and D-Side (Loads/Stores) page faults.  A page fault occurs when either the page is not present, or an access violation.
+C3.20 MACHINE_CLEARS.PAGE_FAULT
+
+# Counts the total number of branch instructions retired for all branch types.
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+
+# Counts the number of JCC (Jump on Conditional Code) branch instructions retired, including both taken and not taken branches
+C4.7E BR_INST_RETIRED.COND
+
+# Counts the number of far branch instructions retired, includes far jump, far call and return, and Interrupt call and return
+C4.BF BR_INST_RETIRED.FAR_BRANCH
+
+# Counts the number of near indirect JMP and near indirect CALL branch instructions retired
+C4.EB BR_INST_RETIRED.INDIRECT
+
+# Counts the number of near RET branch instructions retired
+C4.F7 BR_INST_RETIRED.NEAR_RETURN
+
+# Counts the number of near CALL branch instructions retired
+C4.F9 BR_INST_RETIRED.NEAR_CALL
+
+# Counts the number of near indirect CALL branch instructions retired
+C4.FB BR_INST_RETIRED.INDIRECT_CALL
+
+# Counts the number of near relative CALL branch instructions retired
+C4.FD BR_INST_RETIRED.REL_CALL
+
+# Counts the number of taken JCC branch instructions retired
+C4.FE BR_INST_RETIRED.COND_TAKEN
+
+# Counts the total number of mispredicted branch instructions retired for all branch types.
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+
+# Counts the number of mispredicted JCC branch instructions retired
+C5.7E BR_MISP_RETIRED.COND
+
+# Counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired
+C5.EB BR_MISP_RETIRED.INDIRECT
+
+# Counts the number of mispredicted near RET branch instructions retired
+C5.F7 BR_MISP_RETIRED.RETURN
+
+# Counts the number of mispredicted near indirect CALL branch instructions retired
+C5.FB BR_MISP_RETIRED.INDIRECT_CALL
+
+# Counts the number of mispredicted taken JCC branch instructions retired
+C5.FE BR_MISP_RETIRED.COND_TAKEN
+
+# Counts the number of instructions retired that were tagged with having preceded with frontend bound behavior
+C6.00 FRONTEND_RETIRED.ALL
+
+# Counts the number of instructions retired that were tagged following an ms flow due to the bubble/wasted issue slot from exiting long ms flow
+C6.01 FRONTEND_RETIRED.CISC
+
+# Counts the number of instructions retired that are tagged after a branch instruction causes bubbles/empty issue slots due to a baclear
+C6.02 FRONTEND_RETIRED.BRANCH_DETECT
+
+# Counts the number of instructions retired that are tagged after a branch instruction causes bubbles/empty issue slots due to a predecode wrong
+C6.04 FRONTEND_RETIRED.PREDECODE
+
+# Counts the number of instructions retired that were tagged every cycle the decoder is unable to send 4 uops
+C6.08 FRONTEND_RETIRED.DECODE
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss
+C6.10 FRONTEND_RETIRED.ITLB_MISS
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to icache miss
+C6.20 FRONTEND_RETIRED.ICACHE
+
+# Counts the number of instructions retired that are tagged after a branch instruction causes bubbles/empty issue slots due to a btclear
+C6.40 FRONTEND_RETIRED.BRANCH_RESTEER
+
+# Counts the number of instructions retired that are tagged after a wasted issue slot if none of the previous events occurred
+C6.80 FRONTEND_RETIRED.OTHER
+
+# Counts the number of retired instructions whose sources are a scalar 32bit single precision floating point
+C7.01 FP_INST_RETIRED.32B_SP
+
+# Counts the number of retired instructions whose sources are a scalar 64 bit double precision floating point
+C7.02 FP_INST_RETIRED.64B_DP
+
+# Counts the number of retired instructions whose sources are a packed 128 bit single precision floating point. This may be SSE or AVX.128 operations.
+C7.04 FP_INST_RETIRED.128B_SP
+
+# Counts the number of retired instructions whose sources are a packed 128 bit double precision floating point. This may be SSE or AVX.128 operations.
+C7.08 FP_INST_RETIRED.128B_DP
+
+# Counts the number of retired instructions whose sources are a packed 256 bit single precision floating point.
+C7.10 FP_INST_RETIRED.256B_SP
+
+# Counts the number of retired instructions whose sources are a packed 256 bit double precision floating point.
+C7.20 FP_INST_RETIRED.256B_DP
+
+# Counts the total number of floating point instructions retired.
+C7.3F FP_INST_RETIRED.ALL
+
+# Counts the number of floating point operations that produce 64 bit double precision results
+C8.01 FP_FLOPS_RETIRED.FP64
+
+# Counts the number of floating point operations that produce 32 bit single precision results
+C8.02 FP_FLOPS_RETIRED.FP32
+
+# Counts the number of all types of floating point operations per uop with all default weighting
+C8.03 FP_FLOPS_RETIRED.ALL
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to Instruction L1 cache miss, that hit in the L2 cache.
+C9.01 FRONTEND_RETIRED_SOURCE.ICACHE_L2_HIT
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to Instruction L1 cache miss, that hit in the L3 cache.
+C9.06 FRONTEND_RETIRED_SOURCE.ICACHE_L3_HIT
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to Instruction L1 cache miss, that missed in the L2 cache.
+C9.0E FRONTEND_RETIRED_SOURCE.ICACHE_L2_MISS
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss that hit in the second level TLB.
+C9.10 FRONTEND_RETIRED_SOURCE.ITLB_STLB_HIT
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss that also missed the second level TLB.
+C9.20 FRONTEND_RETIRED_SOURCE.ITLB_STLB_MISS
+
+# Counts the number of cycles when any of the floating point or integer dividers are active.
+CD.03.CMSK=1 ARITH.DIV_ACTIVE
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled
+D0.05.MSR_3F6H=0x10.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled
+D0.05.MSR_3F6H=0x100.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled
+D0.05.MSR_3F6H=0x20.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled
+D0.05.MSR_3F6H=0x200.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled
+D0.05.MSR_3F6H=0x4.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled
+D0.05.MSR_3F6H=0x40.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled
+D0.05.MSR_3F6H=0x8.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled
+D0.05.MSR_3F6H=0x80.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128
+
+# Counts the number of store uops retired. Same as MEM_UOPS_RETIRED.ALL_STORES
+D0.06 MEM_UOPS_RETIRED.STORE_LATENCY
+
+# Counts the number of load uops retired that performed one or more locks
+D0.21 MEM_UOPS_RETIRED.LOCK_LOADS
+
+# Counts the number of retired split load uops.
+D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS
+
+# Counts the number of retired split store uops.
+D0.42 MEM_UOPS_RETIRED.SPLIT_STORES
+
+# Counts the number of memory uops retired that were splits.
+D0.43 MEM_UOPS_RETIRED.SPLIT
+
+# Counts the number of load uops retired.
+D0.81 MEM_UOPS_RETIRED.ALL_LOADS
+
+# Counts the number of store uops retired.
+D0.82 MEM_UOPS_RETIRED.ALL_STORES
+
+# Counts the number of load ops retired that hit the L1 data cache
+D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
+
+# Counts the number of load ops retired that hit in the L2 cache
+D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
+
+# Counts the number of load ops retired that hit in the L3 cache.
+D1.1C MEM_LOAD_UOPS_RETIRED.L3_HIT
+
+# Counts the number of loads that hit in a write combining buffer (WCB), excluding the first load that caused the WCB to allocate.
+D1.20 MEM_LOAD_UOPS_RETIRED.WCB_HIT
+
+# Counts the number of load ops retired that miss in the L1 data cache
+D1.40 MEM_LOAD_UOPS_RETIRED.L1_MISS
+
+# Counts the number of load ops retired that miss in the L2 cache
+D1.80 MEM_LOAD_UOPS_RETIRED.L2_MISS
+
+# Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.
+E6.01 BACLEARS.ANY
--- a/configs/cfg_ArrowLakeE_Skymont_all_offcore.txt
+++ b/configs/cfg_ArrowLakeE_Skymont_all_offcore.txt
@@ -0,0 +1,11 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/ARL/events/arrowlake_skymont_core.json (Version: 1.09)
+# Applies to processors with family-model in {6-C5, 6-C6}
+
+# Counts streaming stores that have any type of response.
+B7.01.MSR_RSP0=0x10800.TakenAlone OCR.STREAMING_WR.ANY_RESPONSE
+
+# Counts streaming stores which modify only part of a 64 byte cacheline that have any type of response.
+B7.01.MSR_RSP0=0x400000010000.TakenAlone OCR.PARTIAL_STREAMING_WR.ANY_RESPONSE
+
+# Counts streaming stores which modify a full 64 byte cacheline that have any type of response.
+B7.01.MSR_RSP0=0x800000010000.TakenAlone OCR.FULL_STREAMING_WR.ANY_RESPONSE
--- a/configs/cfg_ArrowLakeE_Skymont_common.txt
+++ b/configs/cfg_ArrowLakeE_Skymont_common.txt
@@ -0,0 +1,21 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/ARL/events/arrowlake_skymont_core.json (Version: 1.09)
+# Applies to processors with family-model in {6-C5, 6-C6}
+
+3C.00 CORE_CYCLES
+C0.00 INST_RETIRED
+C2.00 UOPS_RETIRED.ALL
+C2.01 UOPS_RETIRED.MS
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+B2.01 FP_VINT_UOPS_EXECUTED.STD
+B2.1E FP_VINT_UOPS_EXECUTED.PRIMARY
+B3.01 INT_UOPS_EXECUTED.LD
+B3.02 INT_UOPS_EXECUTED.STA
+B3.04 INT_UOPS_EXECUTED.STD_JMP
+B3.78 INT_UOPS_EXECUTED.PRIMARY
+B3.80 INT_UOPS_EXECUTED.2ND
+D0.81 MEM_UOPS_RETIRED.ALL_LOADS
+D0.82 MEM_UOPS_RETIRED.ALL_STORES
+D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
+D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
+D1.1C MEM_LOAD_UOPS_RETIRED.L3_HIT
--- a/configs/cfg_ArrowLakeP_LionCove_all_core.txt
+++ b/configs/cfg_ArrowLakeP_LionCove_all_core.txt
@@ -0,0 +1,875 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/ARL/events/arrowlake_lioncove_core.json (Version: 1.09)
+# Applies to processors with family-model in {6-C5, 6-C6}
+
+# Counts the number of times a load depends on another load that has just written back its data, or did so in the previous cycle or 2 cycles back. This event supports indirect dependency through a single uop.
+02.07 DEPENDENT_LOADS.ANY
+
+# False dependencies in MOB due to partial compare on address.
+03.04 LD_BLOCKS.ADDRESS_ALIAS
+
+# Loads blocked due to overlapping with a preceding store that cannot be forwarded.
+03.82 LD_BLOCKS.STORE_FORWARD
+
+# The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.
+03.88 LD_BLOCKS.NO_SR
+
+# Code miss in all TLB levels causes a page walk that completes. (4K)
+11.02 ITLB_MISSES.WALK_COMPLETED_4K
+
+# Code miss in all TLB levels causes a page walk that completes. (2M/4M)
+11.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
+
+# Code miss in all TLB levels causes a page walk that completes. (All page sizes)
+11.0E ITLB_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for an outstanding code request in the PMH each cycle.
+11.10 ITLB_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for code (instruction fetch) request.
+11.10.CMSK=1 ITLB_MISSES.WALK_ACTIVE
+
+# Instruction fetch requests that miss the ITLB and hit the STLB.
+11.20 ITLB_MISSES.STLB_HIT
+
+# Page walks completed due to a demand data load to a 4K page.
+12.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
+
+# Page walks completed due to a demand data load to a 2M/4M page.
+12.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walks completed due to a demand data load to a 1G page.
+12.08 DTLB_LOAD_MISSES.WALK_COMPLETED_1G
+
+# Load miss in all TLB levels causes a page walk that completes. (All page sizes)
+12.0E DTLB_LOAD_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for a demand load in the PMH each cycle.
+12.10 DTLB_LOAD_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for a demand load.
+12.10.CMSK=1 DTLB_LOAD_MISSES.WALK_ACTIVE
+
+# Loads that miss the DTLB and hit the STLB.
+12.20 DTLB_LOAD_MISSES.STLB_HIT
+
+# Page walks completed due to a demand data store to a 4K page.
+13.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
+
+# Page walks completed due to a demand data store to a 2M/4M page.
+13.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walks completed due to a demand data store to a 1G page.
+13.08 DTLB_STORE_MISSES.WALK_COMPLETED_1G
+
+# Store misses in all TLB levels causes a page walk that completes. (All page sizes)
+13.0E DTLB_STORE_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for a store in the PMH each cycle.
+13.10 DTLB_STORE_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for a store.
+13.10.CMSK=1 DTLB_STORE_MISSES.WALK_ACTIVE
+
+# Stores that miss the DTLB and hit the STLB.
+13.20 DTLB_STORE_MISSES.STLB_HIT
+
+# For every cycle, increments by the number of outstanding demand data read requests pending.
+20.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
+
+# Cycles where at least 1 outstanding demand data read request is pending.
+20.01.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD
+
+# Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle.
+20.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
+
+# Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore.
+20.02.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD
+
+# Store Read transactions pending for off-core. Highly correlated.
+20.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
+
+# Cycles with offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore.
+20.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
+
+# Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore
+20.08 OFFCORE_REQUESTS_OUTSTANDING.DATA_RD
+
+# Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore.
+20.08.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
+
+# For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache.
+20.10 OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD
+
+# Cycles where data return is pending for a Demand Data Read request that missed the L3 cache.
+20.10.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD
+
+# Demand Data Read requests sent to uncore
+21.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
+
+# Cacheable and Non-Cacheable code read requests
+21.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
+
+# Demand RFO requests including regular RFOs, locks, ItoM
+21.04 OFFCORE_REQUESTS.DEMAND_RFO
+
+# Demand and prefetch data reads
+21.08 OFFCORE_REQUESTS.DATA_RD
+
+# Counts demand data read requests that miss the L3 cache.
+21.10 OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
+
+# Any memory transaction that reached the SQ.
+21.80 OFFCORE_REQUESTS.ALL_REQUESTS
+
+# Demand Data Read miss L2 cache
+24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
+
+# RFO requests that miss L2 cache
+24.22 L2_RQSTS.RFO_MISS
+
+# L2 cache misses when fetching instructions
+24.24 L2_RQSTS.CODE_RD_MISS
+
+# Read requests with true-miss in L2 cache [This event is alias to L2_RQSTS.MISS]
+24.3F L2_REQUEST.MISS
+
+# Read requests with true-miss in L2 cache [This event is alias to L2_REQUEST.MISS]
+24.3F L2_RQSTS.MISS
+
+# Demand Data Read requests that hit L2 cache
+24.41 L2_RQSTS.DEMAND_DATA_RD_HIT
+
+# RFO requests that hit L2 cache
+24.42 L2_RQSTS.RFO_HIT
+
+# L2 cache hits when fetching instructions, code reads.
+24.44 L2_RQSTS.CODE_RD_HIT
+
+# Demand Data Read access L2 cache
+24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
+
+# L2 code requests
+24.E4 L2_RQSTS.ALL_CODE_RD
+
+# All accesses to L2 cache [This event is alias to L2_RQSTS.REFERENCES, L2_RQSTS.ANY]
+24.FF L2_REQUEST.ALL
+
+# All accesses to L2 cache [This event is alias to L2_RQSTS.REFERENCES, L2_REQUEST.ALL]
+24.FF L2_RQSTS.ANY
+
+# All accesses to L2 cache [This event is alias to L2_REQUEST.ALL,L2_RQSTS.ANY]
+24.FF L2_RQSTS.REFERENCES
+
+# L2 cache lines filling L2
+25.1F L2_LINES_IN.ALL
+
+# Non-modified cache lines that are silently dropped by L2 cache.
+26.01 L2_LINES_OUT.SILENT
+
+# Modified cache lines that are evicted by L2 cache when triggered by an L2 cache fill.
+26.02 L2_LINES_OUT.NON_SILENT
+
+# Cache lines that have been L2 hardware prefetched but not used by demand accesses
+26.04 L2_LINES_OUT.USELESS_HWPF
+
+# Counts bus locks, accounts for cache line split locks and UC locks.
+2C.10 SQ_MISC.BUS_LOCK
+
+# Cycles the uncore cannot take further requests
+2D.01.CMSK=1 XQ.FULL
+
+# Core-originated cacheable requests that missed L3  (Except hardware prefetches to the L3)
+2E.41 LONGEST_LAT_CACHE.MISS
+
+# Core-originated cacheable requests that refer to L3 (Except hardware prefetches to the L3)
+2E.4F LONGEST_LAT_CACHE.REFERENCE
+
+# Thread cycles when thread is not in halt state [This event is alias to CPU_CLK_UNHALTED.THREAD_P]
+3C.00 CPU_CLK_UNHALTED.CORE_P
+
+# Thread cycles when thread is not in halt state [This event is alias to CPU_CLK_UNHALTED.CORE_P]
+3C.00 CPU_CLK_UNHALTED.THREAD_P
+
+# Reference cycles when the core is not in halt state.
+3C.01 CPU_CLK_UNHALTED.REF_TSC_P
+
+# Number of PREFETCHNTA instructions executed.
+40.01 SW_PREFETCH_ACCESS.NTA
+
+# Number of PREFETCHT0 instructions executed.
+40.02 SW_PREFETCH_ACCESS.T0
+
+# Number of PREFETCHT1 or PREFETCHT2 instructions executed.
+40.04 SW_PREFETCH_ACCESS.T1_T2
+
+# Number of PREFETCHW instructions executed.
+40.08 SW_PREFETCH_ACCESS.PREFETCHW
+
+# Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed.
+40.0F SW_PREFETCH_ACCESS.ANY
+
+# Cycles when L1D is locked
+42.02 LOCK_CYCLES.CACHE_LOCK_DURATION
+
+# MEM_STORE_RETIRED.L2_HIT
+44.01 MEM_STORE_RETIRED.L2_HIT
+
+# Counts cycles where no execution is happening due to loads waiting for L1 cache (that is: no execution & load in flight & no load missed L1 cache)
+46.01 MEMORY_STALLS.L1
+
+# Counts cycles where no execution is happening due to loads waiting for L2 cache (that is: no execution & load in flight & load missed L1 & no load missed L2 cache)
+46.02 MEMORY_STALLS.L2
+
+# Counts cycles where no execution is happening due to loads waiting for L3 cache (that is: no execution & load in flight & load missed L1 & load missed L2 cache & no load missed L3 Cache)
+46.04 MEMORY_STALLS.L3
+
+# Counts cycles where no execution is happening due to loads waiting for Memory (that is: no execution & load in flight & a load missed L3 cache)
+46.08 MEMORY_STALLS.MEM
+
+# Cycles with L1D load Misses outstanding.
+48.01.CMSK=1.CTR=2 L1D_PENDING.LOAD_CYCLES
+
+# Number of L1D misses that are outstanding
+48.01.CTR=2 L1D_PENDING.LOAD
+
+# Number of demand requests that missed L1D cache
+49.01 L1D_MISS.LOAD
+
+# Number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability.
+49.02 L1D_MISS.FB_FULL
+
+# Number of cycles a demand request has waited due to L1D due to lack of L2 resources.
+49.04 L1D_MISS.L2_STALLS
+
+# Counts the number of cache lines replaced in L0 data cache.
+51.01 L1D.L0_REPLACEMENT
+
+# Cachelines replaced into the L0 and L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case
+51.05 L1D.REPLACEMENT
+
+# Clears due to Unknown Branches.
+60.01 BACLEARS.ANY
+
+# DSB-to-MITE switch true penalty cycles.
+61.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
+
+# Instruction decoders utilized in a cycle
+75.01.CTR=2 INST_DECODED.DECODERS
+
+# Number of non dec-by-all uops decoded by decoder
+76.01 UOPS_DECODED.DEC0_UOPS
+
+# Uops delivered to Instruction Decode Queue (IDQ) from MITE path
+79.04 IDQ.MITE_UOPS
+
+# Cycles MITE is delivering any Uop
+79.04.CMSK=1 IDQ.MITE_CYCLES_ANY
+
+# Cycles MITE is delivering optimal number of Uops
+79.04.CMSK=8 IDQ.MITE_CYCLES_OK
+
+# Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path
+79.08 IDQ.DSB_UOPS
+
+# Cycles Decode Stream Buffer (DSB) is delivering any Uop
+79.08.CMSK=1 IDQ.DSB_CYCLES_ANY
+
+# Cycles DSB is delivering optimal number of Uops
+79.08.CMSK=8 IDQ.DSB_CYCLES_OK
+
+# Uops initiated by MITE or Decode Stream Buffer (DSB) and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
+79.20 IDQ.MS_UOPS
+
+# Cycles when uops are being delivered to IDQ while MS is busy
+79.20.CMSK=1 IDQ.MS_CYCLES_ANY
+
+# Number of switches from DSB or MITE to the MS
+79.20.CMSK=1.EDG IDQ.MS_SWITCHES
+
+# Cycles where a code fetch is stalled due to L1 instruction cache miss.
+80.04 ICACHE_DATA.STALLS
+
+# ICACHE_DATA.STALL_PERIODS
+80.04.CMSK=1.EDG ICACHE_DATA.STALL_PERIODS
+
+# Cycles where a code fetch is stalled due to L1 instruction cache tag miss.
+83.04 ICACHE_TAG.STALLS
+
+# Stalls caused by changing prefix length of the instruction.
+87.01 DECODE.LCP
+
+# Cycles the Microcode Sequencer is busy.
+87.02 DECODE.MS_BUSY
+
+# This event counts a subset of the Topdown Slots event that when no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.
+9C.01 IDQ_BUBBLES.CORE
+
+# Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled
+9C.01.CMSK=1.INV IDQ_BUBBLES.CYCLES_FE_WAS_OK
+
+# Cycles when no uops are delivered by the IDQ when the backend of the machine is not stalled [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]
+9C.01.CMSK=8 IDQ_BUBBLES.STARVATION_CYCLES
+
+# Cycles when no uops are delivered by the IDQ for 2 or more cycles when backend of the machine is not stalled - normally indicating a Fetch Latency issue
+9C.04 IDQ_BUBBLES.FETCH_LATENCY
+
+# Counts cycles where the pipeline is stalled due to serializing operations.
+A2.02 BE_STALLS.SCOREBOARD
+
+# Total execution stalls.
+A3.04.CMSK=4 CYCLE_ACTIVITY.STALLS_TOTAL
+
+# Cycles while memory subsystem has an outstanding load.
+A3.10.CMSK=16 CYCLE_ACTIVITY.CYCLES_MEM_ANY
+
+# TMA slots available for an unhalted logical processor. General counter - architectural event
+A4.01 TOPDOWN.SLOTS_P
+
+# This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.
+A4.02 TOPDOWN.BACKEND_BOUND_SLOTS
+
+# TMA slots wasted due to incorrect speculations.
+A4.04.CTR=0 TOPDOWN.BAD_SPEC_SLOTS
+
+# TMA slots wasted due to incorrect speculation by branch mispredictions
+A4.08.CTR=0 TOPDOWN.BR_MISPREDICT_SLOTS
+
+# TOPDOWN.MEMORY_BOUND_SLOTS
+A4.10.CTR=3 TOPDOWN.MEMORY_BOUND_SLOTS
+
+# Cycles when RS was empty and a resource allocation stall is asserted
+A5.01 RS.EMPTY_RESOURCE
+
+# Cycles when Reservation Station (RS) is empty for the thread.
+A5.07 RS.EMPTY
+
+# Counts end of periods where the Reservation Station (RS) was empty.
+A5.07.CMSK=1.EDG.INV RS.EMPTY_COUNT
+
+# Cycles total of 1 uop is executed on all ports and Reservation Station was not empty.
+A6.02 EXE_ACTIVITY.1_PORTS_UTIL
+
+# Cycles total of 2 uops are executed on all ports and Reservation Station was not empty.
+A6.04 EXE_ACTIVITY.2_PORTS_UTIL
+
+# Cycles total of 3 uops are executed on all ports and Reservation Station was not empty.
+A6.08 EXE_ACTIVITY.3_PORTS_UTIL
+
+# Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty.
+A6.0C EXE_ACTIVITY.2_3_PORTS_UTIL
+
+# Cycles total of 4 uops are executed on all ports and Reservation Station was not empty.
+A6.10 EXE_ACTIVITY.4_PORTS_UTIL
+
+# Execution stalls while memory subsystem has an outstanding load.
+A6.21.CMSK=5 EXE_ACTIVITY.BOUND_ON_LOADS
+
+# Cycles where the Store Buffer was full and no loads caused an execution stall.
+A6.40.CMSK=2 EXE_ACTIVITY.BOUND_ON_STORES
+
+# Cycles no uop executed while RS was not empty, the SB was not full and there was no outstanding load.
+A6.80 EXE_ACTIVITY.EXE_BOUND_0_PORTS
+
+# Number of Uops delivered by the LSD.
+A8.01 LSD.UOPS
+
+# Cycles when uops are delivered by the LSD and did not come from the decoder.
+A8.01.CMSK=1 LSD.CYCLES_ACTIVE
+
+# Cycles when the optimal number of uops is delivered by the LSD and did not come from the decoder.
+A8.01.CMSK=8 LSD.CYCLES_OK
+
+# Core cycles the allocator was stalled due to recovery from earlier clear event for this thread
+AD.01 INT_MISC.RECOVERY_CYCLES
+
+# Clears speculative count
+AD.01.CMSK=1.EDG INT_MISC.CLEARS_COUNT
+
+# TMA slots where uops got dropped
+AD.10 INT_MISC.UOP_DROPPING
+
+# Bubble cycles of BPClear.
+AD.40.TakenAlone INT_MISC.BPCLEAR_CYCLES
+
+# Bubble cycles of BAClear (Unknown Branch).
+AD.40.TakenAlone INT_MISC.UNKNOWN_BRANCH_CYCLES
+
+# Counts cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.
+AD.80 INT_MISC.CLEAR_RESTEER_CYCLES
+
+# Uops that RAT issues to RS
+AE.01 UOPS_ISSUED.ANY
+
+# UOPS_ISSUED.CYCLES
+AE.01.CMSK=1 UOPS_ISSUED.CYCLES
+
+# Cycles when floating-point divide unit is busy executing divide or square root operations.
+B0.01.CMSK=1 ARITH.FPDIV_ACTIVE
+
+# Cycles when integer divide unit is busy executing divide or square root operations.
+B0.08.CMSK=1 ARITH.IDIV_ACTIVE
+
+# Cycles when divide unit is busy executing divide or square root operations.
+B0.09.CMSK=1 ARITH.DIV_ACTIVE
+
+# Cycles where at least 1 uop was executed per-thread
+B1.01.CMSK=1.CTR=3 UOPS_EXECUTED.CYCLES_GE_1
+
+# Counts number of cycles no uops were dispatched to be executed on this thread.
+B1.01.CMSK=1.INV.CTR=3 UOPS_EXECUTED.STALLS
+
+# Cycles where at least 2 uops were executed per-thread
+B1.01.CMSK=2.CTR=3 UOPS_EXECUTED.CYCLES_GE_2
+
+# Cycles where at least 3 uops were executed per-thread
+B1.01.CMSK=3.CTR=3 UOPS_EXECUTED.CYCLES_GE_3
+
+# Cycles where at least 4 uops were executed per-thread
+B1.01.CMSK=4.CTR=3 UOPS_EXECUTED.CYCLES_GE_4
+
+# Counts the number of uops to be executed per-thread each cycle.
+B1.01.CTR=3 UOPS_EXECUTED.THREAD
+
+# Counts the number of x87 uops dispatched.
+B1.10 UOPS_EXECUTED.X87
+
+# Uops executed on any INT EU ports
+B2.01 UOPS_DISPATCHED.INT_EU_ALL
+
+# Uops executed on INT EU ALU ports.
+B2.02 UOPS_DISPATCHED.ALU
+
+# Uops executed on Load ports
+B2.04 UOPS_DISPATCHED.LOAD
+
+# Number of Uops dispatched/executed by Slow EU (e.g. 3+ cycles LEA, >1 cycles shift, iDIVs, CR; *H operation)
+B2.08 UOPS_DISPATCHED.SLOW
+
+# Uops executed on STD ports
+B2.10 UOPS_DISPATCHED.STD
+
+# Number of (shift) 1-cycle Uops dispatched/executed by any of the Shift EUs
+B2.20 UOPS_DISPATCHED.SHIFT
+
+# Number of Uops dispatched/executed by any of the 3 JEUs (all uops that hold the JEU including macro; micro jumps; fetch-from-eip)
+B2.40 UOPS_DISPATCHED.JMP
+
+# Number of Uops dispatched on STA ports
+B2.80 UOPS_DISPATCHED.STA
+
+# Number of FP-arith-uops dispatched on 1st VEC port (port 0). FP-arith-uops are of type ADD* / SUB* / MUL / FMA* / DPP.
+B3.01 FP_ARITH_DISPATCHED.V0
+
+# Number of FP-arith-uops dispatched on 2nd VEC port (port 1)
+B3.02 FP_ARITH_DISPATCHED.V1
+
+# Number of FP-arith-uops dispatched on 3rd VEC port (port 5)
+B3.04 FP_ARITH_DISPATCHED.V2
+
+# Number of FP-arith-uops dispatched on 4th VEC port
+B3.08 FP_ARITH_DISPATCHED.V3
+
+# Number of instructions retired. General Counter - architectural event
+C0.00 INST_RETIRED.ANY_P
+
+# Retired NOP instructions.
+C0.02 INST_RETIRED.NOP
+
+# Iterations of Repeat string retired instructions.
+C0.08 INST_RETIRED.REP_ITERATION
+
+# retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon)
+C0.10 INST_RETIRED.BR_FUSED
+
+# INST_RETIRED.MACRO_FUSED
+C0.30 INST_RETIRED.MACRO_FUSED
+
+# Counts all microcode FP assists.
+C1.02 ASSISTS.FP
+
+# Counts all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP, SSE-AVX mix and A/D assists, which are counted by dedicated sub-events.
+C1.04 ASSISTS.HARDWARE
+
+# ASSISTS.PAGE_FAULT
+C1.08 ASSISTS.PAGE_FAULT
+
+# ASSISTS.SSE_AVX_MIX
+C1.10 ASSISTS.SSE_AVX_MIX
+
+# Number of occurrences where a microcode assist is invoked by hardware.
+C1.1F ASSISTS.ANY
+
+# Retired uops except the last uop of each instruction.
+C2.01 UOPS_RETIRED.HEAVY
+
+# This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance, for example as measured by the instructions-per-cycle metric.
+C2.02 UOPS_RETIRED.SLOTS
+
+# Cycles with retired uop(s).
+C2.02.CMSK=1 UOPS_RETIRED.CYCLES
+
+# Cycles without actually retired uops.
+C2.02.CMSK=1.INV UOPS_RETIRED.STALLS
+
+# Number of non-speculative switches to the Microcode Sequencer (MS)
+C2.04.CMSK=1.EDG.TakenAlone UOPS_RETIRED.MS_SWITCHES
+
+# UOPS_RETIRED.MS
+C2.04.TakenAlone UOPS_RETIRED.MS
+
+# Number of machine clears (nukes) of any type.
+C3.01.CMSK=1.EDG MACHINE_CLEARS.COUNT
+
+# Number of machine clears due to memory ordering conflicts.
+C3.02 MACHINE_CLEARS.MEMORY_ORDERING
+
+# Self-modifying code (SMC) detected.
+C3.04 MACHINE_CLEARS.SMC
+
+# All branch instructions retired.
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+
+# Taken conditional branch instructions retired.
+C4.01 BR_INST_RETIRED.COND_TAKEN
+
+# Taken backward conditional branch instructions retired.
+C4.01 BR_INST_RETIRED.COND_TAKEN_BWD
+
+# Taken forward conditional branch instructions retired.
+C4.02 BR_INST_RETIRED.COND_TAKEN_FWD
+
+# Direct and indirect near call instructions retired.
+C4.02 BR_INST_RETIRED.NEAR_CALL
+
+# Return instructions retired.
+C4.08 BR_INST_RETIRED.NEAR_RETURN
+
+# Not taken branch instructions retired.
+C4.10 BR_INST_RETIRED.COND_NTAKEN
+
+# Conditional branch instructions retired.
+C4.11 BR_INST_RETIRED.COND
+
+# Taken branch instructions retired.
+C4.20 BR_INST_RETIRED.NEAR_TAKEN
+
+# Far branch instructions retired.
+C4.40 BR_INST_RETIRED.FAR_BRANCH
+
+# Indirect near branch instructions retired (excluding returns)
+C4.80 BR_INST_RETIRED.INDIRECT
+
+# All mispredicted branch instructions retired.
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+
+# number of branch instructions retired that were mispredicted and taken forward.
+C5.00 BR_MISP_RETIRED.COND_TAKEN_FWD
+
+# number of branch instructions retired that were mispredicted and taken.
+C5.01 BR_MISP_RETIRED.COND_TAKEN
+
+# number of branch instructions retired that were mispredicted and taken backward.
+C5.01 BR_MISP_RETIRED.COND_TAKEN_BWD
+
+# number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.01 BR_MISP_RETIRED.COND_TAKEN_BWD_COST
+
+# number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.02 BR_MISP_RETIRED.COND_TAKEN_FWD_COST
+
+# Mispredicted indirect CALL retired.
+C5.02 BR_MISP_RETIRED.INDIRECT_CALL
+
+# This event counts the number of mispredicted ret instructions retired. Non PEBS
+C5.08 BR_MISP_RETIRED.RET
+
+# Mispredicted non-taken conditional branch instructions retired.
+C5.10 BR_MISP_RETIRED.COND_NTAKEN
+
+# Mispredicted conditional branch instructions retired.
+C5.11 BR_MISP_RETIRED.COND
+
+# Number of near branch instructions retired that were mispredicted and taken.
+C5.20 BR_MISP_RETIRED.NEAR_TAKEN
+
+# Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.41 BR_MISP_RETIRED.COND_TAKEN_COST
+
+# Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.42 BR_MISP_RETIRED.INDIRECT_CALL_COST
+
+# All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.44 BR_MISP_RETIRED.ALL_BRANCHES_COST
+
+# Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.48 BR_MISP_RETIRED.RET_COST
+
+# Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.50 BR_MISP_RETIRED.COND_NTAKEN_COST
+
+# Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.51 BR_MISP_RETIRED.COND_COST
+
+# Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.60 BR_MISP_RETIRED.NEAR_TAKEN_COST
+
+# Mispredicted near indirect branch instructions retired (excluding returns)
+C5.80 BR_MISP_RETIRED.INDIRECT
+
+# Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.C0 BR_MISP_RETIRED.INDIRECT_COST
+
+# Mispredicted Retired ANT branches
+C6.02.TakenAlone FRONTEND_RETIRED.MISP_ANT
+
+# Retired ANT branches
+C6.03.TakenAlone FRONTEND_RETIRED.ANY_ANT
+
+# Retired instructions that experienced a DSB miss.
+C6.03.TakenAlone FRONTEND_RETIRED.ANY_DSB_MISS
+
+# Retired instructions that experienced a critical DSB miss.
+C6.03.TakenAlone FRONTEND_RETIRED.DSB_MISS
+
+# Retired instructions that experienced an iTLB true miss.
+C6.03.TakenAlone FRONTEND_RETIRED.ITLB_MISS
+
+# Retired instructions that experienced an Instruction L1 Cache true miss.
+C6.03.TakenAlone FRONTEND_RETIRED.L1I_MISS
+
+# Retired instructions that experienced an Instruction L2 Cache true miss.
+C6.03.TakenAlone FRONTEND_RETIRED.L2_MISS
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_128
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 16 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_16
+
+# Retired instructions after front-end starvation of at least 2 cycles
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_2
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_256
+
+# Retired instructions that are fetched after an interval where the front-end had at least 1 bubble-slot for a period of 2 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 32 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_32
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_4
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_512
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_64
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 8 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_8
+
+# Counts flows delivered by the Microcode Sequencer
+C6.03.TakenAlone FRONTEND_RETIRED.MS_FLOWS
+
+# Retired instructions that experienced an STLB (2nd level TLB) true miss.
+C6.03.TakenAlone FRONTEND_RETIRED.STLB_MISS
+
+# Retired instructions that caused clears due to being Unknown Branches.
+C6.03.TakenAlone FRONTEND_RETIRED.UNKNOWN_BRANCH
+
+# Counts number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.01 FP_ARITH_OPS_RETIRED.SCALAR_DOUBLE
+
+# Counts number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.02 FP_ARITH_OPS_RETIRED.SCALAR_SINGLE
+
+# Number of SSE/AVX computational scalar floating-point instructions retired; some instructions will count twice as noted below.  Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 RANGE SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
+C7.03 FP_ARITH_OPS_RETIRED.SCALAR
+
+# Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 2 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.04 FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE
+
+# Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.08 FP_ARITH_OPS_RETIRED.128B_PACKED_SINGLE
+
+# FP_ARITH_INST_RETIRED.VECTOR_128B [This event is alias to FP_ARITH_OPS_RETIRED.VECTOR_128B]
+C7.0C FP_ARITH_INST_RETIRED.VECTOR_128B
+
+# FP_ARITH_OPS_RETIRED.VECTOR_128B [This event is alias to FP_ARITH_INST_RETIRED.VECTOR_128B]
+C7.0C FP_ARITH_OPS_RETIRED.VECTOR_128B
+
+# Counts number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.10 FP_ARITH_OPS_RETIRED.256B_PACKED_DOUBLE
+
+# Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below.  Each count represents 2 or/and 4 computation operations, 1 for each element.  Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.
+C7.18 FP_ARITH_OPS_RETIRED.4_FLOPS
+
+# Counts number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 8 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.20 FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE
+
+# FP_ARITH_INST_RETIRED.VECTOR_256B [This event is alias to FP_ARITH_OPS_RETIRED.VECTOR_256B]
+C7.30 FP_ARITH_INST_RETIRED.VECTOR_256B
+
+# FP_ARITH_OPS_RETIRED.VECTOR_256B [This event is alias to FP_ARITH_INST_RETIRED.VECTOR_256B]
+C7.30 FP_ARITH_OPS_RETIRED.VECTOR_256B
+
+# Number of any Vector retired FP arithmetic instructions
+C7.3C FP_ARITH_OPS_RETIRED.VECTOR
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles.
+CD.01.MSR_3F6H=0x10.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles.
+CD.01.MSR_3F6H=0x100.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles.
+CD.01.MSR_3F6H=0x20.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles.
+CD.01.MSR_3F6H=0x200.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles.
+CD.01.MSR_3F6H=0x4.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles.
+CD.01.MSR_3F6H=0x40.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles.
+CD.01.MSR_3F6H=0x400.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_1024
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles.
+CD.01.MSR_3F6H=0x8.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles.
+CD.01.MSR_3F6H=0x80.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles.
+CD.01.MSR_3F6H=0x800.CTR=2.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_2048
+
+# Retired memory store access operations. A PDist event for PEBS Store Latency Facility.
+CD.02.CTR=0 MEM_TRANS_RETIRED.STORE_SAMPLE
+
+# Retired load instructions that hit the STLB.
+D0.09 MEM_INST_RETIRED.STLB_HIT_LOADS
+
+# Retired store instructions that hit the STLB.
+D0.0A MEM_INST_RETIRED.STLB_HIT_STORES
+
+# Retired load instructions that miss the STLB.
+D0.11 MEM_INST_RETIRED.STLB_MISS_LOADS
+
+# Retired store instructions that miss the STLB.
+D0.12 MEM_INST_RETIRED.STLB_MISS_STORES
+
+# Retired load instructions with locked access.
+D0.21 MEM_INST_RETIRED.LOCK_LOADS
+
+# Retired load instructions that split across a cacheline boundary.
+D0.41 MEM_INST_RETIRED.SPLIT_LOADS
+
+# Retired store instructions that split across a cacheline boundary.
+D0.42 MEM_INST_RETIRED.SPLIT_STORES
+
+# Counts all retired load instructions.
+D0.81 MEM_INST_RETIRED.ALL_LOADS
+
+# Retired store instructions.
+D0.82 MEM_INST_RETIRED.ALL_STORES
+
+# Retired software prefetch instructions.
+D0.84 MEM_INST_RETIRED.ALL_SWPF
+
+# All retired memory instructions.
+D0.87 MEM_INST_RETIRED.ANY
+
+# Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache.
+D1.00 MEM_LOAD_RETIRED.L1_HIT_L1
+
+# Retired load instructions with L1 cache hits as data sources
+D1.01 MEM_LOAD_RETIRED.L1_HIT
+
+# Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source.
+D1.01 MEM_LOAD_RETIRED.L1_HIT_L0
+
+# Retired load instructions with L2 cache hits as data sources
+D1.02 MEM_LOAD_RETIRED.L2_HIT
+
+# Retired load instructions with L3 cache hits as data sources
+D1.04 MEM_LOAD_RETIRED.L3_HIT
+
+# Retired load instructions missed L1 cache as data sources
+D1.08 MEM_LOAD_RETIRED.L1_MISS
+
+# Retired load instructions missed L2 cache as data sources
+D1.10 MEM_LOAD_RETIRED.L2_MISS
+
+# Retired load instructions missed L3 cache as data sources
+D1.20 MEM_LOAD_RETIRED.L3_MISS
+
+# Number of completed demand load requests that missed the L1, but hit the FB(fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.
+D1.40 MEM_LOAD_RETIRED.FB_HIT
+
+# Retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache.
+D2.01 MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS
+
+# Retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache
+D2.02 MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD
+
+# Retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded.
+D2.04 MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM
+
+# Retired load instructions whose data source was a cross-core snoop hit that forwarded data from an on-package core cache (induced by NI$)
+D2.10 MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD
+
+# Retired instructions with at least 1 uncacheable load or lock.
+D4.04 MEM_LOAD_MISC_RETIRED.UC
+
+# LFENCE instructions retired
+E0.20 MISC2_RETIRED.LFENCE
+
+# LBR record is inserted
+E4.01 MISC_RETIRED.LBR_INSERTS
+
+# Retired memory uops for any access
+E5.0F MEM_UOP_RETIRED.ANY
+
+# integer ADD, SUB, SAD 128-bit vector instructions.
+E7.03 INT_VEC_RETIRED.ADD_128
+
+# integer ADD, SUB, SAD 256-bit vector instructions.
+E7.0C INT_VEC_RETIRED.ADD_256
+
+# INT_VEC_RETIRED.VNNI_128
+E7.10 INT_VEC_RETIRED.VNNI_128
+
+# Number of vector integer instructions retired of 128-bit vector-width.
+E7.13 INT_VEC_RETIRED.128BIT
+
+# INT_VEC_RETIRED.VNNI_256
+E7.20 INT_VEC_RETIRED.VNNI_256
+
+# INT_VEC_RETIRED.SHUFFLES
+E7.40 INT_VEC_RETIRED.SHUFFLES
+
+# INT_VEC_RETIRED.MUL_256
+E7.80 INT_VEC_RETIRED.MUL_256
+
+# Number of vector integer instructions retired of 256-bit vector-width.
+E7.AC INT_VEC_RETIRED.256BIT
+
+# Core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state.
+EC.10 CPU_CLK_UNHALTED.C01
+
+# Core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state.
+EC.20 CPU_CLK_UNHALTED.C02
+
+# Core clocks when a PAUSE is pending.
+EC.40 CPU_CLK_UNHALTED.PAUSE
+
+# Number of Pause instructions
+EC.40.CMSK=1.EDG CPU_CLK_UNHALTED.PAUSE_INST
+
+# Core clocks when the thread is in the C0.1 or C0.2 or running a PAUSE in C0 ACPI state.
+EC.70 CPU_CLK_UNHALTED.C0_WAIT
--- a/configs/cfg_ArrowLakeP_LionCove_all_offcore.txt
+++ b/configs/cfg_ArrowLakeP_LionCove_all_offcore.txt
@@ -0,0 +1,29 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/ARL/events/arrowlake_lioncove_core.json (Version: 1.09)
+# Applies to processors with family-model in {6-C5, 6-C6}
+
+# Counts demand data reads that have any type of response.
+2A.01.MSR_RSP0=0x10001.TakenAlone OCR.DEMAND_DATA_RD.ANY_RESPONSE
+
+# Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.
+2A.01.MSR_RSP0=0x10002.TakenAlone OCR.DEMAND_RFO.ANY_RESPONSE
+
+# Counts streaming stores that have any type of response.
+2A.01.MSR_RSP0=0x10800.TakenAlone OCR.STREAMING_WR.ANY_RESPONSE
+
+# Counts demand data reads that were supplied by DRAM.
+2A.01.MSR_RSP0=0x1E780000001.TakenAlone OCR.DEMAND_DATA_RD.DRAM
+
+# Counts demand data reads that were supplied by the L3 cache where a snoop hit in another core's caches, which forwarded the unmodified data to the requesting core.
+2A.01.MSR_RSP0=0x20001E00001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD
+
+# Counts demand data reads that were supplied by the L3 cache where a snoop hit in another core's caches; data forwarding is required as the data is modified.
+2A.01.MSR_RSP0=0x40001E00001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM
+
+# Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop hit in another core's caches; data forwarding is required as the data is modified.
+2A.01.MSR_RSP0=0x40001E00002.TakenAlone OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM
+
+# Counts demand data reads that were not supplied by the L3 cache.
+2A.01.MSR_RSP0=0xFE7F8000001.TakenAlone OCR.DEMAND_DATA_RD.L3_MISS
+
+# Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.
+2A.01.MSR_RSP0=0xFE7F8000002.TakenAlone OCR.DEMAND_RFO.L3_MISS
--- a/configs/cfg_ArrowLakeP_LionCove_common.txt
+++ b/configs/cfg_ArrowLakeP_LionCove_common.txt
@@ -0,0 +1,32 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/ARL/events/arrowlake_lioncove_core.json (Version: 1.09)
+# Applies to processors with family-model in {6-C5, 6-C6}
+
+3C.00 CORE_CYCLES
+C0.00 INST_RETIRED
+79.04 IDQ.MITE_UOPS
+79.08 IDQ.DSB_UOPS
+79.20 IDQ.MS_UOPS
+A8.01 LSD.UOPS
+AE.01 UOPS_ISSUED
+B1.01.CTR=3 UOPS_EXECUTED
+C2.02 UOPS_RETIRED.SLOTS
+B2.01 UOPS_DISPATCHED.INT_EU_ALL
+B2.02 UOPS_DISPATCHED.ALU
+B2.08 UOPS_DISPATCHED.SLOW
+B2.04 UOPS_DISPATCHED.LOAD
+B2.10 UOPS_DISPATCHED.STD
+B2.20 UOPS_DISPATCHED.SHIFT
+B2.40 UOPS_DISPATCHED.JMP
+B2.80 UOPS_DISPATCHED.STA
+B3.01 FP_ARITH_DISPATCHED.V0
+B3.02 FP_ARITH_DISPATCHED.V1
+B3.04 FP_ARITH_DISPATCHED.V2
+B3.08 FP_ARITH_DISPATCHED.V3
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+D1.01 MEM_LOAD_RETIRED.L1_HIT
+D1.08 MEM_LOAD_RETIRED.L1_MISS
+D1.02 MEM_LOAD_RETIRED.L2_HIT
+D1.10 MEM_LOAD_RETIRED.L2_MISS
+D1.04 MEM_LOAD_RETIRED.L3_HIT
+D1.20 MEM_LOAD_RETIRED.L3_MISS
--- a/configs/cfg_EmeraldRapids_all_core.txt
+++ b/configs/cfg_EmeraldRapids_all_core.txt
@@ -0,0 +1,929 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/EMR/events/emeraldrapids_core.json (Version: 1.13)
+# Applies to processors with family-model in {6-CF}
+
+# False dependencies in MOB due to partial compare on address.
+03.04 LD_BLOCKS.ADDRESS_ALIAS
+
+# Loads blocked due to overlapping with a preceding store that cannot be forwarded.
+03.82 LD_BLOCKS.STORE_FORWARD
+
+# The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.
+03.88 LD_BLOCKS.NO_SR
+
+# Code miss in all TLB levels causes a page walk that completes. (4K)
+11.02 ITLB_MISSES.WALK_COMPLETED_4K
+
+# Code miss in all TLB levels causes a page walk that completes. (2M/4M)
+11.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
+
+# Code miss in all TLB levels causes a page walk that completes. (All page sizes)
+11.0E ITLB_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for an outstanding code request in the PMH each cycle.
+11.10 ITLB_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for code (instruction fetch) request.
+11.10.CMSK=1 ITLB_MISSES.WALK_ACTIVE
+
+# Instruction fetch requests that miss the ITLB and hit the STLB.
+11.20 ITLB_MISSES.STLB_HIT
+
+# Page walks completed due to a demand data load to a 4K page.
+12.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
+
+# Page walks completed due to a demand data load to a 2M/4M page.
+12.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walks completed due to a demand data load to a 1G page.
+12.08 DTLB_LOAD_MISSES.WALK_COMPLETED_1G
+
+# Load miss in all TLB levels causes a page walk that completes. (All page sizes)
+12.0E DTLB_LOAD_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for a demand load in the PMH each cycle.
+12.10 DTLB_LOAD_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for a demand load.
+12.10.CMSK=1 DTLB_LOAD_MISSES.WALK_ACTIVE
+
+# Loads that miss the DTLB and hit the STLB.
+12.20 DTLB_LOAD_MISSES.STLB_HIT
+
+# Page walks completed due to a demand data store to a 4K page.
+13.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
+
+# Page walks completed due to a demand data store to a 2M/4M page.
+13.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walks completed due to a demand data store to a 1G page.
+13.08 DTLB_STORE_MISSES.WALK_COMPLETED_1G
+
+# Store misses in all TLB levels causes a page walk that completes. (All page sizes)
+13.0E DTLB_STORE_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for a store in the PMH each cycle.
+13.10 DTLB_STORE_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for a store.
+13.10.CMSK=1 DTLB_STORE_MISSES.WALK_ACTIVE
+
+# Stores that miss the DTLB and hit the STLB.
+13.20 DTLB_STORE_MISSES.STLB_HIT
+
+# For every cycle, increments by the number of outstanding demand data read requests pending.
+20.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
+
+# Cycles where at least 1 outstanding demand data read request is pending.
+20.01.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD
+
+# Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle.
+20.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
+
+# Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore.
+20.02.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD
+
+# OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
+20.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
+
+# OFFCORE_REQUESTS_OUTSTANDING.DATA_RD
+20.08 OFFCORE_REQUESTS_OUTSTANDING.DATA_RD
+
+# OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
+20.08.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
+
+# For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache.
+20.10 OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD
+
+# Demand Data Read requests sent to uncore
+21.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
+
+# Cacheable and noncacheable code read requests
+21.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
+
+# Demand RFO requests including regular RFOs, locks, ItoM
+21.04 OFFCORE_REQUESTS.DEMAND_RFO
+
+# Demand and prefetch data reads
+21.08 OFFCORE_REQUESTS.DATA_RD
+
+# Counts demand data read requests that miss the L3 cache.
+21.10 OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
+
+# OFFCORE_REQUESTS.ALL_REQUESTS
+21.80 OFFCORE_REQUESTS.ALL_REQUESTS
+
+# L2 writebacks that access L2 cache
+23.40 L2_TRANS.L2_WB
+
+# Demand Data Read miss L2 cache
+24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
+
+# RFO requests that miss L2 cache
+24.22 L2_RQSTS.RFO_MISS
+
+# L2 cache misses when fetching instructions
+24.24 L2_RQSTS.CODE_RD_MISS
+
+# Demand requests that miss L2 cache
+24.27 L2_RQSTS.ALL_DEMAND_MISS
+
+# SW prefetch requests that miss L2 cache.
+24.28 L2_RQSTS.SWPF_MISS
+
+# L2_RQSTS.HWPF_MISS
+24.30 L2_RQSTS.HWPF_MISS
+
+# Read requests with true-miss in L2 cache. [This event is alias to L2_RQSTS.MISS]
+24.3F L2_REQUEST.MISS
+
+# Read requests with true-miss in L2 cache. [This event is alias to L2_REQUEST.MISS]
+24.3F L2_RQSTS.MISS
+
+# Demand Data Read requests that hit L2 cache
+24.C1 L2_RQSTS.DEMAND_DATA_RD_HIT
+
+# RFO requests that hit L2 cache
+24.C2 L2_RQSTS.RFO_HIT
+
+# L2 cache hits when fetching instructions, code reads.
+24.C4 L2_RQSTS.CODE_RD_HIT
+
+# SW prefetch requests that hit L2 cache.
+24.C8 L2_RQSTS.SWPF_HIT
+
+# Demand Data Read access L2 cache
+24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
+
+# RFO requests to L2 cache
+24.E2 L2_RQSTS.ALL_RFO
+
+# L2 code requests
+24.E4 L2_RQSTS.ALL_CODE_RD
+
+# Demand requests to L2 cache
+24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
+
+# L2_RQSTS.ALL_HWPF
+24.F0 L2_RQSTS.ALL_HWPF
+
+# All accesses to L2 cache [This event is alias to L2_RQSTS.REFERENCES]
+24.FF L2_REQUEST.ALL
+
+# All accesses to L2 cache [This event is alias to L2_REQUEST.ALL]
+24.FF L2_RQSTS.REFERENCES
+
+# L2 cache lines filling L2
+25.1F L2_LINES_IN.ALL
+
+# Non-modified cache lines that are silently dropped by L2 cache.
+26.01 L2_LINES_OUT.SILENT
+
+# Modified cache lines that are evicted by L2 cache when triggered by an L2 cache fill.
+26.02 L2_LINES_OUT.NON_SILENT
+
+# Cache lines that have been L2 hardware prefetched but not used by demand accesses
+26.04 L2_LINES_OUT.USELESS_HWPF
+
+# Counts bus locks, accounts for cache line split locks and UC locks.
+2C.10 SQ_MISC.BUS_LOCK
+
+# Cycles the uncore cannot take further requests
+2D.01.CMSK=1 XQ.FULL_CYCLES
+
+# Core-originated cacheable requests that missed L3  (Except hardware prefetches to the L3)
+2E.41 LONGEST_LAT_CACHE.MISS
+
+# Core-originated cacheable requests that refer to L3 (Except hardware prefetches to the L3)
+2E.4F LONGEST_LAT_CACHE.REFERENCE
+
+# Thread cycles when thread is not in halt state
+3C.00 CPU_CLK_UNHALTED.THREAD_P
+
+# Reference cycles when the core is not in halt state.
+3C.01 CPU_CLK_UNHALTED.REF_TSC_P
+
+# Core crystal clock cycles when this thread is unhalted and the other thread is halted.
+3C.02 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
+
+# Core crystal clock cycles. Cycle counts are evenly distributed between active threads in the Core.
+3C.08 CPU_CLK_UNHALTED.REF_DISTRIBUTED
+
+# Number of PREFETCHNTA instructions executed.
+40.01 SW_PREFETCH_ACCESS.NTA
+
+# Number of PREFETCHT0 instructions executed.
+40.02 SW_PREFETCH_ACCESS.T0
+
+# Number of PREFETCHT1 or PREFETCHT2 instructions executed.
+40.04 SW_PREFETCH_ACCESS.T1_T2
+
+# Number of PREFETCHW instructions executed.
+40.08 SW_PREFETCH_ACCESS.PREFETCHW
+
+# Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed.
+40.0F SW_PREFETCH_ACCESS.ANY
+
+# Completed demand load uops that miss the L1 d-cache.
+43.FD MEM_LOAD_COMPLETED.L1_MISS_ANY
+
+# MEM_STORE_RETIRED.L2_HIT
+44.01 MEM_STORE_RETIRED.L2_HIT
+
+# Cycles while L1 cache miss demand load is outstanding.
+47.02.CMSK=2 MEMORY_ACTIVITY.CYCLES_L1D_MISS
+
+# Execution stalls while L1 cache miss demand load is outstanding.
+47.03.CMSK=3 MEMORY_ACTIVITY.STALLS_L1D_MISS
+
+# Execution stalls while L2 cache miss demand cacheable load request is outstanding.
+47.05.CMSK=5 MEMORY_ACTIVITY.STALLS_L2_MISS
+
+# Execution stalls while L3 cache miss demand cacheable load request is outstanding.
+47.09.CMSK=9 MEMORY_ACTIVITY.STALLS_L3_MISS
+
+# Number of L1D misses that are outstanding
+48.01 L1D_PEND_MISS.PENDING
+
+# Cycles with L1D load Misses outstanding.
+48.01.CMSK=1 L1D_PEND_MISS.PENDING_CYCLES
+
+# Number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability.
+48.02 L1D_PEND_MISS.FB_FULL
+
+# Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability.
+48.02.CMSK=1.EDG L1D_PEND_MISS.FB_FULL_PERIODS
+
+# Number of cycles a demand request has waited in the L1D due to lack of L2 resources.
+48.04 L1D_PEND_MISS.L2_STALLS
+
+# Counts the number of demand load dispatches that hit L1D fill buffer (FB) allocated for software prefetch.
+4C.01 LOAD_HIT_PREFETCH.SWPF
+
+# Counts the number of cache lines replaced in L1 data cache.
+51.01 L1D.REPLACEMENT
+
+# L1D.HWPF_MISS
+51.20 L1D.HWPF_MISS
+
+# Number of times a transactional abort was signaled due to a data conflict on a transactionally accessed address
+54.01 TX_MEM.ABORT_CONFLICT
+
+# Speculatively counts the number of TSX aborts due to a data capacity limitation for transactional writes.
+54.02 TX_MEM.ABORT_CAPACITY_WRITE
+
+# Speculatively counts the number of TSX aborts due to a data capacity limitation for transactional reads
+54.80 TX_MEM.ABORT_CAPACITY_READ
+
+# Clears due to Unknown Branches.
+60.01 BACLEARS.ANY
+
+# DSB-to-MITE switch true penalty cycles.
+61.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
+
+# Instruction decoders utilized in a cycle
+75.01 INST_DECODED.DECODERS
+
+# UOPS_DECODED.DEC0_UOPS
+76.01 UOPS_DECODED.DEC0_UOPS
+
+# Uops delivered to Instruction Decode Queue (IDQ) from MITE path
+79.04 IDQ.MITE_UOPS
+
+# Cycles MITE is delivering any Uop
+79.04.CMSK=1 IDQ.MITE_CYCLES_ANY
+
+# Cycles MITE is delivering optimal number of Uops
+79.04.CMSK=6 IDQ.MITE_CYCLES_OK
+
+# Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path
+79.08 IDQ.DSB_UOPS
+
+# Cycles Decode Stream Buffer (DSB) is delivering any Uop
+79.08.CMSK=1 IDQ.DSB_CYCLES_ANY
+
+# Cycles DSB is delivering optimal number of Uops
+79.08.CMSK=6 IDQ.DSB_CYCLES_OK
+
+# Uops delivered to IDQ while MS is busy
+79.20 IDQ.MS_UOPS
+
+# Cycles when uops are being delivered to IDQ while MS is busy
+79.20.CMSK=1 IDQ.MS_CYCLES_ANY
+
+# Number of switches from DSB or MITE to the MS
+79.20.CMSK=1.EDG IDQ.MS_SWITCHES
+
+# Cycles where a code fetch is stalled due to L1 instruction cache miss.
+80.04 ICACHE_DATA.STALLS
+
+# ICACHE_DATA.STALL_PERIODS
+80.04.CMSK=1.EDG ICACHE_DATA.STALL_PERIODS
+
+# Cycles where a code fetch is stalled due to L1 instruction cache tag miss.
+83.04 ICACHE_TAG.STALLS
+
+# Stalls caused by changing prefix length of the instruction.
+87.01 DECODE.LCP
+
+# Cycles the Microcode Sequencer is busy.
+87.02 DECODE.MS_BUSY
+
+# Uops not delivered by IDQ when backend of the machine is not stalled [This event is alias to IDQ_UOPS_NOT_DELIVERED.CORE]
+9C.01 IDQ_BUBBLES.CORE
+
+# Uops not delivered by IDQ when backend of the machine is not stalled [This event is alias to IDQ_BUBBLES.CORE]
+9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
+
+# Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK]
+9C.01.CMSK=1.INV IDQ_BUBBLES.CYCLES_FE_WAS_OK
+
+# Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK]
+9C.01.CMSK=1.INV IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK
+
+# Cycles when no uops are delivered by the IDQ when backend of the machine is not stalled [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE]
+9C.01.CMSK=6 IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE
+
+# Cycles when no uops are delivered by the IDQ when backend of the machine is not stalled [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]
+9C.01.CMSK=6 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE
+
+# Counts cycles where the pipeline is stalled due to serializing operations.
+A2.02 RESOURCE_STALLS.SCOREBOARD
+
+# Cycles stalled due to no store buffers available. (not including draining from sync).
+A2.08 RESOURCE_STALLS.SB
+
+# Cycles while L2 cache miss demand load is outstanding.
+A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_MISS
+
+# Total execution stalls.
+A3.04.CMSK=4 CYCLE_ACTIVITY.STALLS_TOTAL
+
+# Execution stalls while L2 cache miss demand load is outstanding.
+A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_MISS
+
+# Execution stalls while L3 cache miss demand load is outstanding.
+A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_L3_MISS
+
+# Cycles while L1 cache miss demand load is outstanding.
+A3.08.CMSK=8 CYCLE_ACTIVITY.CYCLES_L1D_MISS
+
+# Execution stalls while L1 cache miss demand load is outstanding.
+A3.0C.CMSK=12 CYCLE_ACTIVITY.STALLS_L1D_MISS
+
+# Cycles while memory subsystem has an outstanding load.
+A3.10.CMSK=16 CYCLE_ACTIVITY.CYCLES_MEM_ANY
+
+# TMA slots available for an unhalted logical processor. General counter - architectural event
+A4.01 TOPDOWN.SLOTS_P
+
+# TMA slots where no uops were being issued due to lack of back-end resources.
+A4.02 TOPDOWN.BACKEND_BOUND_SLOTS
+
+# TMA slots wasted due to incorrect speculations.
+A4.04.CTR=0 TOPDOWN.BAD_SPEC_SLOTS
+
+# TMA slots wasted due to incorrect speculation by branch mispredictions
+A4.08.CTR=0 TOPDOWN.BR_MISPREDICT_SLOTS
+
+# TOPDOWN.MEMORY_BOUND_SLOTS
+A4.10 TOPDOWN.MEMORY_BOUND_SLOTS
+
+# Cycles when Reservation Station (RS) is empty due to a resource in the back-end
+A5.01 RS.EMPTY_RESOURCE
+
+# Cycles when Reservation Station (RS) is empty for the thread.
+A5.07 RS.EMPTY
+
+# Counts end of periods where the Reservation Station (RS) was empty.
+A5.07.CMSK=1.EDG.INV RS.EMPTY_COUNT
+
+# Cycles total of 1 uop is executed on all ports and Reservation Station was not empty.
+A6.02 EXE_ACTIVITY.1_PORTS_UTIL
+
+# Cycles total of 2 uops are executed on all ports and Reservation Station was not empty.
+A6.04 EXE_ACTIVITY.2_PORTS_UTIL
+
+# Cycles total of 3 uops are executed on all ports and Reservation Station was not empty.
+A6.08 EXE_ACTIVITY.3_PORTS_UTIL
+
+# Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty.
+A6.0C EXE_ACTIVITY.2_3_PORTS_UTIL
+
+# Cycles total of 4 uops are executed on all ports and Reservation Station was not empty.
+A6.10 EXE_ACTIVITY.4_PORTS_UTIL
+
+# Execution stalls while memory subsystem has an outstanding load.
+A6.21.CMSK=5 EXE_ACTIVITY.BOUND_ON_LOADS
+
+# Cycles where the Store Buffer was full and no loads caused an execution stall.
+A6.40.CMSK=2 EXE_ACTIVITY.BOUND_ON_STORES
+
+# Cycles no uop executed while RS was not empty, the SB was not full and there was no outstanding load.
+A6.80 EXE_ACTIVITY.EXE_BOUND_0_PORTS
+
+# Number of Uops delivered by the LSD.
+A8.01 LSD.UOPS
+
+# Cycles in which uops were delivered by the LSD, but didn't come from the decoder.
+A8.01.CMSK=1 LSD.CYCLES_ACTIVE
+
+# Cycles optimal number of Uops delivered by the LSD, but did not come from the decoder.
+A8.01.CMSK=6 LSD.CYCLES_OK
+
+# Core cycles the allocator was stalled due to recovery from earlier clear event for this thread
+AD.01 INT_MISC.RECOVERY_CYCLES
+
+# Clears speculative count
+AD.01.CMSK=1.EDG INT_MISC.CLEARS_COUNT
+
+# TMA slots where uops got dropped
+AD.10 INT_MISC.UOP_DROPPING
+
+# INT_MISC.MBA_STALLS
+AD.20 INT_MISC.MBA_STALLS
+
+# Bubble cycles of BAClear (Unknown Branch).
+AD.40.TakenAlone INT_MISC.UNKNOWN_BRANCH_CYCLES
+
+# Counts cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.
+AD.80 INT_MISC.CLEAR_RESTEER_CYCLES
+
+# Uops that RAT issues to RS
+AE.01 UOPS_ISSUED.ANY
+
+# UOPS_ISSUED.CYCLES
+AE.01.CMSK=1 UOPS_ISSUED.CYCLES
+
+# ARITH.FPDIV_ACTIVE
+B0.01.CMSK=1 ARITH.FPDIV_ACTIVE
+
+# This event counts the cycles the integer divider is busy.
+B0.08.CMSK=1 ARITH.IDIV_ACTIVE
+
+# Cycles when divide unit is busy executing divide or square root operations.
+B0.09.CMSK=1 ARITH.DIV_ACTIVE
+
+# Counts the number of uops to be executed per-thread each cycle.
+B1.01 UOPS_EXECUTED.THREAD
+
+# Cycles where at least 1 uop was executed per-thread
+B1.01.CMSK=1 UOPS_EXECUTED.CYCLES_GE_1
+
+# Counts number of cycles no uops were dispatched to be executed on this thread.
+B1.01.CMSK=1.INV UOPS_EXECUTED.STALLS
+
+# Cycles where at least 2 uops were executed per-thread
+B1.01.CMSK=2 UOPS_EXECUTED.CYCLES_GE_2
+
+# Cycles where at least 3 uops were executed per-thread
+B1.01.CMSK=3 UOPS_EXECUTED.CYCLES_GE_3
+
+# Cycles where at least 4 uops were executed per-thread
+B1.01.CMSK=4 UOPS_EXECUTED.CYCLES_GE_4
+
+# Number of uops executed on the core.
+B1.02 UOPS_EXECUTED.CORE
+
+# Cycles at least 1 micro-op is executed from any thread on physical core.
+B1.02.CMSK=1 UOPS_EXECUTED.CORE_CYCLES_GE_1
+
+# Cycles at least 2 micro-ops are executed from any thread on physical core.
+B1.02.CMSK=2 UOPS_EXECUTED.CORE_CYCLES_GE_2
+
+# Cycles at least 3 micro-ops are executed from any thread on physical core.
+B1.02.CMSK=3 UOPS_EXECUTED.CORE_CYCLES_GE_3
+
+# Cycles at least 4 micro-ops are executed from any thread on physical core.
+B1.02.CMSK=4 UOPS_EXECUTED.CORE_CYCLES_GE_4
+
+# Counts the number of x87 uops dispatched.
+B1.10 UOPS_EXECUTED.X87
+
+# Uops executed on port 0
+B2.01 UOPS_DISPATCHED.PORT_0
+
+# Uops executed on port 1
+B2.02 UOPS_DISPATCHED.PORT_1
+
+# Uops executed on ports 2, 3 and 10
+B2.04 UOPS_DISPATCHED.PORT_2_3_10
+
+# Uops executed on ports 4 and 9
+B2.10 UOPS_DISPATCHED.PORT_4_9
+
+# Uops executed on ports 5 and 11
+B2.20 UOPS_DISPATCHED.PORT_5_11
+
+# Uops executed on port 6
+B2.40 UOPS_DISPATCHED.PORT_6
+
+# Uops executed on ports 7 and 8
+B2.80 UOPS_DISPATCHED.PORT_7_8
+
+# FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]
+B3.01 FP_ARITH_DISPATCHED.PORT_0
+
+# FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]
+B3.01 FP_ARITH_DISPATCHED.V0
+
+# FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]
+B3.02 FP_ARITH_DISPATCHED.PORT_1
+
+# FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]
+B3.02 FP_ARITH_DISPATCHED.V1
+
+# FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]
+B3.04 FP_ARITH_DISPATCHED.PORT_5
+
+# FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]
+B3.04 FP_ARITH_DISPATCHED.V2
+
+# Counts the cycles where the AMX (Advance Matrix Extension) unit is busy performing an operation.
+B7.02 EXE.AMX_BUSY
+
+# Number of instructions retired. General Counter - architectural event
+C0.00 INST_RETIRED.ANY_P
+
+# Retired NOP instructions.
+C0.02 INST_RETIRED.NOP
+
+# Iterations of Repeat string retired instructions.
+C0.08 INST_RETIRED.REP_ITERATION
+
+# INST_RETIRED.MACRO_FUSED
+C0.10 INST_RETIRED.MACRO_FUSED
+
+# Counts all microcode FP assists.
+C1.02 ASSISTS.FP
+
+# ASSISTS.PAGE_FAULT
+C1.08 ASSISTS.PAGE_FAULT
+
+# ASSISTS.SSE_AVX_MIX
+C1.10 ASSISTS.SSE_AVX_MIX
+
+# Number of occurrences where a microcode assist is invoked by hardware.
+C1.1B ASSISTS.ANY
+
+# Retired uops except the last uop of each instruction.
+C2.01 UOPS_RETIRED.HEAVY
+
+# Retirement slots used.
+C2.02 UOPS_RETIRED.SLOTS
+
+# Cycles with retired uop(s).
+C2.02.CMSK=1 UOPS_RETIRED.CYCLES
+
+# Cycles without actually retired uops.
+C2.02.CMSK=1.INV UOPS_RETIRED.STALLS
+
+# UOPS_RETIRED.MS
+C2.04.TakenAlone UOPS_RETIRED.MS
+
+# Number of machine clears (nukes) of any type.
+C3.01.CMSK=1.EDG MACHINE_CLEARS.COUNT
+
+# Number of machine clears due to memory ordering conflicts.
+C3.02 MACHINE_CLEARS.MEMORY_ORDERING
+
+# Self-modifying code (SMC) detected.
+C3.04 MACHINE_CLEARS.SMC
+
+# All branch instructions retired.
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+
+# Taken conditional branch instructions retired.
+C4.01 BR_INST_RETIRED.COND_TAKEN
+
+# Direct and indirect near call instructions retired.
+C4.02 BR_INST_RETIRED.NEAR_CALL
+
+# Return instructions retired.
+C4.08 BR_INST_RETIRED.NEAR_RETURN
+
+# Not taken branch instructions retired.
+C4.10 BR_INST_RETIRED.COND_NTAKEN
+
+# Conditional branch instructions retired.
+C4.11 BR_INST_RETIRED.COND
+
+# Taken branch instructions retired.
+C4.20 BR_INST_RETIRED.NEAR_TAKEN
+
+# Far branch instructions retired.
+C4.40 BR_INST_RETIRED.FAR_BRANCH
+
+# Indirect near branch instructions retired (excluding returns)
+C4.80 BR_INST_RETIRED.INDIRECT
+
+# All mispredicted branch instructions retired.
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+
+# Number of branch instructions retired that were mispredicted and taken.
+C5.01 BR_MISP_RETIRED.COND_TAKEN
+
+# Mispredicted indirect CALL retired.
+C5.02 BR_MISP_RETIRED.INDIRECT_CALL
+
+# This event counts the number of mispredicted ret instructions retired. Non PEBS
+C5.08 BR_MISP_RETIRED.RET
+
+# Mispredicted non-taken conditional branch instructions retired.
+C5.10 BR_MISP_RETIRED.COND_NTAKEN
+
+# Mispredicted conditional branch instructions retired.
+C5.11 BR_MISP_RETIRED.COND
+
+# Number of near branch instructions retired that were mispredicted and taken.
+C5.20 BR_MISP_RETIRED.NEAR_TAKEN
+
+# Mispredicted near indirect branch instructions retired (excluding returns)
+C5.80 BR_MISP_RETIRED.INDIRECT
+
+# Retired instructions that experienced a DSB miss.
+C6.01.TakenAlone FRONTEND_RETIRED.ANY_DSB_MISS
+
+# Retired instructions that experienced a critical DSB miss.
+C6.01.TakenAlone FRONTEND_RETIRED.DSB_MISS
+
+# Retired instructions that experienced an iTLB true miss.
+C6.01.TakenAlone FRONTEND_RETIRED.ITLB_MISS
+
+# Retired instructions that experienced an Instruction L1 Cache true miss.
+C6.01.TakenAlone FRONTEND_RETIRED.L1I_MISS
+
+# Retired instructions that experienced an Instruction L2 Cache true miss.
+C6.01.TakenAlone FRONTEND_RETIRED.L2_MISS
+
+# Retired instructions after front-end starvation of at least 1 cycle
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_1
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_128
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 16 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_16
+
+# Retired instructions after front-end starvation of at least 2 cycles
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_2
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_256
+
+# Retired instructions that are fetched after an interval where the front-end had at least 1 bubble-slot for a period of 2 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 32 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_32
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_4
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_512
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_64
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 8 cycles which was not interrupted by a back-end stall.
+C6.01.TakenAlone FRONTEND_RETIRED.LATENCY_GE_8
+
+# FRONTEND_RETIRED.MS_FLOWS
+C6.01.TakenAlone FRONTEND_RETIRED.MS_FLOWS
+
+# Retired instructions that experienced an STLB (2nd level TLB) true miss.
+C6.01.TakenAlone FRONTEND_RETIRED.STLB_MISS
+
+# FRONTEND_RETIRED.UNKNOWN_BRANCH
+C6.01.TakenAlone FRONTEND_RETIRED.UNKNOWN_BRANCH
+
+# Counts number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE
+
+# Counts number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE
+
+# Number of SSE/AVX computational scalar floating-point instructions retired; some instructions will count twice as noted below.  Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 RANGE SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
+C7.03 FP_ARITH_INST_RETIRED.SCALAR
+
+# Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 2 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE
+
+# Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE
+
+# Counts number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE
+
+# Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below.  Each count represents 2 or/and 4 computation operations, 1 for each element.  Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.
+C7.18 FP_ARITH_INST_RETIRED.4_FLOPS
+
+# Counts number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 8 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE
+
+# Counts number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 8 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.40 FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE
+
+# Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision  FP instructions retired; some instructions will count twice as noted below.  Each count represents 8 computation operations, 1 for each element.  Applies to SSE* and AVX* packed single precision and double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.
+C7.60 FP_ARITH_INST_RETIRED.8_FLOPS
+
+# Counts number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 16 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.80 FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE
+
+# Number of any Vector retired FP arithmetic instructions
+C7.FC FP_ARITH_INST_RETIRED.VECTOR
+
+# Number of times an RTM execution started.
+C9.01 RTM_RETIRED.START
+
+# Number of times an RTM execution successfully committed
+C9.02 RTM_RETIRED.COMMIT
+
+# Number of times an RTM execution aborted.
+C9.04 RTM_RETIRED.ABORTED
+
+# Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts)
+C9.08 RTM_RETIRED.ABORTED_MEM
+
+# Number of times an RTM execution aborted due to HLE-unfriendly instructions
+C9.20 RTM_RETIRED.ABORTED_UNFRIENDLY
+
+# Number of times an RTM execution aborted due to incompatible memory type
+C9.40 RTM_RETIRED.ABORTED_MEMTYPE
+
+# Number of times an RTM execution aborted due to none of the previous 3 categories (e.g. interrupt)
+C9.80 RTM_RETIRED.ABORTED_EVENTS
+
+# Increments whenever there is an update to the LBR array.
+CC.20 MISC_RETIRED.LBR_INSERTS
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles.
+CD.01.MSR_3F6H=0x10.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles.
+CD.01.MSR_3F6H=0x100.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles.
+CD.01.MSR_3F6H=0x20.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles.
+CD.01.MSR_3F6H=0x200.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles.
+CD.01.MSR_3F6H=0x4.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles.
+CD.01.MSR_3F6H=0x40.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles.
+CD.01.MSR_3F6H=0x400.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_1024
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles.
+CD.01.MSR_3F6H=0x8.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles.
+CD.01.MSR_3F6H=0x80.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128
+
+# Retired memory store access operations. A PDist event for PEBS Store Latency Facility.
+CD.02.CTR=0 MEM_TRANS_RETIRED.STORE_SAMPLE
+
+# FP_ARITH_INST_RETIRED2.SCALAR_HALF
+CF.01 FP_ARITH_INST_RETIRED2.SCALAR_HALF
+
+# FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF
+CF.02 FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF
+
+# Number of all Scalar Half-Precision FP arithmetic instructions(1) retired - regular and complex.
+CF.03 FP_ARITH_INST_RETIRED2.SCALAR
+
+# FP_ARITH_INST_RETIRED2.128B_PACKED_HALF
+CF.04 FP_ARITH_INST_RETIRED2.128B_PACKED_HALF
+
+# FP_ARITH_INST_RETIRED2.256B_PACKED_HALF
+CF.08 FP_ARITH_INST_RETIRED2.256B_PACKED_HALF
+
+# FP_ARITH_INST_RETIRED2.512B_PACKED_HALF
+CF.10 FP_ARITH_INST_RETIRED2.512B_PACKED_HALF
+
+# Number of all Vector (also called packed) Half-Precision FP arithmetic instructions(1) retired.
+CF.1C FP_ARITH_INST_RETIRED2.VECTOR
+
+# Retired load instructions that miss the STLB.
+D0.11 MEM_INST_RETIRED.STLB_MISS_LOADS
+
+# Retired store instructions that miss the STLB.
+D0.12 MEM_INST_RETIRED.STLB_MISS_STORES
+
+# Retired load instructions with locked access.
+D0.21 MEM_INST_RETIRED.LOCK_LOADS
+
+# Retired load instructions that split across a cacheline boundary.
+D0.41 MEM_INST_RETIRED.SPLIT_LOADS
+
+# Retired store instructions that split across a cacheline boundary.
+D0.42 MEM_INST_RETIRED.SPLIT_STORES
+
+# Retired load instructions.
+D0.81 MEM_INST_RETIRED.ALL_LOADS
+
+# Retired store instructions.
+D0.82 MEM_INST_RETIRED.ALL_STORES
+
+# All retired memory instructions.
+D0.83 MEM_INST_RETIRED.ANY
+
+# Retired load instructions with L1 cache hits as data sources
+D1.01 MEM_LOAD_RETIRED.L1_HIT
+
+# Retired load instructions with L2 cache hits as data sources
+D1.02 MEM_LOAD_RETIRED.L2_HIT
+
+# Retired load instructions with L3 cache hits as data sources
+D1.04 MEM_LOAD_RETIRED.L3_HIT
+
+# Retired load instructions that missed the L1 cache as a data source
+D1.08 MEM_LOAD_RETIRED.L1_MISS
+
+# Retired load instructions that missed the L2 cache as a data source
+D1.10 MEM_LOAD_RETIRED.L2_MISS
+
+# Retired load instructions that missed the L3 cache as a data source
+D1.20 MEM_LOAD_RETIRED.L3_MISS
+
+# Number of completed demand load requests that missed the L1, but hit the FB(fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.
+D1.40 MEM_LOAD_RETIRED.FB_HIT
+
+# Retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache.
+D2.01 MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS
+
+# Retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache
+D2.02 MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD
+
+# Retired load instructions whose data sources were HitM responses from shared L3
+D2.04 MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD
+
+# Retired load instructions whose data sources were hits in L3 without snoops required
+D2.08 MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE
+
+# Retired load instructions whose data sources missed L3 but were serviced from local DRAM
+D3.01 MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM
+
+# MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM
+D3.02 MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM
+
+# MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM
+D3.04 MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM
+
+# Retired load instructions whose data source was forwarded from a remote cache
+D3.08 MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD
+
+# Retired instructions with at least 1 uncacheable load or lock.
+D4.04 MEM_LOAD_MISC_RETIRED.UC
+
+# LFENCE instructions retired
+E0.20 MISC2_RETIRED.LFENCE
+
+# Retired memory uops for any access
+E5.03 MEM_UOP_RETIRED.ANY
+
+# Integer ADD, SUB, SAD 128-bit vector instructions.
+E7.03 INT_VEC_RETIRED.ADD_128
+
+# Integer ADD, SUB, SAD 256-bit vector instructions.
+E7.0C INT_VEC_RETIRED.ADD_256
+
+# INT_VEC_RETIRED.VNNI_128
+E7.10 INT_VEC_RETIRED.VNNI_128
+
+# INT_VEC_RETIRED.128BIT
+E7.13 INT_VEC_RETIRED.128BIT
+
+# INT_VEC_RETIRED.VNNI_256
+E7.20 INT_VEC_RETIRED.VNNI_256
+
+# INT_VEC_RETIRED.SHUFFLES
+E7.40 INT_VEC_RETIRED.SHUFFLES
+
+# INT_VEC_RETIRED.MUL_256
+E7.80 INT_VEC_RETIRED.MUL_256
+
+# INT_VEC_RETIRED.256BIT
+E7.AC INT_VEC_RETIRED.256BIT
+
+# Cycle counts are evenly distributed between active threads in the Core.
+EC.02 CPU_CLK_UNHALTED.DISTRIBUTED
+
+# Core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state.
+EC.10 CPU_CLK_UNHALTED.C01
+
+# Core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state.
+EC.20 CPU_CLK_UNHALTED.C02
+
+# CPU_CLK_UNHALTED.PAUSE
+EC.40 CPU_CLK_UNHALTED.PAUSE
+
+# CPU_CLK_UNHALTED.PAUSE_INST
+EC.40.CMSK=1.EDG CPU_CLK_UNHALTED.PAUSE_INST
+
+# Core clocks when the thread is in the C0.1 or C0.2 or running a PAUSE in C0 ACPI state.
+EC.70 CPU_CLK_UNHALTED.C0_WAIT
--- a/configs/cfg_EmeraldRapids_all_offcore.txt
+++ b/configs/cfg_EmeraldRapids_all_offcore.txt
@@ -0,0 +1,200 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/EMR/events/emeraldrapids_core.json (Version: 1.13)
+# Applies to processors with family-model in {6-CF}
+
+# Counts demand data reads that have any type of response.
+2A.01.MSR_RSP0=0x10001.TakenAlone OCR.DEMAND_DATA_RD.ANY_RESPONSE
+
+# Counts demand data reads that resulted in a snoop that hit a modified line in another core's caches which forwarded the data.
+2A.01.MSR_RSP0=0x10003C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that resulted in a snoop that hit a modified line in another core's caches which forwarded the data.
+2A.01.MSR_RSP0=0x10003C0002.TakenAlone OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that resulted in a snoop that hit a modified line in another core's caches which forwarded the data.
+2A.01.MSR_RSP0=0x10003C0004.TakenAlone OCR.DEMAND_CODE_RD.L3_HIT.SNOOP_HITM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that resulted in a snoop that hit a modified line in another core's caches which forwarded the data.
+2A.01.MSR_RSP0=0x10003C4477.TakenAlone OCR.READS_TO_CORE.L3_HIT.SNOOP_HITM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that have any type of response.
+2A.01.MSR_RSP0=0x10004.TakenAlone OCR.DEMAND_CODE_RD.ANY_RESPONSE
+
+# Counts hardware prefetches (which bring data to L2) that have any type of response.
+2A.01.MSR_RSP0=0x10070.TakenAlone OCR.HWPF_L2.ANY_RESPONSE
+
+# Counts demand data reads that hit a modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x1008000001.TakenAlone OCR.DEMAND_DATA_RD.SNC_CACHE.HITM
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that hit a modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x1008000002.TakenAlone OCR.DEMAND_RFO.SNC_CACHE.HITM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that hit a modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x1008000004.TakenAlone OCR.DEMAND_CODE_RD.SNC_CACHE.HITM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that hit a modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x1008004477.TakenAlone OCR.READS_TO_CORE.SNC_CACHE.HITM
+
+# Counts demand data reads that were supplied by a cache on a remote socket where a snoop hit a modified line in another core's caches which forwarded the data.
+2A.01.MSR_RSP0=0x1030000001.TakenAlone OCR.DEMAND_DATA_RD.REMOTE_CACHE.SNOOP_HITM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by a cache on a remote socket where a snoop hit a modified line in another core's caches which forwarded the data.
+2A.01.MSR_RSP0=0x1030004477.TakenAlone OCR.READS_TO_CORE.REMOTE_CACHE.SNOOP_HITM
+
+# Counts data load hardware prefetch requests to the L1 data cache that have any type of response.
+2A.01.MSR_RSP0=0x10400.TakenAlone OCR.HWPF_L1D.ANY_RESPONSE
+
+# Counts demand data reads that were supplied by DRAM attached to this socket, unless in Sub NUMA Cluster(SNC) Mode.  In SNC Mode counts only those DRAM accesses that are controlled by the close SNC Cluster.
+2A.01.MSR_RSP0=0x104000001.TakenAlone OCR.DEMAND_DATA_RD.LOCAL_DRAM
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by DRAM attached to this socket, unless in Sub NUMA Cluster(SNC) Mode.  In SNC Mode counts only those DRAM accesses that are controlled by the close SNC Cluster.
+2A.01.MSR_RSP0=0x104000002.TakenAlone OCR.DEMAND_RFO.LOCAL_DRAM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were supplied by DRAM attached to this socket, unless in Sub NUMA Cluster(SNC) Mode.  In SNC Mode counts only those DRAM accesses that are controlled by the close SNC Cluster.
+2A.01.MSR_RSP0=0x104000004.TakenAlone OCR.DEMAND_CODE_RD.LOCAL_DRAM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by DRAM attached to this socket, unless in Sub NUMA Cluster(SNC) Mode.  In SNC Mode counts only those DRAM accesses that are controlled by the close SNC Cluster.
+2A.01.MSR_RSP0=0x104004477.TakenAlone OCR.READS_TO_CORE.LOCAL_DRAM
+
+# Counts streaming stores that have any type of response.
+2A.01.MSR_RSP0=0x10800.TakenAlone OCR.STREAMING_WR.ANY_RESPONSE
+
+# Counts writebacks of modified cachelines and streaming stores that have any type of response.
+2A.01.MSR_RSP0=0x10808.TakenAlone OCR.MODIFIED_WRITE.ANY_RESPONSE
+
+# Counts hardware prefetches to the L3 only that have any type of response.
+2A.01.MSR_RSP0=0x12380.TakenAlone OCR.HWPF_L3.ANY_RESPONSE
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by a cache on a remote socket where a snoop was sent and data was returned (Modified or Not Modified).
+2A.01.MSR_RSP0=0x1830004477.TakenAlone OCR.READS_TO_CORE.REMOTE_CACHE.SNOOP_FWD
+
+# Counts demand reads for ownership (RFO), hardware prefetch RFOs (which bring data to L2), and software prefetches for exclusive ownership (PREFETCHW) that hit to a (M)odified cacheline in the L3 or snoop filter.
+2A.01.MSR_RSP0=0x1F80040022.TakenAlone OCR.RFO_TO_CORE.L3_HIT_M
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that hit in the L3 or were snooped from another core's caches on the same socket.
+2A.01.MSR_RSP0=0x3F003C4477.TakenAlone OCR.READS_TO_CORE.L3_HIT
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were not supplied by the local socket's L1, L2, or L3 caches and the cacheline is homed locally.
+2A.01.MSR_RSP0=0x3F04C04477.TakenAlone OCR.READS_TO_CORE.L3_MISS_LOCAL
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were not supplied by the local socket's L1, L2, or L3 caches and were supplied by a remote socket.
+2A.01.MSR_RSP0=0x3F33004477.TakenAlone OCR.READS_TO_CORE.REMOTE
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the local socket's L1, L2, or L3 caches.
+2A.01.MSR_RSP0=0x3F3FC00002.TakenAlone OCR.DEMAND_RFO.L3_MISS
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were not supplied by the local socket's L1, L2, or L3 caches.
+2A.01.MSR_RSP0=0x3F3FC04477.TakenAlone OCR.READS_TO_CORE.L3_MISS
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.
+2A.01.MSR_RSP0=0x3F3FFC0002.TakenAlone OCR.DEMAND_RFO.ANY_RESPONSE
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that have any type of response.
+2A.01.MSR_RSP0=0x3F3FFC4477.TakenAlone OCR.READS_TO_CORE.ANY_RESPONSE
+
+# Counts demand data reads that hit in the L3 or were snooped from another core's caches on the same socket.
+2A.01.MSR_RSP0=0x3F803C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that hit in the L3 or were snooped from another core's caches on the same socket.
+2A.01.MSR_RSP0=0x3F803C0002.TakenAlone OCR.DEMAND_RFO.L3_HIT
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that hit in the L3 or were snooped from another core's caches on the same socket.
+2A.01.MSR_RSP0=0x3F803C0004.TakenAlone OCR.DEMAND_CODE_RD.L3_HIT
+
+# Counts demand data reads that were not supplied by the local socket's L1, L2, or L3 caches.
+2A.01.MSR_RSP0=0x3FBFC00001.TakenAlone OCR.DEMAND_DATA_RD.L3_MISS
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were not supplied by the local socket's L1, L2, or L3 caches.
+2A.01.MSR_RSP0=0x3FBFC00004.TakenAlone OCR.DEMAND_CODE_RD.L3_MISS
+
+# Counts demand data reads that resulted in a snoop that hit in another core, which did not forward the data.
+2A.01.MSR_RSP0=0x4003C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that resulted in a snoop that hit in another core, which did not forward the data.
+2A.01.MSR_RSP0=0x4003C4477.TakenAlone OCR.READS_TO_CORE.L3_HIT.SNOOP_HIT_NO_FWD
+
+# Counts demand data reads that were supplied by DRAM on a distant memory controller of this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x708000001.TakenAlone OCR.DEMAND_DATA_RD.SNC_DRAM
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by DRAM on a distant memory controller of this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x708000002.TakenAlone OCR.DEMAND_RFO.SNC_DRAM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were supplied by DRAM on a distant memory controller of this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x708000004.TakenAlone OCR.DEMAND_CODE_RD.SNC_DRAM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by DRAM on a distant memory controller of this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x708004477.TakenAlone OCR.READS_TO_CORE.SNC_DRAM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by DRAM attached to this socket, whether or not in Sub NUMA Cluster(SNC) Mode.  In SNC Mode counts DRAM accesses that are controlled by the close or distant SNC Cluster.
+2A.01.MSR_RSP0=0x70C004477.TakenAlone OCR.READS_TO_CORE.LOCAL_SOCKET_DRAM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that missed the L3 Cache and were supplied by the local socket (DRAM or PMM), whether or not in Sub NUMA Cluster(SNC) Mode.  In SNC Mode counts PMM or DRAM accesses that are controlled by the close or distant SNC Cluster.  It does not count misses to the L3 which go to Local CXL Type 2 Memory or Local Non DRAM.
+2A.01.MSR_RSP0=0x70CC04477.TakenAlone OCR.READS_TO_CORE.L3_MISS_LOCAL_SOCKET
+
+# Counts demand data reads that were supplied by DRAM attached to another socket.
+2A.01.MSR_RSP0=0x730000001.TakenAlone OCR.DEMAND_DATA_RD.REMOTE_DRAM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by DRAM attached to another socket.
+2A.01.MSR_RSP0=0x730004477.TakenAlone OCR.READS_TO_CORE.REMOTE_DRAM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by DRAM or PMM attached to another socket.
+2A.01.MSR_RSP0=0x733004477.TakenAlone OCR.READS_TO_CORE.REMOTE_MEMORY
+
+# Counts demand data reads that were supplied by DRAM.
+2A.01.MSR_RSP0=0x73C000001.TakenAlone OCR.DEMAND_DATA_RD.DRAM
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by DRAM.
+2A.01.MSR_RSP0=0x73C000002.TakenAlone OCR.DEMAND_RFO.DRAM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were supplied by DRAM.
+2A.01.MSR_RSP0=0x73C000004.TakenAlone OCR.DEMAND_CODE_RD.DRAM
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by DRAM.
+2A.01.MSR_RSP0=0x73C004477.TakenAlone OCR.READS_TO_CORE.DRAM
+
+# Counts demand data reads that resulted in a snoop hit in another core's caches which forwarded the unmodified data to the requesting core.
+2A.01.MSR_RSP0=0x8003C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that resulted in a snoop hit in another core's caches which forwarded the unmodified data to the requesting core.
+2A.01.MSR_RSP0=0x8003C4477.TakenAlone OCR.READS_TO_CORE.L3_HIT.SNOOP_HIT_WITH_FWD
+
+# Counts streaming stores that hit in the L3 or were snooped from another core's caches on the same socket.
+2A.01.MSR_RSP0=0x80080800.TakenAlone OCR.STREAMING_WR.L3_HIT
+
+# Counts hardware prefetches to the L3 only that hit in the L3 or were snooped from another core's caches on the same socket.
+2A.01.MSR_RSP0=0x80082380.TakenAlone OCR.HWPF_L3.L3_HIT
+
+# Counts demand data reads that either hit a non-modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x808000001.TakenAlone OCR.DEMAND_DATA_RD.SNC_CACHE.HIT_WITH_FWD
+
+# Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that either hit a non-modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x808000002.TakenAlone OCR.DEMAND_RFO.SNC_CACHE.HIT_WITH_FWD
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that either hit a non-modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x808000004.TakenAlone OCR.DEMAND_CODE_RD.SNC_CACHE.HIT_WITH_FWD
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that either hit a non-modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.
+2A.01.MSR_RSP0=0x808004477.TakenAlone OCR.READS_TO_CORE.SNC_CACHE.HIT_WITH_FWD
+
+# Counts demand data reads that were supplied by a cache on a remote socket where a snoop hit in another core's caches which forwarded the unmodified data to the requesting core.
+2A.01.MSR_RSP0=0x830000001.TakenAlone OCR.DEMAND_DATA_RD.REMOTE_CACHE.SNOOP_HIT_WITH_FWD
+
+# Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by a cache on a remote socket where a snoop hit in another core's caches which forwarded the unmodified data to the requesting core.
+2A.01.MSR_RSP0=0x830004477.TakenAlone OCR.READS_TO_CORE.REMOTE_CACHE.SNOOP_HIT_WITH_FWD
+
+# Counts streaming stores that were not supplied by the local socket's L1, L2, or L3 caches and the cacheline is homed locally.
+2A.01.MSR_RSP0=0x84000800.TakenAlone OCR.STREAMING_WR.L3_MISS_LOCAL
+
+# Counts hardware prefetches to the L3 only that were not supplied by the local socket's L1, L2, or L3 caches and the cacheline is homed locally.
+2A.01.MSR_RSP0=0x84002380.TakenAlone OCR.HWPF_L3.L3_MISS_LOCAL
+
+# Counts hardware prefetches to the L3 only that were not supplied by the local socket's L1, L2, or L3 caches and the cacheline was homed in a remote socket.
+2A.01.MSR_RSP0=0x90002380.TakenAlone OCR.HWPF_L3.REMOTE
+
+# Counts streaming stores that missed the local socket's L1, L2, and L3 caches.
+2A.01.MSR_RSP0=0x94000800.TakenAlone OCR.STREAMING_WR.L3_MISS
+
+# Counts hardware prefetches to the L3 only that missed the local socket's L1, L2, and L3 caches.
+2A.01.MSR_RSP0=0x94002380.TakenAlone OCR.HWPF_L3.L3_MISS
+
+# Counts Demand RFOs, ItoM's, PREFETCHW's, Hardware RFO Prefetches to the L1/L2 and Streaming stores that likely resulted in a store to Memory (DRAM or PMM)
+2A.01.MSR_RSP0=0xFBFF80822.TakenAlone OCR.WRITE_ESTIMATE.MEMORY
--- a/configs/cfg_EmeraldRapids_common.txt
+++ b/configs/cfg_EmeraldRapids_common.txt
@@ -0,0 +1,27 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/EMR/events/emeraldrapids_core.json (Version: 1.13)
+# Applies to processors with family-model in {6-CF}
+
+3C.00 CORE_CYCLES
+C0.00 INST_RETIRED
+79.04 IDQ.MITE_UOPS
+79.08 IDQ.DSB_UOPS
+79.20 IDQ.MS_UOPS
+A8.01 LSD.UOPS
+AE.01 UOPS_ISSUED
+B1.01 UOPS_EXECUTED
+C2.02 UOPS_RETIRED.SLOTS
+B2.01 UOPS_DISPATCHED_PORT.PORT_0
+B2.02 UOPS_DISPATCHED_PORT.PORT_1
+B2.04 UOPS_DISPATCHED_PORT.PORT_2_3_10
+B2.10 UOPS_DISPATCHED_PORT.PORT_4_9
+B2.20 UOPS_DISPATCHED_PORT.PORT_5_11
+B2.40 UOPS_DISPATCHED_PORT.PORT_6
+B2.80 UOPS_DISPATCHED_PORT.PORT_7_8
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+D1.01 MEM_LOAD_RETIRED.L1_HIT
+D1.08 MEM_LOAD_RETIRED.L1_MISS
+D1.02 MEM_LOAD_RETIRED.L2_HIT
+D1.10 MEM_LOAD_RETIRED.L2_MISS
+D1.04 MEM_LOAD_RETIRED.L3_HIT
+D1.20 MEM_LOAD_RETIRED.L3_MISS
--- a/configs/cfg_MeteorLakeE_Crestmont_all_core.txt
+++ b/configs/cfg_MeteorLakeE_Crestmont_all_core.txt
@@ -0,0 +1,521 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/MTL/events/meteorlake_crestmont_core.json (Version: 1.14)
+# Applies to processors with family-model in {6-AA, 6-AC, 6-B5}
+
+# Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.
+03.01 LD_BLOCKS.DATA_UNKNOWN
+
+# Counts the number of retired loads that are blocked because its address partially overlapped with an older store.
+03.02 LD_BLOCKS.STORE_FORWARD
+
+# Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.
+03.04 LD_BLOCKS.ADDRESS_ALIAS
+
+# Counts the number of cycles that uops are blocked due to a store buffer full condition.
+04.01 MEM_SCHEDULER_BLOCK.ST_BUF
+
+# Counts the number of cycles that uops are blocked due to a load buffer full condition.
+04.02 MEM_SCHEDULER_BLOCK.LD_BUF
+
+# Counts the number of cycles that uops are blocked due to an RSV full condition.
+04.04 MEM_SCHEDULER_BLOCK.RSV
+
+# Counts the number of cycles that uops are blocked for any of the following reasons:  load buffer, store buffer or RSV full.
+04.07 MEM_SCHEDULER_BLOCK.ALL
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DL1 miss.
+05.81 LD_HEAD.L1_MISS_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a store address match.
+05.84 LD_HEAD.ST_ADDR_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.
+05.90 LD_HEAD.DTLB_MISS_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a pagewalk.
+05.A0 LD_HEAD.PGWALK_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases.
+05.C0 LD_HEAD.OTHER_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.
+05.F4 LD_HEAD.L1_BOUND_AT_RET
+
+# Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to any number of reasons, including an L1 miss, WCB full, pagewalk, store address block or store data block, on a load that retires.
+05.FF LD_HEAD.ANY_AT_RET
+
+# Counts the number of page walks completed due to load DTLB misses to a 4K page.
+08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
+
+# Counts the number of page walks completed due to load DTLB misses to a 2M or 4M page.
+08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
+
+# Counts the number of page walks completed due to load DTLB misses.
+08.0E DTLB_LOAD_MISSES.WALK_COMPLETED
+
+# Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle.
+08.10 DTLB_LOAD_MISSES.WALK_PENDING
+
+# Counts the number of first level TLB misses but second level hits due to a demand load that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.
+08.20 DTLB_LOAD_MISSES.STLB_HIT
+
+# Counts the number of uops issued by the front end every cycle.
+0E.00 UOPS_ISSUED.ANY
+
+# Counts misaligned loads that are 4K page splits.
+13.02 MISALIGN_MEM_REF.LOAD_PAGE_SPLIT
+
+# Counts misaligned stores that are 4K page splits.
+13.04 MISALIGN_MEM_REF.STORE_PAGE_SPLIT
+
+# Counts the number of total L2 Cache Accesses that resulted in a Miss from a front door request only (does not include rejects or recycles), per core event
+24.01 L2_REQUEST.MISS
+
+# Counts the number of L2 Cache Accesses that resulted in a Hit from a front door request only (does not include rejects or recycles), per core event
+24.02 L2_REQUEST.HIT
+
+# Counts the number of L2 Cache Accesses that miss the L2 and get a BBL reject (short and long rejects), per core event
+24.04 L2_REQUEST.REJECTS
+
+# Counts the number of cache lines filled into the L2 cache that are in Shared state
+25.02 L2_LINES_IN.S
+
+# Counts the number of cache lines filled into the L2 cache that are in Exclusive state
+25.04 L2_LINES_IN.E
+
+# Counts the number of cache lines filled into the L2 cache that are in Modified state
+25.08 L2_LINES_IN.M
+
+# Counts the number of cache lines filled into the L2 cache that are in Forward state
+25.10 L2_LINES_IN.F
+
+# Counts the number of L2 cache lines that are silently dropped due to an L2 cache fill
+26.01 L2_LINES_OUT.SILENT
+
+# Counts the number of L2 cache lines that are evicted due to an L2 cache fill
+26.02 L2_LINES_OUT.NON_SILENT
+
+# Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.
+2E.41 LONGEST_LAT_CACHE.MISS
+
+# Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.
+2E.4F LONGEST_LAT_CACHE.REFERENCE
+
+# Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.
+34.01 MEM_BOUND_STALLS_LOAD.L2_HIT
+
+# Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC. If the core has access to an L3 cache, an LLC hit refers to an L3 cache hit, otherwise it counts zeros.
+34.06 MEM_BOUND_STALLS_LOAD.LLC_HIT
+
+# Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches. If the core has access to an L3 cache, an LLC miss refers to an L3 cache miss, otherwise it is an L2 cache miss.
+34.78 MEM_BOUND_STALLS_LOAD.LLC_MISS
+
+# Counts the number of cycles the core is stalled due to a demand load which missed in the L2 cache.
+34.7E MEM_BOUND_STALLS_LOAD.L2_MISS
+
+# Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.
+34.7F MEM_BOUND_STALLS_LOAD.ALL
+
+# Counts the number of unhalted cycles when the core is stalled due to a store buffer full condition
+34.80 MEM_BOUND_STALLS_LOAD.SBFULL
+
+# Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.
+35.01 MEM_BOUND_STALLS_IFETCH.L2_HIT
+
+# Counts the number of unhalted cycles when the core is stalled due to an ICACHE or ITLB miss which hit in the LLC. If the core has access to an L3 cache, an LLC hit refers to an L3 cache hit, otherwise it counts zeros.
+35.06 MEM_BOUND_STALLS_IFETCH.LLC_HIT
+
+# Counts the number of unhalted cycles when the core is stalled due to an ICACHE or ITLB miss which missed all the caches. If the core has access to an L3 cache, an LLC miss refers to an L3 cache miss, otherwise it is an L2 cache miss.
+35.78 MEM_BOUND_STALLS_IFETCH.LLC_MISS
+
+# Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which missed in the L2 cache.
+35.7E MEM_BOUND_STALLS_IFETCH.L2_MISS
+
+# Counts the number of unhalted cycles when the core is stalled due to an instruction cache or TLB miss.
+35.7F MEM_BOUND_STALLS_IFETCH.ALL
+
+# Counts the number of unhalted core clock cycles [This event is alias to CPU_CLK_UNHALTED.THREAD_P]
+3C.00 CPU_CLK_UNHALTED.CORE_P
+
+# Counts the number of unhalted core clock cycles [This event is alias to CPU_CLK_UNHALTED.CORE_P]
+3C.00 CPU_CLK_UNHALTED.THREAD_P
+
+# Counts the number of unhalted reference clock cycles at TSC frequency.
+3C.01 CPU_CLK_UNHALTED.REF_TSC_P
+
+# Counts the number of page walks completed due to store DTLB misses to a 4K page.
+49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
+
+# Counts the number of page walks completed due to store DTLB misses to a 2M or 4M page.
+49.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
+
+# Counts the number of page walks completed due to store DTLB misses to any page size.
+49.0E DTLB_STORE_MISSES.WALK_COMPLETED
+
+# Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle.
+49.10 DTLB_STORE_MISSES.WALK_PENDING
+
+# Counts the number of first level TLB misses but second level hits due to stores that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.
+49.20 DTLB_STORE_MISSES.STLB_HIT
+
+# Counts the number of L1D cacheline (dirty) evictions caused by load misses, stores, and prefetches.
+51.01 DL1.DIRTY_EVICTION
+
+# Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL_P]
+71.00 TOPDOWN_FE_BOUND.ALL
+
+# Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL]
+71.00 TOPDOWN_FE_BOUND.ALL_P
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to ms
+71.01 TOPDOWN_FE_BOUND.CISC
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear
+71.02 TOPDOWN_FE_BOUND.BRANCH_DETECT
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to predecode wrong
+71.04 TOPDOWN_FE_BOUND.PREDECODE
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stall
+71.08 TOPDOWN_FE_BOUND.DECODE
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to itlb miss [This event is alias to TOPDOWN_FE_BOUND.ITLB]
+71.10 TOPDOWN_FE_BOUND.ITLB_MISS
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to an icache miss
+71.20 TOPDOWN_FE_BOUND.ICACHE
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to BTClear
+71.40 TOPDOWN_FE_BOUND.BRANCH_RESTEER
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.
+71.72 TOPDOWN_FE_BOUND.FRONTEND_LATENCY
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend that do not categorize into any other common frontend stall
+71.80 TOPDOWN_FE_BOUND.OTHER
+
+# Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.
+71.8D TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH
+
+# Counts the number of consumed retirement slots. [This event is alias to TOPDOWN_RETIRING.ALL_P]
+72.00 TOPDOWN_RETIRING.ALL
+
+# Counts the number of consumed retirement slots. [This event is alias to TOPDOWN_RETIRING.ALL]
+72.00 TOPDOWN_RETIRING.ALL_P
+
+# Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]
+73.00 TOPDOWN_BAD_SPECULATION.ALL
+
+# Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]
+73.00 TOPDOWN_BAD_SPECULATION.ALL_P
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).
+73.01 TOPDOWN_BAD_SPECULATION.NUKE
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to Fast Nukes such as  Memory Ordering Machine clears and MRN nukes
+73.02 TOPDOWN_BAD_SPECULATION.FASTNUKE
+
+# Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.
+73.03 TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to Branch Mispredict
+73.04 TOPDOWN_BAD_SPECULATION.MISPREDICT
+
+# Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]
+74.00 TOPDOWN_BE_BOUND.ALL
+
+# Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]
+74.00 TOPDOWN_BE_BOUND.ALL_P
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to certain allocation restrictions
+74.01 TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop).  This could be caused by RSV full or load/store buffer block.
+74.02 TOPDOWN_BE_BOUND.MEM_SCHEDULER
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to IEC and FPC RAT stalls - which can be due to the FIQ and IEC reservation station stall (integer, FP and SIMD scheduler not being able to accept another uop).
+74.08 TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb
+74.10 TOPDOWN_BE_BOUND.SERIALIZATION
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to mrbl stall.  A 'marble' refers to a physical register file entry, also known as the physical destination (PDST).
+74.20 TOPDOWN_BE_BOUND.REGISTER
+
+# Counts the number of issue slots every cycle that were not consumed by the backend due to ROB full
+74.40 TOPDOWN_BE_BOUND.REORDER_BUFFER
+
+# Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.
+75.04 SERIALIZATION.C01_MS_SCB
+
+# Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump and the instruction cache registers bytes are not present.
+80.02 ICACHE.MISSES
+
+# Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.
+80.03 ICACHE.ACCESSES
+
+# Counts the number of page walks initiated by an instruction fetch that missed the first and second level TLBs.
+85.01 ITLB_MISSES.MISS_CAUSED_WALK
+
+# Counts the number of page walks completed due to instruction fetch misses to a 4K page.
+85.02 ITLB_MISSES.WALK_COMPLETED_4K
+
+# Counts the number of page walks completed due to instruction fetch misses to a 2M or 4M page.
+85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
+
+# Counts the number of page walks completed due to instruction fetch misses to any page size.
+85.0E ITLB_MISSES.WALK_COMPLETED
+
+# Counts the number of page walks outstanding for iside in PMH every cycle.
+85.10 ITLB_MISSES.WALK_PENDING
+
+# Counts the number of first level TLB misses but second level hits due to an instruction fetch that did not start a page walk. Accounts for all page sizes. Will result in an ITLB write from STLB.
+85.20 ITLB_MISSES.STLB_HIT
+
+# Counts the number of uops executed on floating point and vector integer store data port.
+B2.01 FP_VINT_UOPS_EXECUTED.STD
+
+# Counts the number of instructions retired
+C0.00 INST_RETIRED.ANY_P
+
+# Counts the total number of uops retired.
+C2.00 UOPS_RETIRED.ALL
+
+# Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS).  This includes uops from flows due to complex instructions, faults, assists, and inserted flows.
+C2.01 UOPS_RETIRED.MS
+
+# Counts the number of x87 uops retired, includes those in ms flows
+C2.02 UOPS_RETIRED.X87
+
+# Counts the number of floating point divide uops retired (x87 and sse, including x87 sqrt).
+C2.08 UOPS_RETIRED.FPDIV
+
+# Counts the number of integer divide uops retired.
+C2.10 UOPS_RETIRED.IDIV
+
+# Counts the total number of machine clears for any reason including, but not limited to, memory ordering, memory disambiguation, SMC, and FP assist.
+C3.00 MACHINE_CLEARS.ANY
+
+# Counts the number of machine clears due to program modifying data (self modifying code) within 1K of a recently fetched code page.
+C3.01 MACHINE_CLEARS.SMC
+
+# Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.
+C3.02 MACHINE_CLEARS.MEMORY_ORDERING
+
+# Counts the number of floating point operations retired that required microcode assist.
+C3.04 MACHINE_CLEARS.FP_ASSIST
+
+# Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.
+C3.08 MACHINE_CLEARS.DISAMBIGUATION
+
+# Counts the number of machine clears due to a page fault.  Counts both I-Side and D-Side (Loads/Stores) page faults.  A page fault occurs when either the page is not present, or an access violation occurs.
+C3.20 MACHINE_CLEARS.PAGE_FAULT
+
+# Counts the number of machine clears due to memory renaming.
+C3.80 MACHINE_CLEARS.MRN_NUKE
+
+# Counts the total number of branch instructions retired for all branch types.
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+
+# Counts the number of JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.
+C4.7E BR_INST_RETIRED.COND
+
+# Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.
+C4.BF BR_INST_RETIRED.FAR_BRANCH
+
+# Counts the number of near taken branch instructions retired.
+C4.C0 BR_INST_RETIRED.NEAR_TAKEN
+
+# Counts the number of near relative JMP branch instructions retired.
+C4.DF BR_INST_RETIRED.REL_JMP
+
+# Counts the number of near indirect JMP and near indirect CALL branch instructions retired.
+C4.EB BR_INST_RETIRED.INDIRECT
+
+# Counts the number of near indirect JMP branch instructions retired.
+C4.EF BR_INST_RETIRED.INDIRECT_JMP
+
+# Counts the number of near RET branch instructions retired.
+C4.F7 BR_INST_RETIRED.NEAR_RETURN
+
+# Counts the number of near CALL branch instructions retired.
+C4.F9 BR_INST_RETIRED.NEAR_CALL
+
+# Counts the number of near indirect CALL branch instructions retired.
+C4.FB BR_INST_RETIRED.INDIRECT_CALL
+
+# Counts the number of near relative CALL branch instructions retired.
+C4.FD BR_INST_RETIRED.REL_CALL
+
+# Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.
+C4.FE BR_INST_RETIRED.COND_TAKEN
+
+# Counts the total number of mispredicted branch instructions retired for all branch types.
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+
+# Counts the number of mispredicted JCC (Jump on Conditional Code) branch instructions retired.
+C5.7E BR_MISP_RETIRED.COND
+
+# Counts the number of mispredicted near taken branch instructions retired.
+C5.80 BR_MISP_RETIRED.NEAR_TAKEN
+
+# Counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired.
+C5.EB BR_MISP_RETIRED.INDIRECT
+
+# Counts the number of mispredicted near indirect JMP branch instructions retired.
+C5.EF BR_MISP_RETIRED.INDIRECT_JMP
+
+# Counts the number of mispredicted near RET branch instructions retired.
+C5.F7 BR_MISP_RETIRED.RETURN
+
+# Counts the number of mispredicted near indirect CALL branch instructions retired.
+C5.FB BR_MISP_RETIRED.INDIRECT_CALL
+
+# Counts the number of mispredicted taken JCC (Jump on Conditional Code) branch instructions retired.
+C5.FE BR_MISP_RETIRED.COND_TAKEN
+
+# Counts the number of instructions retired that were tagged with having preceded with frontend bound behavior
+C6.00 FRONTEND_RETIRED.ALL
+
+# Counts the number of instructions retired that were tagged following an ms flow due to the bubble/wasted issue slot from exiting long ms flow
+C6.01 FRONTEND_RETIRED.CISC
+
+# Counts the number of instructions retired that are tagged after a branch instruction causes bubbles/empty issue slots due to a baclear
+C6.02 FRONTEND_RETIRED.BRANCH_DETECT
+
+# Counts the number of instructions retired that are tagged after a branch instruction causes bubbles/empty issue slots due to predecode wrong.
+C6.04 FRONTEND_RETIRED.PREDECODE
+
+# Counts the number of instructions retired that were tagged every cycle the decoder is unable to send 3 uops per cycle.
+C6.08 FRONTEND_RETIRED.DECODE
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss
+C6.10 FRONTEND_RETIRED.ITLB_MISS
+
+# Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to icache miss
+C6.20 FRONTEND_RETIRED.ICACHE
+
+# Counts the number of instructions retired that are tagged after a branch instruction causes bubbles/empty issue slots due to a btclear
+C6.40 FRONTEND_RETIRED.BRANCH_RESTEER
+
+# Counts the number of instructions retired that are tagged after a wasted issue slot if none of the previous events occurred
+C6.80 FRONTEND_RETIRED.OTHER
+
+# Counts the number of retired instructions whose sources are a scalar 32bit single precision floating point.
+C7.01 FP_INST_RETIRED.32B_SP
+
+# Counts the number of retired instructions whose sources are a scalar 64 bit double precision floating point.
+C7.02 FP_INST_RETIRED.64B_DP
+
+# Counts the number of retired instructions whose sources are a packed 128 bit single precision floating point. This may be SSE or AVX.128 operations.
+C7.04 FP_INST_RETIRED.128B_SP
+
+# Counts the number of retired instructions whose sources are a packed 128 bit double precision floating point. This may be SSE or AVX.128 operations.
+C7.08 FP_INST_RETIRED.128B_DP
+
+# Counts the number of retired instructions whose sources are a packed 256 bit double precision floating point.
+C7.20 FP_INST_RETIRED.256B_DP
+
+# Counts the number of floating point operations that produce 64 bit double precision results [This event is alias to FP_FLOPS_RETIRED.DP]
+C8.01 FP_FLOPS_RETIRED.FP64
+
+# Counts the number of floating point operations that produce 32 bit single precision results [This event is alias to FP_FLOPS_RETIRED.SP]
+C8.02 FP_FLOPS_RETIRED.FP32
+
+# Counts the number of all types of floating point operations per uop with all default weighting
+C8.03 FP_FLOPS_RETIRED.ALL
+
+# Counts the number of cycles when any of the floating point dividers are active.
+CD.02.CMSK=1 ARITH.FPDIV_ACTIVE
+
+# Counts the number of cycles when any of the floating point or integer dividers are active.
+CD.03.CMSK=1 ARITH.DIV_ACTIVE
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x10.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x100.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x20.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x200.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x4.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x40.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x400.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x8.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x80.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128
+
+# Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.
+D0.05.MSR_3F6H=0x800.CTR=0.TakenAlone MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048
+
+# Counts the number of store uops retired. Same as MEM_UOPS_RETIRED.ALL_STORES
+D0.06 MEM_UOPS_RETIRED.STORE_LATENCY
+
+# Counts the number of load uops retired that miss in the second Level TLB.
+D0.11 MEM_UOPS_RETIRED.STLB_MISS_LOADS
+
+# Counts the number of store uops retired that miss in the second level TLB.
+D0.12 MEM_UOPS_RETIRED.STLB_MISS_STORES
+
+# Counts the number of memory uops retired that missed in the second level TLB.
+D0.13 MEM_UOPS_RETIRED.STLB_MISS
+
+# Counts the number of load uops retired that performed one or more locks
+D0.21 MEM_UOPS_RETIRED.LOCK_LOADS
+
+# Counts the number of retired split load uops.
+D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS
+
+# Counts the number of retired split store uops.
+D0.42 MEM_UOPS_RETIRED.SPLIT_STORES
+
+# Counts the number of memory uops retired that were splits.
+D0.43 MEM_UOPS_RETIRED.SPLIT
+
+# Counts the number of load ops retired.
+D0.81 MEM_UOPS_RETIRED.ALL_LOADS
+
+# Counts the number of store ops retired.
+D0.82 MEM_UOPS_RETIRED.ALL_STORES
+
+# Counts the number of load ops retired that hit the L1 data cache.
+D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
+
+# Counts the number of load ops retired that hit in the L2 cache.
+D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
+
+# Counts the number of load ops retired that hit in the L3 cache.
+D1.1C MEM_LOAD_UOPS_RETIRED.L3_HIT
+
+# Counts the number of loads that hit in a write combining buffer (WCB), excluding the first load that caused the WCB to allocate.
+D1.20 MEM_LOAD_UOPS_RETIRED.WCB_HIT
+
+# Counts the number of load ops retired that miss in the L1 data cache.
+D1.40 MEM_LOAD_UOPS_RETIRED.L1_MISS
+
+# Counts the number of load ops retired that miss in the L2 cache.
+D1.80 MEM_LOAD_UOPS_RETIRED.L2_MISS
+
+# Counts the number of load ops retired that miss the L3 cache and hit in DRAM
+D4.02 MEM_LOAD_UOPS_MISC_RETIRED.LOCAL_DRAM
+
+# Counts the number of Last Branch Record (LBR) entries. Requires LBRs to be enabled and configured in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]
+E4.01 MISC_RETIRED.LBR_INSERTS
+
+# Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.
+E6.01 BACLEARS.ANY
+
+# Counts the number of cycles that the micro-sequencer is busy.
+E7.04 MS_DECODED.MS_BUSY
--- a/configs/cfg_MeteorLakeE_Crestmont_all_offcore.txt
+++ b/configs/cfg_MeteorLakeE_Crestmont_all_offcore.txt
@@ -0,0 +1,68 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/MTL/events/meteorlake_crestmont_core.json (Version: 1.14)
+# Applies to processors with family-model in {6-AA, 6-AC, 6-B5}
+
+# Counts demand data reads that have any type of response.
+B7.01.MSR_RSP0=0x10001.TakenAlone OCR.DEMAND_DATA_RD.ANY_RESPONSE
+
+# Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.
+B7.01.MSR_RSP0=0x10002.TakenAlone OCR.DEMAND_RFO.ANY_RESPONSE
+
+# Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.
+B7.01.MSR_RSP0=0x10003C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM
+
+# Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.
+B7.01.MSR_RSP0=0x10003C0002.TakenAlone OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.
+B7.01.MSR_RSP0=0x10003C0004.TakenAlone OCR.DEMAND_CODE_RD.L3_HIT.SNOOP_HITM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that have any type of response.
+B7.01.MSR_RSP0=0x10004.TakenAlone OCR.DEMAND_CODE_RD.ANY_RESPONSE
+
+# Counts streaming stores that have any type of response.
+B7.01.MSR_RSP0=0x10800.TakenAlone OCR.STREAMING_WR.ANY_RESPONSE
+
+# Counts demand data reads that were supplied by DRAM.
+B7.01.MSR_RSP0=0x184000001.TakenAlone OCR.DEMAND_DATA_RD.DRAM
+
+# Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by DRAM.
+B7.01.MSR_RSP0=0x184000002.TakenAlone OCR.DEMAND_RFO.DRAM
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were supplied by DRAM.
+B7.01.MSR_RSP0=0x184000004.TakenAlone OCR.DEMAND_CODE_RD.DRAM
+
+# Counts demand data reads that were supplied by the L3 cache.
+B7.01.MSR_RSP0=0x3F803C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT
+
+# Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache.
+B7.01.MSR_RSP0=0x3F803C0002.TakenAlone OCR.DEMAND_RFO.L3_HIT
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were supplied by the L3 cache.
+B7.01.MSR_RSP0=0x3F803C0004.TakenAlone OCR.DEMAND_CODE_RD.L3_HIT
+
+# Counts demand data reads that were not supplied by the L3 cache.
+B7.01.MSR_RSP0=0x3FBFC00001.TakenAlone OCR.DEMAND_DATA_RD.L3_MISS
+
+# Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.
+B7.01.MSR_RSP0=0x3FBFC00002.TakenAlone OCR.DEMAND_RFO.L3_MISS
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were not supplied by the L3 cache.
+B7.01.MSR_RSP0=0x3FBFC00004.TakenAlone OCR.DEMAND_CODE_RD.L3_MISS
+
+# Counts streaming stores which modify only part of a 64 byte cacheline that have any type of response.
+B7.01.MSR_RSP0=0x400000010000.TakenAlone OCR.PARTIAL_STREAMING_WR.ANY_RESPONSE
+
+# Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.
+B7.01.MSR_RSP0=0x4003C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.
+B7.01.MSR_RSP0=0x4003C0004.TakenAlone OCR.DEMAND_CODE_RD.L3_HIT.SNOOP_HIT_NO_FWD
+
+# Counts streaming stores which modify a full 64 byte cacheline that have any type of response.
+B7.01.MSR_RSP0=0x800000010000.TakenAlone OCR.FULL_STREAMING_WR.ANY_RESPONSE
+
+# Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data was forwarded.
+B7.01.MSR_RSP0=0x8003C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD
+
+# Counts demand instruction fetches and L1 instruction cache prefetches that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data was forwarded.
+B7.01.MSR_RSP0=0x8003C0004.TakenAlone OCR.DEMAND_CODE_RD.L3_HIT.SNOOP_HIT_WITH_FWD
--- a/configs/cfg_MeteorLakeE_Crestmont_common.txt
+++ b/configs/cfg_MeteorLakeE_Crestmont_common.txt
@@ -0,0 +1,15 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/MTL/events/meteorlake_crestmont_core.json (Version: 1.14)
+# Applies to processors with family-model in {6-AA, 6-AC, 6-B5}
+
+3C.00 CORE_CYCLES
+C0.00 INST_RETIRED
+C2.00 UOPS_RETIRED.ALL
+C2.01 UOPS_RETIRED.MS
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+B2.01 FP_VINT_UOPS_EXECUTED.STD
+D0.81 MEM_UOPS_RETIRED.ALL_LOADS
+D0.82 MEM_UOPS_RETIRED.ALL_STORES
+D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
+D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
+D1.1C MEM_LOAD_UOPS_RETIRED.L3_HIT
--- a/configs/cfg_MeteorLakeP_RedwoodCove_all_core.txt
+++ b/configs/cfg_MeteorLakeP_RedwoodCove_all_core.txt
@@ -0,0 +1,914 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/MTL/events/meteorlake_redwoodcove_core.json (Version: 1.14)
+# Applies to processors with family-model in {6-AA, 6-AC, 6-B5}
+
+# False dependencies in MOB due to partial compare on address.
+03.04 LD_BLOCKS.ADDRESS_ALIAS
+
+# Loads blocked due to overlapping with a preceding store that cannot be forwarded.
+03.82 LD_BLOCKS.STORE_FORWARD
+
+# The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.
+03.88 LD_BLOCKS.NO_SR
+
+# Code miss in all TLB levels causes a page walk that completes. (4K)
+11.02 ITLB_MISSES.WALK_COMPLETED_4K
+
+# Code miss in all TLB levels causes a page walk that completes. (2M/4M)
+11.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
+
+# Code miss in all TLB levels causes a page walk that completes. (All page sizes)
+11.0E ITLB_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for an outstanding code request in the PMH each cycle.
+11.10 ITLB_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for code (instruction fetch) request.
+11.10.CMSK=1 ITLB_MISSES.WALK_ACTIVE
+
+# Instruction fetch requests that miss the ITLB and hit the STLB.
+11.20 ITLB_MISSES.STLB_HIT
+
+# Page walks completed due to a demand data load to a 4K page.
+12.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
+
+# Page walks completed due to a demand data load to a 2M/4M page.
+12.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walks completed due to a demand data load to a 1G page.
+12.08 DTLB_LOAD_MISSES.WALK_COMPLETED_1G
+
+# Load miss in all TLB levels causes a page walk that completes. (All page sizes)
+12.0E DTLB_LOAD_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for a demand load in the PMH each cycle.
+12.10 DTLB_LOAD_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for a demand load.
+12.10.CMSK=1 DTLB_LOAD_MISSES.WALK_ACTIVE
+
+# Loads that miss the DTLB and hit the STLB.
+12.20 DTLB_LOAD_MISSES.STLB_HIT
+
+# Page walks completed due to a demand data store to a 4K page.
+13.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
+
+# Page walks completed due to a demand data store to a 2M/4M page.
+13.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walks completed due to a demand data store to a 1G page.
+13.08 DTLB_STORE_MISSES.WALK_COMPLETED_1G
+
+# Store misses in all TLB levels causes a page walk that completes. (All page sizes)
+13.0E DTLB_STORE_MISSES.WALK_COMPLETED
+
+# Number of page walks outstanding for a store in the PMH each cycle.
+13.10 DTLB_STORE_MISSES.WALK_PENDING
+
+# Cycles when at least one PMH is busy with a page walk for a store.
+13.10.CMSK=1 DTLB_STORE_MISSES.WALK_ACTIVE
+
+# Stores that miss the DTLB and hit the STLB.
+13.20 DTLB_STORE_MISSES.STLB_HIT
+
+# For every cycle, increments by the number of outstanding demand data read requests pending.
+20.01 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD
+
+# Cycles where at least 1 outstanding demand data read request is pending.
+20.01.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD
+
+# Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle.
+20.02 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD
+
+# Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore.
+20.02.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD
+
+# Store Read transactions pending for off-core. Highly correlated.
+20.04 OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO
+
+# Cycles with offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore.
+20.04.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
+
+# OFFCORE_REQUESTS_OUTSTANDING.DATA_RD
+20.08 OFFCORE_REQUESTS_OUTSTANDING.DATA_RD
+
+# Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore.
+20.08.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
+
+# For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache.
+20.10 OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD
+
+# Cycles where data return is pending for a Demand Data Read request that missed the L3 cache.
+20.10.CMSK=1 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD
+
+# Demand Data Read requests sent to uncore
+21.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
+
+# Cacheable and Non-Cacheable code read requests
+21.02 OFFCORE_REQUESTS.DEMAND_CODE_RD
+
+# Demand RFO requests including regular RFOs, locks, ItoM
+21.04 OFFCORE_REQUESTS.DEMAND_RFO
+
+# Demand and prefetch data reads
+21.08 OFFCORE_REQUESTS.DATA_RD
+
+# Counts demand data read requests that miss the L3 cache.
+21.10 OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
+
+# Any memory transaction that reached the SQ.
+21.80 OFFCORE_REQUESTS.ALL_REQUESTS
+
+# L2 writebacks that access L2 cache
+23.40 L2_TRANS.L2_WB
+
+# Demand Data Read miss L2 cache
+24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
+
+# RFO requests that miss L2 cache
+24.22 L2_RQSTS.RFO_MISS
+
+# L2 cache misses when fetching instructions
+24.24 L2_RQSTS.CODE_RD_MISS
+
+# Demand requests that miss L2 cache
+24.27 L2_RQSTS.ALL_DEMAND_MISS
+
+# SW prefetch requests that miss L2 cache.
+24.28 L2_RQSTS.SWPF_MISS
+
+# L2_RQSTS.HWPF_MISS
+24.30 L2_RQSTS.HWPF_MISS
+
+# Read requests with true-miss in L2 cache [This event is alias to L2_RQSTS.MISS]
+24.3F L2_REQUEST.MISS
+
+# Read requests with true-miss in L2 cache [This event is alias to L2_REQUEST.MISS]
+24.3F L2_RQSTS.MISS
+
+# Demand Data Read requests that hit L2 cache
+24.C1 L2_RQSTS.DEMAND_DATA_RD_HIT
+
+# RFO requests that hit L2 cache
+24.C2 L2_RQSTS.RFO_HIT
+
+# L2 cache hits when fetching instructions, code reads.
+24.C4 L2_RQSTS.CODE_RD_HIT
+
+# SW prefetch requests that hit L2 cache.
+24.C8 L2_RQSTS.SWPF_HIT
+
+# All requests that hit L2 cache. [This event is alias to L2_RQSTS.HIT]
+24.DF L2_REQUEST.HIT
+
+# All requests that hit L2 cache. [This event is alias to L2_REQUEST.HIT]
+24.DF L2_RQSTS.HIT
+
+# Demand Data Read access L2 cache
+24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
+
+# RFO requests to L2 cache
+24.E2 L2_RQSTS.ALL_RFO
+
+# L2 code requests
+24.E4 L2_RQSTS.ALL_CODE_RD
+
+# Demand requests to L2 cache
+24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
+
+# L2_RQSTS.ALL_HWPF
+24.F0 L2_RQSTS.ALL_HWPF
+
+# All accesses to L2 cache [This event is alias to L2_RQSTS.REFERENCES]
+24.FF L2_REQUEST.ALL
+
+# All accesses to L2 cache [This event is alias to L2_REQUEST.ALL]
+24.FF L2_RQSTS.REFERENCES
+
+# L2 cache lines filling L2
+25.1F L2_LINES_IN.ALL
+
+# Non-modified cache lines that are silently dropped by L2 cache.
+26.01 L2_LINES_OUT.SILENT
+
+# Modified cache lines that are evicted by L2 cache when triggered by an L2 cache fill.
+26.02 L2_LINES_OUT.NON_SILENT
+
+# Cache lines that have been L2 hardware prefetched but not used by demand accesses
+26.04 L2_LINES_OUT.USELESS_HWPF
+
+# Counts bus locks, accounts for cache line split locks and UC locks.
+2C.10 SQ_MISC.BUS_LOCK
+
+# Cycles the uncore cannot take further requests
+2D.01.CMSK=1 XQ.FULL_CYCLES
+
+# Core-originated cacheable requests that missed the L3 (Except hardware prefetches to the L3)
+2E.41 LONGEST_LAT_CACHE.MISS
+
+# Core-originated cacheable requests that refer to L3 (Except hardware prefetches to the L3)
+2E.4F LONGEST_LAT_CACHE.REFERENCE
+
+# Thread cycles when thread is not in halt state
+3C.00 CPU_CLK_UNHALTED.THREAD_P
+
+# Reference cycles when the core is not in halt state.
+3C.01 CPU_CLK_UNHALTED.REF_TSC_P
+
+# Core crystal clock cycles when this thread is unhalted and the other thread is halted.
+3C.02 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
+
+# Core crystal clock cycles. Cycle counts are evenly distributed between active threads in the Core.
+3C.08 CPU_CLK_UNHALTED.REF_DISTRIBUTED
+
+# Number of PREFETCHNTA instructions executed.
+40.01 SW_PREFETCH_ACCESS.NTA
+
+# Number of PREFETCHT0 instructions executed.
+40.02 SW_PREFETCH_ACCESS.T0
+
+# Number of PREFETCHT1 or PREFETCHT2 instructions executed.
+40.04 SW_PREFETCH_ACCESS.T1_T2
+
+# Number of PREFETCHW instructions executed.
+40.08 SW_PREFETCH_ACCESS.PREFETCHW
+
+# Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed.
+40.0F SW_PREFETCH_ACCESS.ANY
+
+# Cycles when L1D is locked
+42.02 LOCK_CYCLES.CACHE_LOCK_DURATION
+
+# Completed demand load uops that miss the L1 d-cache.
+43.FD MEM_LOAD_COMPLETED.L1_MISS_ANY
+
+# MEM_STORE_RETIRED.L2_HIT
+44.01 MEM_STORE_RETIRED.L2_HIT
+
+# Cycles while L1 cache miss demand load is outstanding.
+47.02.CMSK=2 MEMORY_ACTIVITY.CYCLES_L1D_MISS
+
+# Execution stalls while L1 cache miss demand load is outstanding.
+47.03.CMSK=3 MEMORY_ACTIVITY.STALLS_L1D_MISS
+
+# Execution stalls while L2 cache miss demand cacheable load request is outstanding.
+47.05.CMSK=5 MEMORY_ACTIVITY.STALLS_L2_MISS
+
+# Execution stalls while L3 cache miss demand cacheable load request is outstanding.
+47.09.CMSK=9 MEMORY_ACTIVITY.STALLS_L3_MISS
+
+# Number of L1D misses that are outstanding
+48.01 L1D_PEND_MISS.PENDING
+
+# Cycles with L1D load Misses outstanding.
+48.01.CMSK=1 L1D_PEND_MISS.PENDING_CYCLES
+
+# Number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability.
+48.02 L1D_PEND_MISS.FB_FULL
+
+# Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability.
+48.02.CMSK=1.EDG L1D_PEND_MISS.FB_FULL_PERIODS
+
+# Number of cycles a demand request has waited due to L1D due to lack of L2 resources.
+48.04 L1D_PEND_MISS.L2_STALLS
+
+# Counts the number of demand load dispatches that hit L1D fill buffer (FB) allocated for software prefetch.
+4C.01 LOAD_HIT_PREFETCH.SWPF
+
+# Counts the number of cache lines replaced in L1 data cache.
+51.01 L1D.REPLACEMENT
+
+# L1D.HWPF_MISS
+51.20 L1D.HWPF_MISS
+
+# Clears due to Unknown Branches.
+60.01 BACLEARS.ANY
+
+# DSB-to-MITE switch true penalty cycles.
+61.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
+
+# Instruction decoders utilized in a cycle
+75.01 INST_DECODED.DECODERS
+
+# Number of non dec-by-all uops decoded by decoder
+76.01 UOPS_DECODED.DEC0_UOPS
+
+# Uops delivered to Instruction Decode Queue (IDQ) from MITE path
+79.04 IDQ.MITE_UOPS
+
+# Cycles MITE is delivering any Uop
+79.04.CMSK=1 IDQ.MITE_CYCLES_ANY
+
+# Cycles MITE is delivering optimal number of Uops
+79.04.CMSK=6 IDQ.MITE_CYCLES_OK
+
+# Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path
+79.08 IDQ.DSB_UOPS
+
+# Cycles Decode Stream Buffer (DSB) is delivering any Uop
+79.08.CMSK=1 IDQ.DSB_CYCLES_ANY
+
+# Cycles DSB is delivering optimal number of Uops
+79.08.CMSK=6 IDQ.DSB_CYCLES_OK
+
+# Uops initiated by MITE or Decode Stream Buffer (DSB) and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy
+79.20 IDQ.MS_UOPS
+
+# Cycles when uops are being delivered to IDQ while MS is busy
+79.20.CMSK=1 IDQ.MS_CYCLES_ANY
+
+# Number of switches from DSB or MITE to the MS
+79.20.CMSK=1.EDG IDQ.MS_SWITCHES
+
+# Cycles where a code fetch is stalled due to L1 instruction cache miss.
+80.04 ICACHE_DATA.STALLS
+
+# ICACHE_DATA.STALL_PERIODS
+80.04.CMSK=1.EDG ICACHE_DATA.STALL_PERIODS
+
+# Cycles where a code fetch is stalled due to L1 instruction cache tag miss.
+83.04 ICACHE_TAG.STALLS
+
+# Stalls caused by changing prefix length of the instruction.
+87.01 DECODE.LCP
+
+# Cycles the Microcode Sequencer is busy.
+87.02 DECODE.MS_BUSY
+
+# This event counts a subset of the Topdown Slots event that when no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.
+9C.01 IDQ_BUBBLES.CORE
+
+# Uops not delivered by IDQ when backend of the machine is not stalled
+9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
+
+# Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK]
+9C.01.CMSK=1.INV IDQ_BUBBLES.CYCLES_FE_WAS_OK
+
+# Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK]
+9C.01.CMSK=1.INV IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK
+
+# Cycles when no uops are delivered by the IDQ when backend of the machine is not stalled [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE]
+9C.01.CMSK=6 IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE
+
+# Cycles when no uops are delivered by the IDQ when backend of the machine is not stalled [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]
+9C.01.CMSK=6 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE
+
+# Counts cycles where the pipeline is stalled due to serializing operations.
+A2.02 RESOURCE_STALLS.SCOREBOARD
+
+# Cycles stalled due to no store buffers available. (not including draining from sync).
+A2.08 RESOURCE_STALLS.SB
+
+# Cycles while L2 cache miss demand load is outstanding.
+A3.01.CMSK=1 CYCLE_ACTIVITY.CYCLES_L2_MISS
+
+# Cycles while L3 cache miss demand load is outstanding.
+A3.02.CMSK=2 CYCLE_ACTIVITY.CYCLES_L3_MISS
+
+# Total execution stalls.
+A3.04.CMSK=4 CYCLE_ACTIVITY.STALLS_TOTAL
+
+# Execution stalls while L2 cache miss demand load is outstanding.
+A3.05.CMSK=5 CYCLE_ACTIVITY.STALLS_L2_MISS
+
+# Execution stalls while L3 cache miss demand load is outstanding.
+A3.06.CMSK=6 CYCLE_ACTIVITY.STALLS_L3_MISS
+
+# Cycles while L1 cache miss demand load is outstanding.
+A3.08.CMSK=8 CYCLE_ACTIVITY.CYCLES_L1D_MISS
+
+# Execution stalls while L1 cache miss demand load is outstanding.
+A3.0C.CMSK=12 CYCLE_ACTIVITY.STALLS_L1D_MISS
+
+# Cycles while memory subsystem has an outstanding load.
+A3.10.CMSK=16 CYCLE_ACTIVITY.CYCLES_MEM_ANY
+
+# TMA slots available for an unhalted logical processor. General counter - architectural event
+A4.01 TOPDOWN.SLOTS_P
+
+# This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.
+A4.02 TOPDOWN.BACKEND_BOUND_SLOTS
+
+# TMA slots wasted due to incorrect speculations.
+A4.04.CTR=0 TOPDOWN.BAD_SPEC_SLOTS
+
+# TMA slots wasted due to incorrect speculation by branch mispredictions
+A4.08.CTR=0 TOPDOWN.BR_MISPREDICT_SLOTS
+
+# TOPDOWN.MEMORY_BOUND_SLOTS
+A4.10 TOPDOWN.MEMORY_BOUND_SLOTS
+
+# Cycles when RS was empty and a resource allocation stall is asserted
+A5.01 RS.EMPTY_RESOURCE
+
+# Cycles when Reservation Station (RS) is empty for the thread.
+A5.07 RS.EMPTY
+
+# Counts end of periods where the Reservation Station (RS) was empty.
+A5.07.CMSK=1.EDG.INV RS.EMPTY_COUNT
+
+# Cycles total of 1 uop is executed on all ports and Reservation Station was not empty.
+A6.02 EXE_ACTIVITY.1_PORTS_UTIL
+
+# Cycles total of 2 uops are executed on all ports and Reservation Station was not empty.
+A6.04 EXE_ACTIVITY.2_PORTS_UTIL
+
+# Cycles total of 3 uops are executed on all ports and Reservation Station was not empty.
+A6.08 EXE_ACTIVITY.3_PORTS_UTIL
+
+# Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty.
+A6.0C EXE_ACTIVITY.2_3_PORTS_UTIL
+
+# Cycles total of 4 uops are executed on all ports and Reservation Station was not empty.
+A6.10 EXE_ACTIVITY.4_PORTS_UTIL
+
+# Execution stalls while memory subsystem has an outstanding load.
+A6.21.CMSK=5 EXE_ACTIVITY.BOUND_ON_LOADS
+
+# Cycles where the Store Buffer was full and no loads caused an execution stall.
+A6.40.CMSK=2 EXE_ACTIVITY.BOUND_ON_STORES
+
+# Cycles no uop executed while RS was not empty, the SB was not full and there was no outstanding load.
+A6.80 EXE_ACTIVITY.EXE_BOUND_0_PORTS
+
+# Number of Uops delivered by the LSD.
+A8.01 LSD.UOPS
+
+# Cycles Uops delivered by the LSD, but didn't come from the decoder.
+A8.01.CMSK=1 LSD.CYCLES_ACTIVE
+
+# Cycles optimal number of Uops delivered by the LSD, but did not come from the decoder.
+A8.01.CMSK=6 LSD.CYCLES_OK
+
+# Core cycles the allocator was stalled due to recovery from earlier clear event for this thread
+AD.01 INT_MISC.RECOVERY_CYCLES
+
+# Clears speculative count
+AD.01.CMSK=1.EDG INT_MISC.CLEARS_COUNT
+
+# TMA slots where uops got dropped
+AD.10 INT_MISC.UOP_DROPPING
+
+# Bubble cycles of BAClear (Unknown Branch).
+AD.40.TakenAlone INT_MISC.UNKNOWN_BRANCH_CYCLES
+
+# Counts cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.
+AD.80 INT_MISC.CLEAR_RESTEER_CYCLES
+
+# Uops that RAT issues to RS
+AE.01 UOPS_ISSUED.ANY
+
+# UOPS_ISSUED.CYCLES
+AE.01.CMSK=1 UOPS_ISSUED.CYCLES
+
+# This event counts the cycles the floating point divider is busy.
+B0.01.CMSK=1 ARITH.FPDIV_ACTIVE
+
+# This event counts the cycles the integer divider is busy.
+B0.08.CMSK=1 ARITH.IDIV_ACTIVE
+
+# Cycles when divide unit is busy executing divide or square root operations.
+B0.09.CMSK=1 ARITH.DIV_ACTIVE
+
+# Counts the number of uops to be executed per-thread each cycle.
+B1.01 UOPS_EXECUTED.THREAD
+
+# Cycles where at least 1 uop was executed per-thread
+B1.01.CMSK=1 UOPS_EXECUTED.CYCLES_GE_1
+
+# Counts number of cycles no uops were dispatched to be executed on this thread.
+B1.01.CMSK=1.INV UOPS_EXECUTED.STALLS
+
+# Cycles where at least 2 uops were executed per-thread
+B1.01.CMSK=2 UOPS_EXECUTED.CYCLES_GE_2
+
+# Cycles where at least 3 uops were executed per-thread
+B1.01.CMSK=3 UOPS_EXECUTED.CYCLES_GE_3
+
+# Cycles where at least 4 uops were executed per-thread
+B1.01.CMSK=4 UOPS_EXECUTED.CYCLES_GE_4
+
+# Number of uops executed on the core.
+B1.02 UOPS_EXECUTED.CORE
+
+# Cycles at least 1 micro-op is executed from any thread on physical core.
+B1.02.CMSK=1 UOPS_EXECUTED.CORE_CYCLES_GE_1
+
+# Cycles at least 2 micro-ops are executed from any thread on physical core.
+B1.02.CMSK=2 UOPS_EXECUTED.CORE_CYCLES_GE_2
+
+# Cycles at least 3 micro-ops are executed from any thread on physical core.
+B1.02.CMSK=3 UOPS_EXECUTED.CORE_CYCLES_GE_3
+
+# Cycles at least 4 micro-ops are executed from any thread on physical core.
+B1.02.CMSK=4 UOPS_EXECUTED.CORE_CYCLES_GE_4
+
+# Counts the number of x87 uops dispatched.
+B1.10 UOPS_EXECUTED.X87
+
+# Uops executed on port 0
+B2.01 UOPS_DISPATCHED.PORT_0
+
+# Uops executed on port 1
+B2.02 UOPS_DISPATCHED.PORT_1
+
+# Uops executed on ports 2, 3 and 10
+B2.04 UOPS_DISPATCHED.PORT_2_3_10
+
+# Uops executed on ports 4 and 9
+B2.10 UOPS_DISPATCHED.PORT_4_9
+
+# Uops executed on ports 5 and 11
+B2.20 UOPS_DISPATCHED.PORT_5_11
+
+# Uops executed on port 6
+B2.40 UOPS_DISPATCHED.PORT_6
+
+# Uops executed on ports 7 and 8
+B2.80 UOPS_DISPATCHED.PORT_7_8
+
+# FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]
+B3.01 FP_ARITH_DISPATCHED.PORT_0
+
+# FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]
+B3.01 FP_ARITH_DISPATCHED.V0
+
+# FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]
+B3.02 FP_ARITH_DISPATCHED.PORT_1
+
+# FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]
+B3.02 FP_ARITH_DISPATCHED.V1
+
+# FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]
+B3.04 FP_ARITH_DISPATCHED.PORT_5
+
+# FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]
+B3.04 FP_ARITH_DISPATCHED.V2
+
+# Number of instructions retired. General Counter - architectural event
+C0.00 INST_RETIRED.ANY_P
+
+# Retired NOP instructions.
+C0.02 INST_RETIRED.NOP
+
+# Iterations of Repeat string retired instructions.
+C0.08 INST_RETIRED.REP_ITERATION
+
+# INST_RETIRED.MACRO_FUSED
+C0.10 INST_RETIRED.MACRO_FUSED
+
+# Counts all microcode FP assists.
+C1.02 ASSISTS.FP
+
+# Counts all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP, SSE-AVX mix and A/D assists, which are counted by dedicated sub-events. The event also counts for Machine Ordering count.
+C1.04 ASSISTS.HARDWARE
+
+# ASSISTS.PAGE_FAULT
+C1.08 ASSISTS.PAGE_FAULT
+
+# ASSISTS.SSE_AVX_MIX
+C1.10 ASSISTS.SSE_AVX_MIX
+
+# Number of occurrences where a microcode assist is invoked by hardware.
+C1.1B ASSISTS.ANY
+
+# Retired uops except the last uop of each instruction.
+C2.01 UOPS_RETIRED.HEAVY
+
+# This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance - for example, as measured by the instructions-per-cycle metric.
+C2.02 UOPS_RETIRED.SLOTS
+
+# Cycles with retired uop(s).
+C2.02.CMSK=1 UOPS_RETIRED.CYCLES
+
+# Cycles without actually retired uops.
+C2.02.CMSK=1.INV UOPS_RETIRED.STALLS
+
+# UOPS_RETIRED.MS
+C2.04.TakenAlone UOPS_RETIRED.MS
+
+# Number of machine clears (nukes) of any type.
+C3.01.CMSK=1.EDG MACHINE_CLEARS.COUNT
+
+# Number of machine clears due to memory ordering conflicts.
+C3.02 MACHINE_CLEARS.MEMORY_ORDERING
+
+# Self-modifying code (SMC) detected.
+C3.04 MACHINE_CLEARS.SMC
+
+# All branch instructions retired.
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+
+# Taken conditional branch instructions retired.
+C4.01 BR_INST_RETIRED.COND_TAKEN
+
+# Direct and indirect near call instructions retired.
+C4.02 BR_INST_RETIRED.NEAR_CALL
+
+# Return instructions retired.
+C4.08 BR_INST_RETIRED.NEAR_RETURN
+
+# Not taken branch instructions retired.
+C4.10 BR_INST_RETIRED.COND_NTAKEN
+
+# Conditional branch instructions retired.
+C4.11 BR_INST_RETIRED.COND
+
+# Taken branch instructions retired.
+C4.20 BR_INST_RETIRED.NEAR_TAKEN
+
+# Far branch instructions retired.
+C4.40 BR_INST_RETIRED.FAR_BRANCH
+
+# Indirect near branch instructions retired (excluding returns)
+C4.80 BR_INST_RETIRED.INDIRECT
+
+# All mispredicted branch instructions retired.
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+
+# Number of branch instructions retired that were mispredicted and taken.
+C5.01 BR_MISP_RETIRED.COND_TAKEN
+
+# Mispredicted indirect CALL retired.
+C5.02 BR_MISP_RETIRED.INDIRECT_CALL
+
+# This event counts the number of mispredicted ret instructions retired. Non PEBS
+C5.08 BR_MISP_RETIRED.RET
+
+# Mispredicted non-taken conditional branch instructions retired.
+C5.10 BR_MISP_RETIRED.COND_NTAKEN
+
+# Mispredicted conditional branch instructions retired.
+C5.11 BR_MISP_RETIRED.COND
+
+# Number of near branch instructions retired that were mispredicted and taken.
+C5.20 BR_MISP_RETIRED.NEAR_TAKEN
+
+# Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.41 BR_MISP_RETIRED.COND_TAKEN_COST
+
+# Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.42 BR_MISP_RETIRED.INDIRECT_CALL_COST
+
+# All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.44 BR_MISP_RETIRED.ALL_BRANCHES_COST
+
+# Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.48 BR_MISP_RETIRED.RET_COST
+
+# Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.50 BR_MISP_RETIRED.COND_NTAKEN_COST
+
+# Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.51 BR_MISP_RETIRED.COND_COST
+
+# Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.60 BR_MISP_RETIRED.NEAR_TAKEN_COST
+
+# Mispredicted near indirect branch instructions retired (excluding returns)
+C5.80 BR_MISP_RETIRED.INDIRECT
+
+# Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch.
+C5.C0 BR_MISP_RETIRED.INDIRECT_COST
+
+# Mispredicted Retired ANT branches
+C6.02.TakenAlone FRONTEND_RETIRED.MISP_ANT
+
+# Retired ANT branches
+C6.03.TakenAlone FRONTEND_RETIRED.ANY_ANT
+
+# Retired instructions that experienced a DSB miss.
+C6.03.TakenAlone FRONTEND_RETIRED.ANY_DSB_MISS
+
+# Retired instructions that experienced a critical DSB miss.
+C6.03.TakenAlone FRONTEND_RETIRED.DSB_MISS
+
+# Retired instructions that experienced an iTLB true miss.
+C6.03.TakenAlone FRONTEND_RETIRED.ITLB_MISS
+
+# Retired instructions that experienced an Instruction L1 Cache true miss.
+C6.03.TakenAlone FRONTEND_RETIRED.L1I_MISS
+
+# Retired instructions that experienced an Instruction L2 Cache true miss.
+C6.03.TakenAlone FRONTEND_RETIRED.L2_MISS
+
+# Retired instructions after front-end starvation of at least 1 cycle
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_1
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_128
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 16 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_16
+
+# Retired instructions after front-end starvation of at least 2 cycles
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_2
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_256
+
+# Retired instructions that are fetched after an interval where the front-end had at least 1 bubble-slot for a period of 2 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 32 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_32
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_4
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_512
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_64
+
+# Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 8 cycles which was not interrupted by a back-end stall.
+C6.03.TakenAlone FRONTEND_RETIRED.LATENCY_GE_8
+
+# FRONTEND_RETIRED.MS_FLOWS
+C6.03.TakenAlone FRONTEND_RETIRED.MS_FLOWS
+
+# Retired instructions that experienced an STLB (2nd level TLB) true miss.
+C6.03.TakenAlone FRONTEND_RETIRED.STLB_MISS
+
+# FRONTEND_RETIRED.UNKNOWN_BRANCH
+C6.03.TakenAlone FRONTEND_RETIRED.UNKNOWN_BRANCH
+
+# Counts number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE
+
+# Counts number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE
+
+# Number of SSE/AVX computational scalar floating-point instructions retired; some instructions will count twice as noted below.  Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 RANGE SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.
+C7.03 FP_ARITH_INST_RETIRED.SCALAR
+
+# Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 2 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE
+
+# Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE
+
+# Counts number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE
+
+# Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below.  Each count represents 2 or/and 4 computation operations, 1 for each element.  Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.
+C7.18 FP_ARITH_INST_RETIRED.4_FLOPS
+
+# Counts number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 8 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.
+C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE
+
+# Number of any Vector retired FP arithmetic instructions
+C7.FC FP_ARITH_INST_RETIRED.VECTOR
+
+# Increments whenever there is an update to the LBR array.
+CC.20 MISC_RETIRED.LBR_INSERTS
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles.
+CD.01.MSR_3F6H=0x10.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles.
+CD.01.MSR_3F6H=0x100.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles.
+CD.01.MSR_3F6H=0x20.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles.
+CD.01.MSR_3F6H=0x200.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles.
+CD.01.MSR_3F6H=0x4.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles.
+CD.01.MSR_3F6H=0x40.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles.
+CD.01.MSR_3F6H=0x400.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_1024
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles.
+CD.01.MSR_3F6H=0x8.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles.
+CD.01.MSR_3F6H=0x80.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128
+
+# Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles.
+CD.01.MSR_3F6H=0x800.CTR=1.TakenAlone MEM_TRANS_RETIRED.LOAD_LATENCY_GT_2048
+
+# Retired memory store access operations. A PDist event for PEBS Store Latency Facility.
+CD.02.CTR=0 MEM_TRANS_RETIRED.STORE_SAMPLE
+
+# Retired load instructions that hit the STLB.
+D0.09 MEM_INST_RETIRED.STLB_HIT_LOADS
+
+# Retired store instructions that hit the STLB.
+D0.0A MEM_INST_RETIRED.STLB_HIT_STORES
+
+# Retired load instructions that miss the STLB.
+D0.11 MEM_INST_RETIRED.STLB_MISS_LOADS
+
+# Retired store instructions that miss the STLB.
+D0.12 MEM_INST_RETIRED.STLB_MISS_STORES
+
+# Retired load instructions with locked access.
+D0.21 MEM_INST_RETIRED.LOCK_LOADS
+
+# Retired load instructions that split across a cacheline boundary.
+D0.41 MEM_INST_RETIRED.SPLIT_LOADS
+
+# Retired store instructions that split across a cacheline boundary.
+D0.42 MEM_INST_RETIRED.SPLIT_STORES
+
+# Retired load instructions.
+D0.81 MEM_INST_RETIRED.ALL_LOADS
+
+# Retired store instructions.
+D0.82 MEM_INST_RETIRED.ALL_STORES
+
+# All retired memory instructions.
+D0.83 MEM_INST_RETIRED.ANY
+
+# Retired load instructions with L1 cache hits as data sources
+D1.01 MEM_LOAD_RETIRED.L1_HIT
+
+# Retired load instructions with L2 cache hits as data sources
+D1.02 MEM_LOAD_RETIRED.L2_HIT
+
+# Retired load instructions with L3 cache hits as data sources
+D1.04 MEM_LOAD_RETIRED.L3_HIT
+
+# Retired load instructions that missed the L1 cache as data sources
+D1.08 MEM_LOAD_RETIRED.L1_MISS
+
+# Retired load instructions that missed the L2 cache as data sources
+D1.10 MEM_LOAD_RETIRED.L2_MISS
+
+# Retired load instructions that missed the L3 cache as data sources
+D1.20 MEM_LOAD_RETIRED.L3_MISS
+
+# Number of completed demand load requests that missed the L1, but hit the FB (fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.
+D1.40 MEM_LOAD_RETIRED.FB_HIT
+
+# Retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache.
+D2.01 MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS
+
+# Retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache
+D2.02 MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD
+
+# Retired load instructions whose data sources were HitM responses from shared L3
+D2.04 MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD
+
+# Retired load instructions whose data sources were hits in L3 without snoops required
+D2.08 MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE
+
+# Retired load instructions whose data sources missed L3 but were serviced from local DRAM
+D3.01 MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM
+
+# Retired instructions with at least 1 uncacheable load or lock.
+D4.04 MEM_LOAD_MISC_RETIRED.UC
+
+# LFENCE instructions retired
+E0.20 MISC2_RETIRED.LFENCE
+
+# Retired memory uops for any access
+E5.03 MEM_UOP_RETIRED.ANY
+
+# Integer ADD, SUB, SAD 128-bit vector instructions.
+E7.03 INT_VEC_RETIRED.ADD_128
+
+# Integer ADD, SUB, SAD 256-bit vector instructions.
+E7.0C INT_VEC_RETIRED.ADD_256
+
+# INT_VEC_RETIRED.VNNI_128
+E7.10 INT_VEC_RETIRED.VNNI_128
+
+# INT_VEC_RETIRED.128BIT
+E7.13 INT_VEC_RETIRED.128BIT
+
+# INT_VEC_RETIRED.VNNI_256
+E7.20 INT_VEC_RETIRED.VNNI_256
+
+# INT_VEC_RETIRED.SHUFFLES
+E7.40 INT_VEC_RETIRED.SHUFFLES
+
+# INT_VEC_RETIRED.MUL_256
+E7.80 INT_VEC_RETIRED.MUL_256
+
+# INT_VEC_RETIRED.256BIT
+E7.AC INT_VEC_RETIRED.256BIT
+
+# Cycle counts are evenly distributed between active threads in the Core.
+EC.02 CPU_CLK_UNHALTED.DISTRIBUTED
+
+# Core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state.
+EC.10 CPU_CLK_UNHALTED.C01
+
+# Core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state.
+EC.20 CPU_CLK_UNHALTED.C02
+
+# CPU_CLK_UNHALTED.PAUSE
+EC.40 CPU_CLK_UNHALTED.PAUSE
+
+# CPU_CLK_UNHALTED.PAUSE_INST
+EC.40.CMSK=1.EDG CPU_CLK_UNHALTED.PAUSE_INST
+
+# Core clocks when the thread is in the C0.1 or C0.2 or running a PAUSE in C0 ACPI state.
+EC.70 CPU_CLK_UNHALTED.C0_WAIT
--- a/configs/cfg_MeteorLakeP_RedwoodCove_all_offcore.txt
+++ b/configs/cfg_MeteorLakeP_RedwoodCove_all_offcore.txt
@@ -0,0 +1,29 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/MTL/events/meteorlake_redwoodcove_core.json (Version: 1.14)
+# Applies to processors with family-model in {6-AA, 6-AC, 6-B5}
+
+# Counts demand data reads that have any type of response.
+2A.01.MSR_RSP0=0x10001.TakenAlone OCR.DEMAND_DATA_RD.ANY_RESPONSE
+
+# Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.
+2A.01.MSR_RSP0=0x10002.TakenAlone OCR.DEMAND_RFO.ANY_RESPONSE
+
+# Counts demand data reads that resulted in a snoop hit in another core's caches; data forwarding is required as the data is modified.
+2A.01.MSR_RSP0=0x10003C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM
+
+# Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that resulted in a snoop hit in another core's caches; data forwarding is required as the data is modified.
+2A.01.MSR_RSP0=0x10003C0002.TakenAlone OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM
+
+# Counts streaming stores that have any type of response.
+2A.01.MSR_RSP0=0x10800.TakenAlone OCR.STREAMING_WR.ANY_RESPONSE
+
+# Counts demand data reads that were supplied by DRAM.
+2A.01.MSR_RSP0=0x184000001.TakenAlone OCR.DEMAND_DATA_RD.DRAM
+
+# Counts demand data reads that were not supplied by the L3 cache.
+2A.01.MSR_RSP0=0x3FBFC00001.TakenAlone OCR.DEMAND_DATA_RD.L3_MISS
+
+# Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.
+2A.01.MSR_RSP0=0x3FBFC00002.TakenAlone OCR.DEMAND_RFO.L3_MISS
+
+# Counts demand data reads that resulted in a snoop hit in another core's caches, which forwarded the unmodified data to the requesting core.
+2A.01.MSR_RSP0=0x8003C0001.TakenAlone OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD
--- a/configs/cfg_MeteorLakeP_RedwoodCove_common.txt
+++ b/configs/cfg_MeteorLakeP_RedwoodCove_common.txt
@@ -0,0 +1,27 @@
+# Based on https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/MTL/events/meteorlake_redwoodcove_core.json (Version: 1.14)
+# Applies to processors with family-model in {6-AA, 6-AC, 6-B5}
+
+3C.00 CORE_CYCLES
+C0.00 INST_RETIRED
+79.04 IDQ.MITE_UOPS
+79.08 IDQ.DSB_UOPS
+79.20 IDQ.MS_UOPS
+A8.01 LSD.UOPS
+AE.01 UOPS_ISSUED
+B1.01 UOPS_EXECUTED
+C2.02 UOPS_RETIRED.SLOTS
+B2.01 UOPS_DISPATCHED.PORT_0
+B2.02 UOPS_DISPATCHED.PORT_1
+B2.04 UOPS_DISPATCHED.PORT_2_3_10
+B2.10 UOPS_DISPATCHED.PORT_4_9
+B2.20 UOPS_DISPATCHED.PORT_5_11
+B2.40 UOPS_DISPATCHED.PORT_6
+B2.80 UOPS_DISPATCHED.PORT_7_8
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+D1.01 MEM_LOAD_RETIRED.L1_HIT
+D1.08 MEM_LOAD_RETIRED.L1_MISS
+D1.02 MEM_LOAD_RETIRED.L2_HIT
+D1.10 MEM_LOAD_RETIRED.L2_MISS
+D1.04 MEM_LOAD_RETIRED.L3_HIT
+D1.20 MEM_LOAD_RETIRED.L3_MISS
--- a/configs/convertIntelJSON.py
+++ b/configs/convertIntelJSON.py
@@ -8,9 +8,10 @@ parser.add_argument('url', help='URL of JSON file')
 parser.add_argument('-offcore', help='Convert offcore events', action='store_true')
 args = parser.parse_args()

-print('# Based on ' + args.url)
+json = requests.get(args.url).json()
+print(f'# Based on {args.url} (Version: {json["Header"]["Version"]})')

-mapFile = requests.get('https://download.01.org/perfmon/mapfile.csv')
+mapFile = requests.get('https://raw.githubusercontent.com/intel/perfmon/refs/heads/main/mapfile.csv')
 famMod = []
 for l in mapFile.iter_lines():
    fields = l.decode().split(',')
@@ -19,13 +20,12 @@ for l in mapFile.iter_lines():
 if famMod:
    print('# Applies to processors with family-model in {' + str(famMod).replace("'", '')[1:-1] + '}')

-json = requests.get(args.url).json()
-allCtrs = max([ev['Counter'] for ev in json if not 'Fixed' in ev['Counter']], key=len)
+allCtrs = max([ev['Counter'] for ev in json['Events'] if not 'Fixed' in ev['Counter']], key=len)
 if '0,1,2,3' in allCtrs:
    allCtrs = '0,1,2,3' # nanoBench does not use counters >= 4

 evDescriptions = []
-for ev in sorted(json, key=lambda x: (x['EventCode'].upper(), x['UMask'].upper())):
+for ev in sorted(json['Events'], key=lambda x: (x['EventCode'].upper(), x['UMask'].upper())):
    if ('Fixed' in ev['Counter']) or (ev['Counter'] in ['32', '33', '34', '35']):
        continue
    if ev.get('Deprecated') == '1':