mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-07-21 15:11:03 +02:00
3589 lines
186 KiB
Python
Executable File
3589 lines
186 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import xml.etree.ElementTree as ET
|
|
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
|
|
from xml.dom import minidom
|
|
from itertools import chain, count, cycle, groupby, islice
|
|
from collections import namedtuple, OrderedDict
|
|
|
|
import argparse
|
|
import copy
|
|
import datetime
|
|
import math
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import logging
|
|
import pickle
|
|
import shutil
|
|
import tarfile
|
|
|
|
from utils import *
|
|
from x64_lib import *
|
|
|
|
sys.path.append('../..')
|
|
from kernelNanoBench import *
|
|
|
|
sys.path.append('../CPUID')
|
|
import cpuid
|
|
|
|
# Module-wide settings. The values below are only the initial defaults; the code
# that assigns the real values is not part of this section.
useIACA=False      # when True, getUopsOnBlockedPorts analyses the code with IACA instead of measuring it
iacaCMDLine = ''   # NOTE(review): concatenated with argument lists before being passed to subprocess, so presumably replaced by a list elsewhere - confirm
iacaVersion = ''   # compared against '2.1' to decide whether '-analysis THROUGHPUT' is passed to IACA
arch = ''          # microarchitecture abbreviation (e.g. 'SKL', 'ZEN3'); selects events and workarounds
debugOutput = False
supportsAVX = False
instrNodeList = [] # list of all XML instruction nodes that are not filtered out
instrNodeDict = {} # dict from instrNode.attrib['string'] to instrNode

#R15: loop counter
#R14: reserved for memory addresses (base)
#R13: reserved for memory addresses (index)
# Registers that generated benchmark code must never write.
globalDoNotWriteRegs = {'R15', 'R15D', 'R15W', 'R15B', 'RSP', 'ESP' , 'SP', 'SPL',
                        'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15',
                        'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'K0'}
# Registers used for addressing memory operands (see getAddrReg/getIndexReg); they are
# excluded from the writable registers when an instruction has a memory operand.
memRegs = {'R14', 'R14D', 'R14W', 'R14B', 'R13', 'R13D', 'R13W', 'R13B', 'RDI', 'EDI', 'DI', 'DIL', 'RSI', 'ESI', 'SI', 'SIL', 'RBP', 'EBP', 'BP', 'BPL'}

# Registers that getIndependentInstructions never selects for reading or writing.
specialRegs = {'ES', 'CS', 'SS', 'DS', 'FS', 'GS', 'IP', 'EIP', 'FSBASEy', 'GDTR', 'GSBASEy', 'IDTR', 'IP', 'LDTR', 'MSRS', 'MXCSR', 'RFLAGS', 'RIP',
               'TR', 'TSC', 'TSCAUX', 'X87CONTROL', 'X87POP', 'X87POP2', 'X87PUSH', 'X87STATUS', 'X87TAG', 'XCR0', 'XMM0dq', 'CR0', 'CR2', 'CR3', 'CR4', 'CR8', 'ERROR',
               'BND0', 'BND1', 'BND2', 'BND3'}

# Upper bound on the number of independent instances produced by getIndependentInstructions.
maxTPRep = 16

#iforms of serializing and memory-ordering instructions according to Ch. 8.3 of the Intel manual
serializingInstructions = {'INVD', 'INVEPT', 'INVLPG', 'INVVPID', 'LGDT', 'LIDT', 'LLDT', 'LTR', 'MOV_CR_CR_GPR64', 'MOV_DR_DR_GPR64', 'WBINVD', 'WRMSR',
                           'CPUID', 'IRET', 'RSM', 'SFENCE', 'LFENCE', 'MFENCE'}
|
|
|
|
def isAMDCPU():
    """Return True iff the global `arch` names one of the supported AMD Zen microarchitectures."""
    amdArchs = ('ZEN+', 'ZEN2', 'ZEN3', 'ZEN4', 'ZEN5')
    return arch in amdArchs
|
|
|
|
def isIntelCPU():
    """Return True for every `arch` that isAMDCPU() does not recognise (including an unset `arch`)."""
    if isAMDCPU():
        return False
    return True
|
|
|
|
|
|
def getAddrReg(instrNode, opNode):
    """Return the base register to use for the memory operand `opNode` of `instrNode`.

    Suppressed operands keep the base register given in the XML; instructions with a
    non-empty 'high8' attribute use RDI; everything else uses R14.
    """
    opAttrs = opNode.attrib
    if opAttrs.get('suppressed', '0') == '1':
        return opAttrs['base']
    usesHigh8 = instrNode.attrib.get('high8', '') != ''
    return 'RDI' if usesHigh8 else 'R14'
|
|
|
|
def getIndexReg(instrNode, opNode):
    """Return the index register to use for the memory operand `opNode` of `instrNode`.

    Order of precedence: the (64-bit form of the) index register of a suppressed operand,
    register 14 of the vector class named by the 'VSIB' attribute, RSI for instructions
    with a non-empty 'high8' attribute, and R13 otherwise.
    """
    opAttrs = opNode.attrib
    if opAttrs.get('suppressed', '0') == '1' and 'index' in opAttrs:
        return regTo64(opAttrs['index'])

    vsib = opAttrs.get('VSIB', '0')
    if vsib != '0':
        return vsib + '14'

    if instrNode.attrib.get('high8', '') != '':
        return 'RSI'
    return 'R13'
|
|
|
|
# registers that are not used as implicit registers should come first; RAX (and parts of it) should come last, as some instructions have special encodings for that
# prefer low registers to high registers
def sortRegs(regsList):
    """Return the register names in `regsList` sorted by preference (most preferred first).

    Names containing a digit sort before names without one; within each group, names
    without the letters 'P', 'I', 'H' and 'A' (checked in that order) come first, then
    lower register numbers, then the name itself as a tie-breaker.
    """
    def preference(reg):
        numbers = [int(num) for num in re.findall(r'\d+', reg)]
        hasNoDigit = not any(ch.isdigit() for ch in reg)
        return (hasNoDigit, 'P' in reg, 'I' in reg, 'H' in reg, 'A' in reg, numbers, reg)

    return sorted(regsList, key=preference)
|
|
|
|
|
|
# Initialize registers and memory
def getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr):
    """Return the list of assembler instructions that initialise registers/memory for `instrNode`.

    opRegDict maps operand indices to the registers chosen for them, memOffset is the
    displacement (relative to R14) of the instruction's memory operand, and
    useIndexedAddr states whether an index register is part of the address.
    """
    iform = instrNode.attrib['iform']
    iclass = instrNode.attrib['iclass']
    memOperand = '[R14+' + str(memOffset) + ']'

    init = []

    # instructions that need specific register or memory contents
    if iform == 'CLZERO':
        init.append('MOV RAX, R14')
    if iclass == 'LDMXCSR':
        init.append('STMXCSR ' + memOperand)
    if iclass == 'VLDMXCSR':
        init.append('VSTMXCSR ' + memOperand)
    if iform == 'LGDT_MEMs64':
        init.append('SGDT ' + memOperand)
    if iform == 'LIDT_MEMs64':
        init.append('SIDT ' + memOperand)
    if iform == 'LLDT_MEMw':
        init.append('SLDT ' + memOperand)
    if iform == 'XLAT':
        init.extend(['MOV RBX, R14', 'mov qword ptr [RBX], 0'])

    if (isSSEInstr(instrNode) or isAVXInstr(instrNode)) and supportsAVX:
        # Zero upper bits to avoid AVX-SSE transition penalties; also, e.g., dep. breaking and zero-latency instructions do not seem to work otherwise
        # we use vzeroall instead of just vzeroupper to make sure that XMM14 is 0 for VSIB addressing
        init.append('VZEROALL')

    if not isDivOrSqrtInstr(instrNode):
        for readOp in instrNode.findall('./operand[@r="1"]'):
            opIdx = int(readOp.attrib['idx'])
            xtype = readOp.attrib.get('xtype', '')
            opType = readOp.attrib['type']

            if opType == 'reg':
                reg = opRegDict[opIdx]
                regPrefix = re.sub(r'\d', '', reg)

                if reg in High8Regs:
                    init.append('MOV {}, 0'.format(reg))
                elif 'MM' in regPrefix and xtype.startswith('f'):
                    # fill the scratch memory with a floating-point bit pattern and load it into the register
                    init.append('MOV RAX, 0x4000000040000000')
                    for byteOffset in range(0, getRegSize(reg)//8, 8):
                        init.append('MOV [R14+' + str(byteOffset) + '], RAX')
                    loadMnemonic = 'VMOVUPD ' if isAVXInstr(instrNode) else 'MOVUPD '
                    init.append(loadMnemonic + reg + ', [R14]')
                elif regPrefix in ['XMM', 'YMM', 'ZMM'] and isAVXInstr(instrNode):
                    init.append('VXORPS {0}, {0}, {0}'.format(reg))
                elif 'MM' in regPrefix:
                    init.append('PXOR {0}, {0}'.format(reg))
            elif opType == 'mem':
                if xtype.startswith('f'):
                    init.append('MOV RAX, 0x4000000040000000')
                    for byteOffset in range(0, int(readOp.attrib['width'])//8, 8):
                        init.append('MOV [R14+' + str(byteOffset+memOffset) + '], RAX')

    # zero the index registers of all explicit memory operands
    # NOTE(review): placed at function level (not under the div/sqrt check); the nesting was reconstructed - confirm
    for memOp in instrNode.findall('./operand[@type="mem"]'):
        if memOp.attrib.get('suppressed', '0') == '1':
            continue
        if 'VSIB' in memOp.attrib:
            vsibReg = getIndexReg(instrNode, memOp)
            init.append('VXORPS ' + vsibReg + ', ' + vsibReg + ', ' + vsibReg)
        elif useIndexedAddr:
            init.append('XOR {0}, {0}'.format(getIndexReg(instrNode, memOp)))

    return init
|
|
|
|
# number of calls to runExperiment so far
nExperiments = 0
def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0, warmUpCount=10, basicMode=False, htmlReports=None):
    """Assemble `instrCode` (and the init code), run it with nanoBench and return the measured values.

    Parameters:
        instrNode:   XML instruction node or None; only its 'vex', 'evex' and 'iform'
                     attributes are read here, to decide whether an AVX warm-up loop is
                     prepended to the init code.
        instrCode:   assembler code to benchmark (instructions separated by ';').
        init:        list of assembler instructions for initialisation; duplicates are
                     removed while keeping the order.
        unrollCount, loopCount, warmUpCount, basicMode:
                     passed on to setNanoBenchParameters.
        htmlReports: optional list to which HTML fragments describing the experiment
                     are appended.

    Returns:
        The dict returned by runNanoBench (event name -> value), after the
        architecture-specific corrections applied at the end of the loop.
    """
    # we use a default warmUpCount of 10, as ICL requires at least about that much before memory operations run at full speed

    if init is None: init = []
    localHtmlReports = []

    global nExperiments
    nExperiments += 1

    # normalise the separators: no leading/trailing ';' and exactly one '; ' between instructions
    instrCode = re.sub(';+', '; ', instrCode.strip('; '))
    codeObjFile = '/tmp/ramdisk/code.o'
    assemble(instrCode, codeObjFile, asmFile='/tmp/ramdisk/code.s')
    localHtmlReports.append('<li>Code: <pre>' + getMachineCode(codeObjFile) + '</pre></li>\n')

    init = list(OrderedDict.fromkeys(init)) # remove duplicates while maintaining the order
    initCode = '; '.join(init)
    # init code that mentions a high-8 register is passed as "late init" instead of regular init
    useLateInit = any((reg in initCode.upper()) for reg in High8Regs)

    if (instrNode is not None) and (instrNode.attrib.get('vex', '') == '1' or instrNode.attrib.get('evex', '') == '1'):
        # vex and evex encoded instructions need a warm-up period before memory reads operate at full speed;
        # https://software.intel.com/en-us/forums/intel-isa-extensions/topic/710248
        reg = 'ZMM' if 'ZMM' in instrNode.attrib['iform'] else 'YMM'
        # the instruction needs to be used at least twice in the body of the loop
        # putting it to one_time_init is not sufficient, independently of the loop count, example:
        # "VPTEST YMM0, YMM1;CMOVZ R13, R15; VPBROADCASTQ ZMM0, R13" on CNL
        avxInitCode = 'MOV R15, 10000; L: VADDPS {0}, {1}, {1}; VADDPS {0}, {1}, {1}; DEC R15; JNZ L; '.format(reg + '0', reg + '1')
        initCode = avxInitCode + initCode

    # nanoBenchCmd is only used for the HTML report and the debug output; the measurement itself goes through runNanoBench
    nanoBenchCmd = 'sudo ./kernel-nanoBench.sh'
    nanoBenchCmd += ' -f'
    nanoBenchCmd += ' -unroll ' + str(unrollCount)
    if loopCount > 0: nanoBenchCmd += ' -loop ' + str(loopCount)
    if basicMode: nanoBenchCmd += ' -basic'
    nanoBenchCmd += ' -warm_up_count ' + str(warmUpCount)
    nanoBenchCmd += ' -asm "' + instrCode + '"'

    initObjFile = None
    lateInitObjFile=None
    if initCode:
        if debugOutput: print('init: ' + initCode)
        objFile = '/tmp/ramdisk/init.o'
        if useLateInit:
            lateInitObjFile = objFile
            nanoBenchCmd += ' -asm_late_init "' + initCode + '"'
        else:
            initObjFile = objFile
            nanoBenchCmd += ' -asm_init "' + initCode + '"'
        assemble(initCode, objFile, asmFile='/tmp/ramdisk/init.s')
        localHtmlReports.append('<li>Init: <pre>' + re.sub(';[ \t]*(.)', r';\n\1', initCode) + '</pre></li>\n')

    localHtmlReports.append('<li><a href="javascript:;" onclick="this.outerHTML = \'<pre>' + nanoBenchCmd + '</pre>\'">Show nanoBench command</a></li>\n')
    if debugOutput: print(nanoBenchCmd)

    setNanoBenchParameters(unrollCount=unrollCount, loopCount=loopCount, warmUpCount=warmUpCount, basicMode=basicMode)

    # cycle counts of the runs that were rejected so far (at most three)
    measuredCycles = []
    while True:
        ret = runNanoBench(codeObjFile=codeObjFile, initObjFile=initObjFile, lateInitObjFile=lateInitObjFile)
        cycles = ret['APERF'] if isAMDCPU() else ret['Core cycles']

        # counters whose name covers several ports (e.g. UOPS_PORT_23) are divided by the number of ports
        maxPortUops = max((v/len(e.replace('UOPS_PORT_', '')) for e,v in ret.items() if 'PORT' in e), default=0)
        sumPortUops = sum(v for e,v in ret.items() if 'PORT' in e)
        # Outside basic mode a result is only accepted if the same cycle count was already seen in an
        # earlier run and the values look plausible (no negative values, no port with more uops than
        # cycles, near-integral port-uop sum). After three rejected runs, switch to basic mode, in which
        # the next result is always accepted.
        if (not basicMode) and (len(measuredCycles) < 3) and ((cycles not in measuredCycles) or any(v<0 for v in ret.values())
                or (maxPortUops * .98 > cycles) or (sumPortUops - int(sumPortUops) > .2)):
            measuredCycles.append(cycles)
            if len(measuredCycles) >= 3:
                basicMode = True
                setNanoBenchParameters(basicMode=basicMode)
            continue

        localHtmlReports.append('<li>Results:\n<ul>\n')
        for evt, value in ret.items():
            if 'RDTSC' in evt: continue
            if evt == 'UOPS':
                # show the architecture-specific name of the event used for 'UOPS' (see getEventConfig)
                if arch in ['CON', 'WOL']: evt = 'RS_UOPS_DISPATCHED'
                elif arch in ['NHM', 'WSM', 'BNL', 'GLM', 'GLP']: evt = 'UOPS_RETIRED.ANY'
                elif arch in ['SNB', 'SLM', 'AMT', 'ADL-E', 'MTL-E']: evt = 'UOPS_RETIRED.ALL'
                elif arch in ['HSW']: evt = 'UOPS_EXECUTED.CORE'
                elif arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P']: evt = 'UOPS_EXECUTED.THREAD'
                elif arch in ['TRM']: evt = 'TOPDOWN_RETIRING.ALL'
            localHtmlReports.append('<li>' + evt + ': ' + str(value) + '</li>\n')
        localHtmlReports.append('</ul>\n</li>')

        if isAMDCPU():
            # callers read 'Core cycles'; on AMD the value comes from 'APERF'
            ret['Core cycles'] = cycles

        if arch in ['NHM', 'WSM'] and 'UOPS_PORT_3' in ret:
            # Workaround for broken port4 and port5 counters
            ret['UOPS_PORT_4'] = ret['UOPS_PORT_3']
            ret['UOPS_PORT_5'] = max(0, ret['UOPS'] - ret['UOPS_PORT_0'] - ret['UOPS_PORT_1'] - ret['UOPS_PORT_2'] - ret['UOPS_PORT_3'] - ret['UOPS_PORT_4'])

        if arch in ['SNB'] and all(('UOPS_PORT_'+str(p) in ret) for p in range(0,6)):
            # some retired uops are not executed due to, e.g., move el. and zero idioms; however, using the sum of the uops on all ports may overcount due to replays
            ret['UOPS'] = min(ret['UOPS'], sum(ret['UOPS_PORT_'+str(p)] for p in range(0,6)))

        break

    if htmlReports is not None:
        htmlReports.extend(localHtmlReports)
    return ret
|
|
|
|
|
|
def writeFile(fileName, content):
    """Overwrite the file `fileName` with `content` followed by a newline."""
    with open(fileName, "w") as outFile:
        outFile.write(content + "\n")
|
|
|
|
|
|
def getMachineCode(objFile):
    """Return the Intel-syntax disassembly of the .text section of `objFile`.

    The text is the output of `objdump -M intel -d` after the '<.text>:' header line
    (empty if that header does not occur in the output).

    If objdump fails or cannot be started, the error is printed and an empty string is
    returned. The result is concatenated into HTML reports by callers, so it must
    always be a string; returning None would raise a TypeError there.
    """
    try:
        disassembly = subprocess.check_output(['objdump', '-M', 'intel', '-d', objFile]).decode()
    except (subprocess.CalledProcessError, OSError) as e:
        print('Error (getMachineCode): ' + str(e))
        return ''
    return disassembly.partition('<.text>:\n')[2]
|
|
|
|
|
|
def getCodeLength(asmCode):
    """Assemble `asmCode` and return the size in bytes of the binary that objcopy produces from it.

    Uses fixed file names under /tmp/ramdisk (code.s, code.o, code.bin).
    """
    workDir = '/tmp/ramdisk/'
    objFile = workDir + 'code.o'
    binFile = workDir + 'code.bin'
    assemble(asmCode, objFile, asmFile=workDir + 'code.s')
    objcopy(objFile, binFile)
    return os.path.getsize(binFile)
|
|
|
|
|
|
# Groups of microarchitectures that share performance counter configurations.
_ARCHS_CON_WOL = ('CON', 'WOL')
_ARCHS_NHM_WSM = ('NHM', 'WSM')
_ARCHS_SNB_IVB = ('SNB', 'IVB')
_ARCHS_HSW_TO_CLX = ('HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX')
_ARCHS_ICL_TO_RKL = ('ICL', 'TGL', 'RKL')
_ARCHS_ADL_P_UP = ('ADL-P', 'EMR', 'MTL-P')
_ARCHS_ZEN = ('ZEN+', 'ZEN2', 'ZEN3', 'ZEN4', 'ZEN5')
_ARCHS_ZEN3_UP = ('ZEN3', 'ZEN4', 'ZEN5')
_ARCHS_HSW_TO_RKL = _ARCHS_HSW_TO_CLX + _ARCHS_ICL_TO_RKL
_ARCHS_SNB_TO_RKL = _ARCHS_SNB_IVB + _ARCHS_HSW_TO_RKL
_ARCHS_SNB_TO_MTL_P = _ARCHS_SNB_TO_RKL + _ARCHS_ADL_P_UP
_ARCHS_NHM_TO_MTL_P = _ARCHS_NHM_WSM + _ARCHS_SNB_TO_MTL_P

# event name -> ((architectures, configuration string), ...); the first entry whose
# architecture group contains `arch` wins
_EVENT_CONFIGS = {
    'UOPS': (
        (_ARCHS_CON_WOL, 'A0.00'),                      # RS_UOPS_DISPATCHED
        (('NHM', 'WSM', 'SNB'), 'C2.01'),               # UOPS_RETIRED.ANY (NHM, WSM) / UOPS_RETIRED.ALL (SNB)
        (('GLM', 'GLP', 'ADL-E', 'MTL-E'), 'C2.00'),    # UOPS_RETIRED.ALL
        (('TRM',), 'C2.00'),                            # TOPDOWN_RETIRING.ALL
        (('BNL', 'SLM', 'AMT'), 'C2.10'),               # UOPS_RETIRED.ANY
        (('HSW',), 'B1.02'),                            # UOPS_EXECUTED.CORE; note: may undercount due to erratum HSD30
        (('IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P'), 'B1.01'), # UOPS_EXECUTED.THREAD
        (_ARCHS_ZEN, '0C1.00'),
    ),
    'RETIRE_SLOTS': ((_ARCHS_NHM_TO_MTL_P, 'C2.02'),),
    'UOPS_MITE': ((_ARCHS_SNB_TO_MTL_P, '79.04'),),
    'UOPS_MITE>=1': ((_ARCHS_SNB_TO_MTL_P, '79.04.CMSK=1'),),
    'UOPS_MS': (
        (_ARCHS_NHM_WSM, 'D1.02'),
        (_ARCHS_SNB_TO_RKL, '79.30'),
        (_ARCHS_ADL_P_UP, '79.20'),
        (('SLM', 'AMT', 'GLM', 'GLP', 'TRM', 'ADL-E', 'MTL-E'), 'C2.01'),
        (('BNL',), 'A9.01'),                            # undocumented, but seems to work
    ),
    'UOPS_PORT_0': (
        (_ARCHS_CON_WOL, 'A1.01.CTR=0'),
        (_ARCHS_NHM_WSM, 'B1.01'),
        (_ARCHS_SNB_TO_RKL, 'A1.01'),
        (_ARCHS_ADL_P_UP, 'B2.01'),
    ),
    'UOPS_PORT_1': (
        (_ARCHS_CON_WOL, 'A1.02.CTR=0'),
        (_ARCHS_NHM_WSM, 'B1.02'),
        (_ARCHS_SNB_TO_RKL, 'A1.02'),
        (_ARCHS_ADL_P_UP, 'B2.02'),
    ),
    'UOPS_PORT_2': (
        (_ARCHS_CON_WOL, 'A1.04.CTR=0'),
        (_ARCHS_NHM_WSM, 'B1.04'),
        (_ARCHS_SNB_IVB, 'A1.0C'),
        (_ARCHS_HSW_TO_CLX, 'A1.04'),
    ),
    'UOPS_PORT_3': (
        (_ARCHS_CON_WOL, 'A1.08.CTR=0'),
        (_ARCHS_NHM_WSM, 'B1.08'),
        (_ARCHS_SNB_IVB, 'A1.30'),
        (_ARCHS_HSW_TO_CLX, 'A1.08'),
    ),
    'UOPS_PORT_4': (
        (_ARCHS_CON_WOL, 'A1.10.CTR=0'),
        (_ARCHS_NHM_WSM, 'B1.10'),
        (_ARCHS_SNB_IVB, 'A1.40'),
        (_ARCHS_HSW_TO_CLX, 'A1.10'),
    ),
    'UOPS_PORT_5': (
        (_ARCHS_CON_WOL, 'A1.20.CTR=0'),
        (_ARCHS_NHM_WSM, 'B1.20'),
        (_ARCHS_SNB_IVB, 'A1.80'),
        (_ARCHS_HSW_TO_RKL, 'A1.20'),
    ),
    'UOPS_PORT_6': (
        (_ARCHS_HSW_TO_RKL, 'A1.40'),
        (_ARCHS_ADL_P_UP, 'B2.40'),
    ),
    'UOPS_PORT_7': ((_ARCHS_HSW_TO_CLX, 'A1.80'),),
    'UOPS_PORT_23': ((_ARCHS_ICL_TO_RKL, 'A1.04'),),
    'UOPS_PORT_49': (
        (_ARCHS_ICL_TO_RKL, 'A1.10'),
        (_ARCHS_ADL_P_UP, 'B2.10'),
    ),
    'UOPS_PORT_78': (
        (_ARCHS_ICL_TO_RKL, 'A1.80'),
        (_ARCHS_ADL_P_UP, 'B2.80'),
    ),
    'UOPS_PORT_5B': ((_ARCHS_ADL_P_UP, 'B2.20'),),
    'UOPS_PORT_5B>=2': ((_ARCHS_ADL_P_UP, 'B2.20.CMSK=2'),),
    'UOPS_PORT_23A': ((_ARCHS_ADL_P_UP, 'B2.04'),),
    'DIV_CYCLES': (
        (_ARCHS_NHM_WSM + _ARCHS_SNB_IVB + _ARCHS_HSW_TO_CLX, '14.01'), # undocumented on HSW, but seems to work
        (_ARCHS_ICL_TO_RKL, '14.09'),
        (_ARCHS_ZEN, '0D3.00'),
        (_ARCHS_ADL_P_UP, 'B0.09.CMSK=1'),
    ),
    'ILD_STALL.LCP': ((_ARCHS_NHM_TO_MTL_P, '87.01'),),
    'INST_DECODED.DEC0': ((_ARCHS_NHM_WSM, '18.01'),),
    'FpuPipeAssignment.Total0': ((_ARCHS_ZEN, '000.01'),),
    'FpuPipeAssignment.Total1': ((_ARCHS_ZEN, '000.02'),),
    'FpuPipeAssignment.Total2': ((_ARCHS_ZEN, '000.04'),),
    'FpuPipeAssignment.Total3': ((_ARCHS_ZEN, '000.08'),),
    # the following two counters are undocumented so far, but seem to work
    'FpuPipeAssignment.Total4': ((_ARCHS_ZEN3_UP, '000.10'),),
    'FpuPipeAssignment.Total5': ((_ARCHS_ZEN3_UP, '000.20'),),
}


def getEventConfig(event):
    """Return the performance counter configuration string for `event` on the current `arch`.

    Returns None if the event is unknown or has no configuration for the architecture
    named by the global `arch`.
    """
    for archs, cfg in _EVENT_CONFIGS.get(event, ()):
        if arch in archs:
            return cfg
    return None
|
|
|
|
|
|
def configurePFCs(events):
    """Configure nanoBench to count `events`, skipping those without a configuration for the current arch.

    Each configured event contributes one line of the form '<config> <event name>'.
    """
    configLines = []
    for event in events:
        cfg = getEventConfig(event)
        if cfg is None:
            continue
        configLines.append(cfg + ' ' + event + '\n')
    setNanoBenchParameters(config=''.join(configLines), fixedCounters=True)
|
|
|
|
|
|
# One concrete instance of an instruction, as built by getInstrInstanceFromNode:
#   instrNode:   the XML instruction node
#   asm:         the assembler string with concrete operands
#   readRegs:    set of registers recorded as read (including address registers)
#   writtenRegs: set of registers recorded as written
#   opRegDict:   dict from operand index to the register chosen for that operand
#   regMemInit:  list of init instructions from getRegMemInit (empty if not computed)
InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit'])
|
|
|
|
def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, useDistinctRegs=True, opRegDict=None, memOffset=0, immediate=2,
                             computeRegMemInit=True, useIndexedAddr=False):
    """Build a concrete assembler instance of the instruction described by `instrNode`.

    Parameters:
        instrNode:         XML instruction node.
        doNotWriteRegs:    registers that must not be chosen for written operands.
        doNotReadRegs:     registers that must not be chosen for read operands.
        useDistinctRegs:   if False, try to use one common register for several operands;
                           forced to True for AVX2GATHER and AVX512_FP16 instructions.
        opRegDict:         pre-assigned registers (operand index -> register); not modified.
        memOffset:         displacement added to the address of explicit memory operands.
        immediate:         value used for 8-bit immediates (and for ENTER/RET_FAR/RET_NEAR).
        computeRegMemInit: whether to compute the init code with getRegMemInit.
        useIndexedAddr:    whether memory operands use an index register.

    Returns:
        An InstrInstance, or None if no admissible register is left for some operand.
    """
    if not doNotWriteRegs: doNotWriteRegs = []
    if not doNotReadRegs: doNotReadRegs = []
    if not opRegDict: opRegDict = {}

    if (instrNode.attrib['extension'] == 'AVX2GATHER') or instrNode.attrib['isa-set'].startswith('AVX512_FP16'):
        useDistinctRegs=True
    hasMemOperand = len(instrNode.findall('./operand[@type="mem"]'))>0

    readRegs = set()
    writtenRegs = set()
    opRegDict = dict(opRegDict) # copy, so that the caller's dict is not modified

    # first pass: operands with exactly one possible register have no choice; record them up front
    for operandNode in instrNode.iter('operand'):
        if operandNode.attrib['type'] == "reg":
            regsList = sortRegs(operandNode.text.split(','))
            if len(regsList) == 1:
                reg = regsList[0]
                opRegDict[int(operandNode.attrib['idx'])] = reg
                if operandNode.attrib.get('w', '0') == '1':
                    writtenRegs.add(reg)
                if operandNode.attrib.get('r', '0') == '1':
                    readRegs.add(reg)
        elif operandNode.attrib['type'] == "mem" and 'base' in operandNode.attrib:
            readRegs.add(operandNode.attrib['base'])

    # canonical name of the register that several operands should share (only if requested and possible)
    commonReg = None
    if not useDistinctRegs:
        commonRegs = findCommonRegisters(instrNode)
        commonRegs -= set(doNotWriteRegs)|set(doNotReadRegs)|globalDoNotWriteRegs|(memRegs if hasMemOperand else set())
        if commonRegs:
            commonReg = sortRegs(commonRegs)[0]

    asm = instrNode.attrib['asm']

    # second pass: append the explicit operands to the assembler string
    first = True
    for operandNode in instrNode.iter('operand'):
        opI = int(operandNode.attrib['idx'])

        if operandNode.attrib.get('suppressed', '0') == '1':
            continue;

        # the first operand and opmask operands are separated by a space only
        if not first and not operandNode.attrib.get('opmask', '') == '1':
            asm += ", "
        else:
            asm += " "
            first=False;

        if operandNode.attrib['type'] == "reg":
            if opI in opRegDict:
                reg = opRegDict[opI]
            else:
                regsList = operandNode.text.split(',')

                reg = None
                if commonReg:
                    for reg2 in regsList:
                        if getCanonicalReg(reg2) == commonReg:
                            reg = reg2
                            break
                if reg is None:
                    if len(regsList) > 1:
                        # exclude registers that overlap (same canonical register) with forbidden or already used ones
                        ignoreRegs = set()
                        if operandNode.attrib.get('w', '0') == '1':
                            ignoreRegs |= set(doNotWriteRegs)|globalDoNotWriteRegs|set(opRegDict.values())|(memRegs if hasMemOperand else set())
                        if operandNode.attrib.get('r', '0') == '1':
                            ignoreRegs |= set(doNotReadRegs)|writtenRegs|readRegs|set(opRegDict.values())
                        regsList = [x for x in regsList if not any(getCanonicalReg(x) == getCanonicalReg(y) for y in ignoreRegs)]
                    if not regsList:
                        return None;
                    reg = sortRegs(regsList)[0]

                # NOTE(review): the nesting of the following five lines was reconstructed; they are
                # assumed to apply only to newly chosen registers - confirm against the original file
                opRegDict[opI] = reg
                if operandNode.attrib.get('w', '0') == '1':
                    writtenRegs.add(reg)
                if operandNode.attrib.get('r', '0') == '1':
                    readRegs.add(reg)

            if not operandNode.attrib.get('opmask', '') == '1':
                asm += reg
            else:
                asm += ' {' + reg + '}'
                if instrNode.attrib.get('zeroing', '') == '1':
                    asm += '{z}'
        elif operandNode.attrib['type'] == "mem":
            asmprefix = operandNode.attrib.get('memory-prefix', '')
            asm += asmprefix
            if asmprefix != '':
                asm += ' '

            if operandNode.attrib.get('moffs', '0') == '1':
                asm += '[' + getAddress('R14') + ']'
            else:
                address = getAddrReg(instrNode, operandNode)
                readRegs.add(address)
                if useIndexedAddr or operandNode.attrib.get('VSIB', '0') != '0':
                    indexReg = getIndexReg(instrNode, operandNode)
                    address += '+' + indexReg
                    readRegs.add(indexReg)

                asm += '[' + address + ('+'+str(memOffset) if memOffset else '') + ']'

            memorySuffix = operandNode.attrib.get('memory-suffix', '')
            if memorySuffix:
                asm += ' ' + memorySuffix
        elif operandNode.attrib['type'] == 'agen':
            # address-generation operand: the letters in the 'agen' attribute select the address components
            agen = instrNode.attrib['agen']
            address = []

            if 'R' in agen: address.append('RIP')
            if 'B' in agen:
                addrReg = getAddrReg(instrNode, operandNode)
                address.append(addrReg)
                readRegs.add(addrReg)
            if 'I' in agen:
                indexReg = getIndexReg(instrNode, operandNode)
                if 'IS' in agen:
                    address.append('2*' + indexReg)
                else:
                    address.append('1*' + indexReg)
                readRegs.add(indexReg)
            if 'D8' in agen: address.append('8')
            if 'D32' in agen: address.append('128')

            asm += ' [' + '+'.join(address) + ']'
        elif operandNode.attrib['type'] == "imm":
            # if there is an immediate, the rounding/SAE specifier is placed in front of it
            if instrNode.attrib.get('roundc', '') == '1':
                asm += '{rn-sae}, '
            elif instrNode.attrib.get('sae', '') == '1':
                asm += '{sae}, '
            width = int(operandNode.attrib['width'])
            if operandNode.text:
                imm = operandNode.text
            elif (width == 8 or instrNode.attrib['iclass'] in ['ENTER', 'RET_FAR', 'RET_NEAR']):
                imm = immediate
            else:
                # a value that does not fit into the next smaller immediate width
                imm = 1 << (width-8)
            asm += str(imm)
        elif operandNode.attrib['type'] == "relbr":
            # relative branches jump to the local label '1', which is appended below
            asm += "1f"

    # otherwise the rounding/SAE specifier goes at the end
    if not 'sae' in asm:
        if instrNode.attrib.get('roundc', '') == '1':
            asm += ', {rn-sae}'
        elif instrNode.attrib.get('sae', '') == '1':
            asm += ', {sae}'

    if '1f' in asm:
        asm = asm + '; 1: '

    regMemInit = []
    if computeRegMemInit: regMemInit = getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr)
    return InstrInstance(instrNode, asm, readRegs, writtenRegs, opRegDict, regMemInit)
|
|
|
|
def createIacaAsmFile(fileName, prefixInstr, prefixRep, instr):
    """Write an assembler file for IACA to `fileName`.

    The file contains, between the two marker byte sequences, `prefixRep` copies of
    `prefixInstr` (if non-empty), followed by `instr` and the local label '1'.
    """
    parts = ['.intel_syntax noprefix\n .byte 0x0F, 0x0B; mov ebx, 111; .byte 0x64, 0x67, 0x90\n']
    if prefixInstr:
        parts.extend(prefixInstr + "\n" for _ in range(prefixRep))
    parts.append(instr + "\n")
    parts.append("1:\n")
    parts.append('mov ebx, 222; .byte 0x64, 0x67, 0x90; .byte 0x0F, 0x0B\n')
    writeFile(fileName, ''.join(parts))
|
|
|
|
|
|
def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstrRep, blockedPorts, config, htmlReports):
    """Determine how many uops of the instruction use the ports in `blockedPorts`.

    The instruction (config.independentInstrs[0]) is combined with `blockInstrRep`
    blocking instructions generated from `blockInstrNode`; the combination is either
    analysed with IACA (if useIACA is set) or measured with nanoBench.

    Parameters:
        instrNode:       XML node of the instruction under test (passed on to runExperiment).
        useDistinctRegs: not used in this function.
        blockInstrNode:  XML node of the blocking instruction.
        blockInstrRep:   number of blocking instructions.
        blockedPorts:    collection of port names (strings such as '5').
        config:          object providing independentInstrs, init and preInstrCode.
        htmlReports:     list to which HTML fragments are appended.

    Returns:
        The number of uops as an int, or None if IACA failed or fewer uops than
        `blockInstrRep` were observed on the ports.
    """
    instrInstance = config.independentInstrs[0]
    instr = instrInstance.asm
    readRegs = instrInstance.readRegs
    writtenRegs = instrInstance.writtenRegs

    if debugOutput: print(' instr: ' + instr + 'rR: ' + str(readRegs) + ', wR: ' + str(writtenRegs))
    # the blocking instructions must not use the registers of the instruction under test
    blockInstrsList = getIndependentInstructions(blockInstrNode, True, False, writtenRegs|readRegs, writtenRegs|readRegs|memRegs, 64)
    if debugOutput: print(' bIL: ' + str(blockInstrsList))

    htmlReports.append('<hr><h3>With blocking instructions for port' +
                       ('s {' if len(blockedPorts)>1 else ' ') +
                       str(sorted(blockedPorts))[1:-1] +
                       ('}' if len(blockedPorts)>1 else '') + ':</h3>')

    if useIACA:
        createIacaAsmFile("/tmp/ramdisk/asm.s", ';'.join(islice(cycle(x.asm for x in blockInstrsList), blockInstrRep)), 1, instr)

        try:
            subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o'])
            iacaOut = subprocess.check_output(iacaCMDLine + (['-analysis', 'THROUGHPUT'] if iacaVersion=='2.1' else []) + ['/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT).decode()
        except subprocess.CalledProcessError as e:
            logging.warning('Error: ' + e.output.decode())
            htmlReports.append('<pre>' + e.output.decode() + '</pre>')
            return None

        htmlReports.append('<pre>' + iacaOut + '</pre>')

        if not iacaOut or ' !' in iacaOut or ' X' in iacaOut or ' 0X' in iacaOut or not 'Total Num Of Uops' in iacaOut:
            print('IACA error')
            return None

        # line with the totals for all instructions, and the line of the instruction under test
        allPortsLine = re.search(r'\| Cycles \|.*', iacaOut).group(0)
        instrPortsLine = iacaOut.split('\n')[-3]

        allUopsOnBlockedPorts = 0.0
        instrUopsOnBlockedPorts = 0.0

        for p in blockedPorts:
            # the column of port p is the (p+2)-th '|'-separated field
            allPortsCol = allPortsLine.split('|')[int(p)+2].split()
            allUopsOnBlockedPorts += float(allPortsCol[0])

            instrPortsCol = instrPortsLine.split('|')[int(p)+2].split()
            if instrPortsCol:
                instrUopsOnBlockedPorts += float(instrPortsCol[0])

        if allUopsOnBlockedPorts < blockInstrRep-.5:
            # something went wrong; fewer uops on ports than blockInstrRep
            # happens, e.g., on SKX for ports {0, 1} if AVX-512 is active
            return None

        return int(.2+instrUopsOnBlockedPorts)
    else:
        if isIntelCPU():
            if arch in ['NHM', 'WSM']:
                # Needed for workaround for broken port 5 counter
                events = ['UOPS_PORT_'+str(p) for p in range(0,6)] + ['UOPS']
            else:
                events = ['UOPS_PORT_'+str(p) for p in blockedPorts]

            if (arch in ['ADL-P', 'EMR', 'MTL-P']) and ('5' in blockedPorts): # note that any port combination that contains B also contains 5
                events += ['UOPS_PORT_5B']
                if 'B' not in blockedPorts:
                    events += ['UOPS_PORT_5B>=2']
        else:
            if arch in ['ZEN+', 'ZEN2']:
                events = ['FpuPipeAssignment.Total'+str(p) for p in range(0,4)]
            elif arch in ['ZEN3', 'ZEN4', 'ZEN5']:
                events = ['FpuPipeAssignment.Total'+str(p) for p in range(0,6)]

        configurePFCs(events)

        blockInstrAsm = ';'.join(islice(cycle(x.asm for x in blockInstrsList), blockInstrRep))

        unrollCount = 1000//blockInstrRep # make sure that instrs. fit into icache
        if isAMDCPU(): unrollCount = max(unrollCount, 100) # ZEN+ sometimes undercounts FP usage if code is short

        init = list(chain.from_iterable([x.regMemInit for x in blockInstrsList])) + instrInstance.regMemInit + config.init

        htmlReports.append('<ul>\n')
        measurementResult = runExperiment(instrNode, blockInstrAsm + ';' + config.preInstrCode + ';' + instr, init=init, unrollCount=unrollCount,
                                          basicMode=True, htmlReports=htmlReports)
        htmlReports.append('</ul>\n')

        if 'UOPS_PORT_5B>=2' in measurementResult:
            # port 5 only: uops on ports 5 and B together, minus the value of the counter with CMSK=2
            measurementResult['UOPS_PORT_5'] = measurementResult['UOPS_PORT_5B'] - measurementResult['UOPS_PORT_5B>=2']
            del measurementResult['UOPS_PORT_5B>=2']
            del measurementResult['UOPS_PORT_5B']
        elif 'UOPS_PORT_5B' in measurementResult:
            # in the following, only the sum of the uops on ports 5 and B matters, not how they are distributed
            measurementResult['UOPS_PORT_5'] = measurementResult['UOPS_PORT_5B']
            del measurementResult['UOPS_PORT_5B']

        # map port names (the part of the event name after the prefix) to the measured uops
        if isIntelCPU():
            ports_dict = {p[10:]: i for p, i in measurementResult.items() if p.startswith('UOPS_PORT')}
        else:
            ports_dict = {p[23:]: i for p, i in measurementResult.items() if 'FpuPipeAssignment.Total' in p}

        if sum(ports_dict.values()) < blockInstrRep-.5:
            # something went wrong; fewer uops on ports than blockInstrRep
            # happens, e.g., on SKX for ports {0, 1} if AVX-512 is active
            return None

        # subtract the uops of the blocking instructions
        return int(.2+sum([uops for p, uops in ports_dict.items() if p in blockedPorts])) - blockInstrRep
|
|
|
|
|
|
# Takes an instrNode and returns a list [instrI, instrI', ...] s.t. instrI(')* are the results of
# calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep.
def getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, doNotReadRegs=None, doNotWriteRegs=None, initialOffset=0, immediate=2):
    """Return a list of mutually independent instances of the instruction `instrNode`.

    Parameters:
        instrNode:       XML instruction node.
        useDistinctRegs: passed on to getInstrInstanceFromNode.
        useIndexedAddr:  passed on to getInstrInstanceFromNode.
        doNotReadRegs:   optional iterable of registers that must not be read.
        doNotWriteRegs:  optional iterable of registers that must not be written.
        initialOffset:   memory offset of the first instance; each further instance is
                         moved by the width (in bytes) of the widest memory operand.
        immediate:       passed on to getInstrInstanceFromNode.

    The arguments `doNotReadRegs` and `doNotWriteRegs` are never modified; the function
    works on private copies.

    Returns:
        A list with at most maxTPRep InstrInstance objects. If no instance with distinct
        registers can be created, the list contains the result of a single call to
        getInstrInstanceFromNode with useDistinctRegs=False.
    """
    hasMemOperand = len(instrNode.findall('./operand[@type="mem"]')) > 0

    # copy the caller's collections: they are extended below and must not be changed in place
    doNotReadRegs = set(doNotReadRegs or ()) | specialRegs
    doNotWriteRegs = set(doNotWriteRegs or ()) | globalDoNotWriteRegs | specialRegs
    if hasMemOperand:
        doNotWriteRegs |= memRegs

    for opNode in instrNode.iter('operand'):
        if opNode.attrib['type'] == 'reg':
            regs = sortRegs(opNode.text.split(","))
            if len(regs) == 1:
                # fixed registers are not available for the other operands
                doNotReadRegs.add(regs[0])
                doNotWriteRegs.add(regs[0])
            if len(regs) >= 8 and 'RAX' in map(regTo64, regs):
                #avoid RAX register if possible as some instructions have a special encoding for this
                doNotReadRegs.add('RAX')
                doNotWriteRegs.add('RAX')

    independentInstructions = []
    offset = initialOffset

    for _ in range(maxTPRep):
        instrI = getInstrInstanceFromNode(instrNode, doNotWriteRegs, doNotReadRegs, useDistinctRegs, {}, offset, immediate=immediate, useIndexedAddr=useIndexedAddr)
        if not instrI:
            break

        if instrI in independentInstructions:
            break

        # the next instance uses the memory directly behind the widest memory operand
        maxMemWidth = max([0] + [int(memNode.attrib.get('width', '0')) // 8 for memNode in instrNode.findall('./operand[@type="mem"]')])
        offset += maxMemWidth

        independentInstructions.append(instrI)

        # later instances must not write anything this instance uses, and must not read what it writes
        doNotWriteRegs = doNotWriteRegs | instrI.writtenRegs | instrI.readRegs
        doNotReadRegs = doNotReadRegs | instrI.writtenRegs

    if not independentInstructions:
        instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=False, immediate=immediate, useIndexedAddr=useIndexedAddr)
        independentInstructions.append(instrI)

    return independentInstructions
|
|
|
|
# Returns True iff there are two operands that can use the same register, all reg. operands are non-suppressed, there are no memory operands, and the
# instruction does not use the divider
def hasCommonRegister(instrNode):
    """Return whether two register operands of `instrNode` can use the same register.

    Always False for gather/scatter instructions, instructions with a memory operand or
    a suppressed register operand, and div/sqrt instructions.
    """
    category = instrNode.attrib['category']
    excluded = (
        any(kind in category for kind in ('GATHER', 'SCATTER'))
        or instrNode.find('./operand[@type="mem"]') is not None
        or instrNode.find('./operand[@type="reg"][@suppressed="1"]') is not None
        or isDivOrSqrtInstr(instrNode)
    )
    if excluded:
        return False
    return len(findCommonRegisters(instrNode)) > 0
|
|
|
|
def findCommonRegisters(instrNode):
    """Return the canonical registers shared by the first pair of register operands that has any.

    Pairs in which one operand allows a high-8 register and the other a low-8 register
    are skipped. Returns an empty set if no pair of operands shares a register.
    """
    regNodes = instrNode.findall('./operand[@type="reg"]')
    for firstIdx, firstNode in enumerate(regNodes):
        firstRegs = set(firstNode.text.split(","))
        firstCanonical = set(map(getCanonicalReg, firstRegs))
        for secondIdx, secondNode in enumerate(regNodes):
            if secondIdx == firstIdx:
                continue
            secondRegs = set(secondNode.text.split(","))
            secondCanonical = set(map(getCanonicalReg, secondRegs))

            firstHighSecondLow = firstRegs.intersection(High8Regs) and secondRegs.intersection(Low8Regs)
            if firstHighSecondLow or (secondRegs.intersection(High8Regs) and firstRegs.intersection(Low8Regs)):
                continue

            shared = firstCanonical.intersection(secondCanonical)
            if shared:
                return shared
    return set()
|
|
|
|
def hasExplicitNonVSIBMemOperand(instrNode):
    """Return True iff instrNode has a memory operand that is neither suppressed nor a VSIB operand.

    An operand counts as suppressed if its 'suppressed' attribute is '1', and as a VSIB
    operand if its 'VSIB' attribute is present and different from '0'.
    """
    return any(opNode.attrib.get('suppressed', '') != '1' and opNode.attrib.get('VSIB', '0') == '0'
               for opNode in instrNode.findall('./operand[@type="mem"]'))
|
|
|
|
def getThroughputIacaNoInteriteration(instrNode, htmlReports):
    """Return the throughput (in cycles) that IACA reports for instrNode with -no_interiteration.

    The instruction is instantiated with distinct registers, assembled with `as`, and
    analyzed with the command in iacaCMDLine. The IACA output (or the error output) is
    appended to htmlReports. Returns None if assembling or IACA fails, or if the IACA
    output looks invalid.
    """
    createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode, useDistinctRegs=True).asm)
    try:
        subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o'])
        iaca_tp = subprocess.check_output(iacaCMDLine + (['-analysis', 'THROUGHPUT'] if iacaVersion=='2.1' else []) + ['-no_interiteration', '/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as e:
        errorOutput = e.output.decode()
        logging.warning('Error: ' + errorOutput)
        htmlReports.append('<pre>' + errorOutput + '</pre>\n')
        return None

    if debugOutput:
        print(instrNode.attrib['iform'] + ' - NoInteriteration')
        print(iaca_tp)

    htmlReports.append('<pre>' + iaca_tp + '</pre>\n')

    if not iaca_tp or ' !' in iaca_tp or ' X' in iaca_tp or ' 0X' in iaca_tp or not 'Total Num Of Uops' in iaca_tp:
        print('IACA error')
        return None

    # the throughput is the third whitespace-separated token on the fourth line of the IACA output
    cycles = float(iaca_tp.split('\n')[3].split()[2])
    return cycles
|
|
|
|
class TPConfig:
    """Configuration for one throughput measurement of an instruction.

    Attributes (all set from the constructor arguments of the same name):
      independentInstrs -- list of instruction instances that are measured together
      depBreakingInstrs -- assembler code (string) with dependency-breaking instructions
      init              -- list of assembler code strings that are used as initialization code
      preInstrCode      -- assembler code (string) that is inserted before each measured instruction
      preInstrNodes     -- list of XML instruction nodes corresponding to preInstrCode
      note              -- human-readable description of the configuration
    """

    def __init__(self, independentInstrs=None, depBreakingInstrs='', init=None, preInstrCode='', preInstrNodes=None, note=''):
        # None is used as default for the list parameters so that no list is shared between instances
        self.independentInstrs = ([] if independentInstrs is None else independentInstrs)
        self.depBreakingInstrs = depBreakingInstrs
        self.init = ([] if init is None else init)
        self.preInstrCode = preInstrCode
        self.preInstrNodes = ([] if preInstrNodes is None else preInstrNodes)
        self.note = note

    def __repr__(self):
        return ('TPConfig(independentInstrs={!r}, depBreakingInstrs={!r}, init={!r}, preInstrCode={!r}, preInstrNodes={!r}, note={!r})'
                .format(self.independentInstrs, self.depBreakingInstrs, self.init, self.preInstrCode, self.preInstrNodes, self.note))
|
|
|
|
def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeIndepAndDepBreakingInstrs=True):
    """Return the list of TPConfig objects that should be used for measuring the throughput of instrNode.

    Divider/sqrt instructions are delegated to getTPConfigsForDiv(). Some instructions (CPUID,
    conditional jumps, JRCXZ, 32-bit register MOVs, LOOP/REP instructions) get several configs
    with different initial register/flag values; all other instructions get exactly one config.

    If computeIndepAndDepBreakingInstrs is False, the configs are created with an empty list of
    independent instructions and without dependency-breaking instructions (unless a special
    case below sets config.independentInstrs explicitly).
    """
    if isDivOrSqrtInstr(instrNode):
        return getTPConfigsForDiv(instrNode)

    iform = instrNode.attrib['iform']
    iclass = instrNode.attrib['iclass']

    independentInstrs = []
    depBreakingInstrs = ''
    if computeIndepAndDepBreakingInstrs:
        independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr)
        depBreakingInstrs = getDependencyBreakingInstrsForSuppressedOperands(instrNode)

    # instructions with multiple configs
    if iclass == 'CPUID':
        configs = []
        cpu = cpuid.CPUID()
        # one config for each leaf from the first leaf of a range up to one past the maximum leaf that the CPU reports for this range
        for eax in (0x0, 0x80000000):
            maxEax = cpu(eax)[0]
            while eax <= maxEax + 1:
                preInstrCode = 'mov EAX, {}; mov ECX, 0'.format(hex(eax))
                preInstrNodes = [instrNodeDict['MOV (R32, I32)'], instrNodeDict['MOV (R32, I32)']]
                note = 'With EAX={}, and ECX=0'.format(hex(eax))
                configs.append(TPConfig(independentInstrs=independentInstrs, preInstrCode=preInstrCode, preInstrNodes=preInstrNodes, note=note))
                eax += 1
        return configs

    # 0x8D5 is the mask of the status flags CF, PF, AF, ZF, SF, and OF in RFLAGS
    if iclass in ['JB', 'JBE', 'JLE', 'JNB', 'JNBE', 'JNLE', 'JNO', 'JNP', 'JNS', 'JNZ', 'JO', 'JP', 'JS', 'JZ']:
        config0 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; and qword ptr [RSP], ~0x8D5; popfq'], note='With all flags set to 0')
        config1 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; or qword ptr [RSP], 0x8D5; popfq'], note='With all flags set to 1')
        return [config0, config1]

    if iclass in ['JL', 'JNL']:
        config0 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; and qword ptr [RSP], ~0x8D5; popfq'], note='With SF=OF')
        # 0x80 is the SF bit
        config1 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; and qword ptr [RSP], ~0x8D5; or qword ptr [RSP], 0x80; popfq'], note='With SF!=OF')
        return [config0, config1]

    if iclass in ['JRCXZ']:
        config0 = TPConfig(independentInstrs=independentInstrs, init=['mov RCX, 0'], note='With RCX=0')
        config1 = TPConfig(independentInstrs=independentInstrs, init=['mov RCX, 1'], note='With RCX=1')
        return [config0, config1]

    if instrNode.attrib['string'] in ['MOV_89 (R32, R32)', 'MOV_8B (R32, R32)']:
        # see https://github.com/andreas-abel/nanoBench/issues/26
        config0 = TPConfig(independentInstrs)
        config1 = TPConfig(independentInstrs, init=['xor {0}, {0}'.format(instrI.opRegDict[2]) for instrI in independentInstrs], note='With "clean" inputs')
        return [config0, config1]

    if 'LOOP' in iform or 'REP' in iform:
        configs = []
        for regVal in ['0', '1', '2']:
            preInstrCode = 'mov RCX, '+regVal
            preInstrNodes = [instrNodeDict['MOV (R64, I32)']]
            note = 'With RCX='+regVal
            config = TPConfig(independentInstrs=independentInstrs, preInstrCode=preInstrCode, preInstrNodes=preInstrNodes, note=note)
            if instrNode.attrib['category'] in ['IOSTRINGOP']:
                config.init = ['mov DX, 0x80']
            configs.append(config)
        return configs

    # instructions with one config
    preInstrCode, preInstrNodes = getPreInstr(instrNode)
    config = TPConfig(independentInstrs, depBreakingInstrs, [], preInstrCode, preInstrNodes)

    if re.search('BT.*MEMv_GPRv', iform):
        # set all registers that are read to 0 (except for registers that must not be written, and registers used for memory addresses)
        config.init = list(set('mov ' + regTo64(r) + ', 0' for i in independentInstrs for r in i.readRegs if not regTo64(r) in globalDoNotWriteRegs|memRegs))

    if iform in ['CALL_NEAR_GPRv', 'JMP_GPRv']:
        # the target register has to be RAX, as this is the register that the code returned by getPreInstr() writes the target address to
        config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1: 'RAX'})]

    if iform in ['CALL_NEAR_MEMv', 'JMP_MEMv']:
        config.independentInstrs = [getInstrInstanceFromNode(instrNode)]

    if iclass == 'FXRSTOR': config.init = ['FXSAVE [R14]']
    if iclass == 'FXRSTOR64': config.init = ['FXSAVE64 [R14]']

    if iform in ['IN_AL_IMMb', 'IN_OeAX_IMMb', 'OUT_IMMb_AL', 'OUT_IMMb_OeAX']:
        config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, immediate=0x80)

    if iform in ['IN_AL_DX', 'IN_OeAX_DX', 'OUT_DX_AL', 'OUT_DX_OeAX'] or instrNode.attrib['category'] in ['IOSTRINGOP']:
        config.init = ['mov DX, 0x80']

    if iform == 'LLDT_GPR16': config.init = list(set('SLDT ' + reg for i in independentInstrs for reg in i.readRegs))
    if iform == 'LMSW_GPR16': config.init = list(set('SMSW ' + reg for i in independentInstrs for reg in i.readRegs))
    if iform == 'LMSW_MEMw': config.init = list(['SMSW [R14+'+str(i*2)+']' for i in range(0,maxTPRep)])

    if iform == 'MOVDIR64B_GPRa_MEM':
        config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1: 'RSI'})]

    if iform == 'POPF':
        config.init = ['PUSHF; POP AX']
    if iform == 'POPFQ':
        config.init = ['PUSHFQ; pop RAX']

    if iform in ['RDMSR', 'WRMSR']: config.init = ['MOV RCX, 0xE7'] # TSC Frequency Clock Counter
    if iform == 'WRMSR': config.init += ['rdmsr'] # without this, Zen4 hangs ("smpboot: Scheduler frequency invariance went wobbly, disabling!")
    if iform in ['RDPMC']: config.init = ['MOV RCX, 0']

    if iform == 'RET_NEAR_IMMw':
        config.independentInstrs = [getInstrInstanceFromNode(instrNode, immediate=8)]

    if iclass == 'XGETBV': config.init = ['XOR ECX, ECX']
    if iclass == 'XSETBV': config.init = ['XOR ECX, ECX; XGETBV']
    if iclass == 'XRSTOR': config.init = ['XSAVE [R14]']
    if iclass == 'XRSTORS': config.init = ['XSAVES [R14]']
    if iclass == 'XRSTOR64': config.init = ['XSAVE64 [R14]']
    if iclass == 'XRSTORS64': config.init = ['XSAVES64 [R14]']

    return [config]
|
|
|
|
def getPreInstr(instrNode):
    """Return a tuple (preInstrCode, preInstrNodes) for instrNode.

    preInstrCode is assembler code that has to be executed before each instance of the
    instruction (e.g., to set up a branch target or a return address); preInstrNodes is the
    list of the XML nodes of the instructions in preInstrCode. For instructions that need no
    such code, ('', None) is returned.
    """
    iform = instrNode.attrib['iform']
    preInstrCode = ''
    preInstrNodes = None

    # the RIP-relative offsets below are chosen by the original author
    # NOTE(review): presumably they make the target the instruction following the measured instruction -- confirm against the encodings
    if iform in ['CALL_NEAR_GPRv', 'JMP_GPRv']:
        preInstrCode = 'lea RAX, [RIP+2]'
        preInstrNodes = [instrNodeDict['LEA_R_D32 (R64)']]

    if iform in ['CALL_NEAR_MEMv', 'JMP_MEMv']:
        preInstrCode = 'lea RAX, [RIP+6]; mov [R14], RAX'
        preInstrNodes = [instrNodeDict['LEA_R_D32 (R64)'], instrNodeDict['MOV (M64, R64)']]

    if iform == 'LEAVE':
        preInstrCode = 'lea RBP, [R14]'
        preInstrNodes = [instrNodeDict['LEA_B (R64)']]

    if iform == 'POPF':
        preInstrCode = 'PUSH AX'
        preInstrNodes = [instrNodeDict['PUSH (R16)']]

    if iform == 'POPFQ':
        preInstrCode = 'PUSH RAX'
        preInstrNodes = [instrNodeDict['PUSH (R64)']]

    if iform == 'RET_NEAR':
        preInstrCode = 'lea RAX, [RIP+5]; mov [RSP], RAX'
        preInstrNodes = [instrNodeDict['LEA_R_D32 (R64)'], instrNodeDict['MOV (M64, R64)']]

    if iform == 'RET_NEAR_IMMw':
        preInstrCode = 'lea RAX, [RIP+7]; mov [RSP], RAX'
        preInstrNodes = [instrNodeDict['LEA_R_D32 (R64)'], instrNodeDict['MOV (M64, R64)']]

    return (preInstrCode, preInstrNodes)
|
|
|
|
# Returns [minConfig, maxConfig]
def getTPConfigsForDiv(instrNode):
    """Return two TPConfigs for a division or square-root instruction.

    minConfig ('Fast division') uses operand values for which the operation is expected to
    be fast (e.g., a divisor of 1); maxConfig ('Slow division') uses operand values that
    were chosen by the original author to make the operation slow.

    Three groups of instructions are handled: DIV/IDIV, the floating-point division
    instructions, and the (reciprocal) square-root instructions. For any other instruction,
    both configs are returned with only their note set.
    """
    memDivisor = len(instrNode.findall('./operand[@type="mem"]'))>0
    iclass = instrNode.attrib['iclass']

    minConfig = TPConfig(note='Fast division')
    maxConfig = TPConfig(note='Slow division')

    if iclass in ['DIV', 'IDIV']:
        for op in instrNode.iter('operand'):
            # everything below refers to the explicit (i.e., non-suppressed) operand, which is the divisor
            if op.attrib.get('suppressed', '0') == '0':
                memDivisor = op.attrib['type'] == 'mem'
                width = int(op.attrib['width'])
                if width == 8:
                    maxConfig.preInstrCode = 'MOV AX, 13057'
                    maxConfig.preInstrNodes = [instrNodeDict['MOV (R16, I16)']]
                    maxDivisor = '123'
                elif width == 16:
                    maxConfig.preInstrCode = 'MOV AX, 133; MOV DX, 0x343a'
                    maxConfig.preInstrNodes = [instrNodeDict['MOV (R16, I16)'], instrNodeDict['MOV (R16, I16)']]
                    maxDivisor = '0x75e6'
                elif width == 32:
                    maxConfig.preInstrCode = 'MOV EAX, 133; MOV EDX, 0x343a9ed7'
                    maxConfig.preInstrNodes = [instrNodeDict['MOV (R32, I32)'], instrNodeDict['MOV (R32, I32)']]
                    maxDivisor = '0x75e6e44f'
                else:
                    maxConfig.preInstrCode = 'MOV RAX, 133; MOV RDX, 0x343a9ed744556677'
                    maxConfig.preInstrNodes = [instrNodeDict['MOV (R64, I32)'],instrNodeDict['MOV (R64, I64)']]
                    maxDivisor = '0x75e6e44fccddeeff'

                if memDivisor:
                    memPrefix = instrNode.findall('./operand[@type="mem"]')[0].attrib['memory-prefix']
                    minConfig.init = ['MOV ' + memPrefix + ' [R14], 1']
                    # the divisor is first loaded into (the appropriately sized part of) R8, and then stored to memory
                    maxConfig.init = ['MOV ' + regToSize('R8', width) + ', ' + maxDivisor + '; MOV ' + memPrefix + ' [R14], ' + regToSize('R8', width)]
                    instrI = getInstrInstanceFromNode(instrNode)
                else:
                    minConfig.init = ['MOV ' + regToSize('RBX', width) + ', 1']
                    maxConfig.init = ['MOV ' + regToSize('RBX', width) + ', ' + maxDivisor]
                    instrI = getInstrInstanceFromNode(instrNode, opRegDict={int(op.attrib['idx']):regToSize('RBX', width)})

                minConfig.independentInstrs = [instrI]
                maxConfig.independentInstrs = [instrI]
                # the dividend (in RDX:RAX) is overwritten by the instruction, so it has to be set again before each instance
                minConfig.init += ['MOV RAX, 0; MOV RDX, 0']
                minConfig.preInstrCode = 'MOV RAX, 0; MOV RDX, 0'
                minConfig.preInstrNodes = [instrNodeDict['MOV (R64, I32)'], instrNodeDict['MOV (R64, I32)']]
    elif iclass in ['DIVSS', 'DIVPS', 'DIVSD', 'DIVPD', 'VDIVSS', 'VDIVPS', 'VDIVSD', 'VDIVPD', 'VDIVSH', 'VDIVPH']:
        dataType = iclass[-1]
        if dataType == 'D':
            maxDividend = '0x429da724b687da66' # 8.1509281715106E12
            maxDivisor = '0x3ff33e97f934078b' # 1.20278165192619
            minDividend = '0x3ff0000000000000' # 1.0
            minDivisor = '0x3ff0000000000000' # 1.0
        elif dataType == 'S':
            maxDividend = '0x54ed392654ed3926' # 8.15093E12 in high and low 32-bit
            maxDivisor = '0x3f99f4c03f99f4c0' # 1.20278 in high and low 32-bit
            minDividend = '0x3f8000003f800000' # 1.0 in high and low 32-bit
            minDivisor = '0x3f8000003f800000' # 1.0 in high and low 32-bit
        else: # dataType == 'H'
            # ToDo: find better values
            maxDividend = '0x2769276927692769' # 0.02895 in all 4 16-bit blocks
            maxDivisor = '0x3CCF3CCF3CCF3CCF' # 1.203 in all 4 16-bit blocks
            minDividend = '0x3C003C003C003C00' # 1.0 in all 4 16-bit blocks
            minDivisor = '0x3C003C003C003C00' # 1.0 in all 4 16-bit blocks

        for config, dividend, divisor in [(maxConfig, maxDividend, maxDivisor), (minConfig, minDividend, minDivisor)]:
            config.init = ['MOV RAX, ' + dividend]
            config.init += ['MOV RBX, ' + divisor]
            # [R14, R14+64) is filled with copies of the divisor, [R14+64, R14+128) with copies of the dividend
            for i in range(0, 64, 8): config.init += ['MOV [R14+' + str(i) + '], RBX']
            for i in range(64, 128, 8): config.init += ['MOV [R14+' + str(i) + '], RAX']

            if instrNode.attrib['iclass'] in ['DIVSS', 'DIVPS', 'DIVSD', 'DIVPD']:
                # XMM0: divisor, XMM1: dividend, XMM2: copy of the dividend that is overwritten by the instruction
                config.init += ['MOVUP' + dataType + ' XMM0, [R14]']
                config.init += ['MOVUP' + dataType + ' XMM1, [R14+64]']
                config.init += ['MOVUP' + dataType + ' XMM2, XMM1']
                # the destination register is also the dividend, so the dividend has to be restored before each instance
                config.preInstrCode = 'MOVUP' + dataType + ' XMM2, XMM1; '
                config.preInstrNodes = [instrNodeDict['MOVUP' + dataType + '_0F10 (XMM, XMM)']]
                config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1:'XMM2', 2:'XMM0'})]
            else:
                regType = 'XMM'
                if 'YMM' in instrNode.attrib['iform']: regType = 'YMM'
                if 'ZMM' in instrNode.attrib['iform']: regType = 'ZMM'

                nOperands = len(instrNode.findall('./operand'))

                dividendReg = regType + '0'
                divisorReg = regType + '1'

                config.init += ['VMOVUP' + ('S' if (dataType == 'S') else 'D') + ' ' + dividendReg + ', [R14+64]']
                config.init += ['VMOVUP' + ('S' if (dataType == 'S') else 'D') + ' ' + divisorReg + ', [R14]']

                # the last two operands are the dividend and the divisor; registers 2 to 9 are used as independent destinations
                config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1:regType+str(reg), (nOperands-1):dividendReg, nOperands:divisorReg})
                                            for reg in range(2, 10)]
    elif instrNode.attrib['iclass'] in ['SQRTSS', 'SQRTPS', 'SQRTSD', 'SQRTPD', 'RSQRTSS', 'RSQRTPS', 'VSQRTSS', 'VSQRTPS', 'VSQRTSD', 'VSQRTPD','VRSQRTSS',
                                        'VRSQRTPS', 'VRSQRT14SS', 'VRSQRT14SD', 'VRSQRT14PS', 'VRSQRT14PD', 'VSQRTSH', 'VSQRTPH', 'VRSQRTSH', 'VRSQRTPH']:
        dataType = instrNode.attrib['iclass'][-1]

        if dataType == 'D':
            maxArg = '0x465a61fe1acdc21c' # 8.3610378602352937E30
            minArg = '0x3ff0000000000000' # 1.0
        elif dataType == 'S':
            maxArg = '0x72d30ff172d30ff1' # 8.36104E30 in high and low 32-bit
            minArg = '0x3f8000003f800000' # 1.0 in high and low 32-bit
        else: # dataType == 'H'
            # ToDo: find better values
            maxArg = '0x1698169816981698' # 0.00161 in all 4 16-bit blocks
            minArg = '0x3C003C003C003C00' # 1.0 in all 4 16-bit blocks

        instrPrefix = ''
        if instrNode.attrib['iclass'].startswith('V'): instrPrefix = 'V'

        for arg, config in [(maxArg, maxConfig), (minArg, minConfig)]:
            regType = 'XMM'
            if 'YMM' in instrNode.attrib['iform']: regType = 'YMM'
            if 'ZMM' in instrNode.attrib['iform']: regType = 'ZMM'

            config.init = ['MOV RAX, ' + arg]
            for i in range(0, getRegSize(regType)//8, 8): config.init += ['MOV [R14+' + str(i) + '], RAX']

            # the target is the register operand of type regType with the smallest index, the source the one with the largest index
            targetRegIdx = min(int(opNode.attrib['idx']) for opNode in instrNode.findall('./operand') if opNode.text and regType in opNode.text)
            if memDivisor:
                instrs = [getInstrInstanceFromNode(instrNode, opRegDict={targetRegIdx:regType+str(reg)}) for reg in range(2, 10)]
            else:
                sourceReg = regType + '0'
                config.init += [instrPrefix + 'MOVUP' + ('S' if (dataType == 'S') else 'D') + ' ' + sourceReg + ', [R14]']
                sourceRegIdx = max(int(opNode.attrib['idx']) for opNode in instrNode.findall('./operand') if opNode.text and regType in opNode.text)
                instrs = [getInstrInstanceFromNode(instrNode, opRegDict={targetRegIdx:regType+str(reg), sourceRegIdx: sourceReg}) for reg in range(2, 10)]

            config.independentInstrs = instrs

    return [minConfig, maxConfig]
|
|
|
|
|
|
# rounds to the nearest multiple of 1/5, 1/4, or 1/3 (in that order) if the value is at most 0.015 smaller or larger than this multiple;
# otherwise rounds to two decimals
def fancyRound(cycles, denominators=(5, 4, 3), tolerance=0.015):
    """Round cycles to a "nice" fraction if it is close to one, and to two decimals otherwise.

    For each d in denominators (in the given order), the multiple of 1/d that is nearest to
    cycles is computed and rounded to two decimals; the first such value that differs from
    cycles by at most tolerance is returned. If there is no such value, cycles rounded to two
    decimals is returned. With the default arguments, the behavior is as described in the
    comment above.
    """
    for d in denominators:
        candidate = round(round(cycles*d)/d, 2)
        if abs(candidate-cycles) <= tolerance:
            return candidate
    return round(cycles, 2)
|
|
|
|
|
|
# Result of a throughput measurement; see getThroughputAndUops() for how the fields are filled
TPResult = namedtuple('TPResult', ['TP', 'TP_loop', 'TP_noLoop', 'TP_noDepBreaking_noLoop', 'TP_single', 'uops', 'fused_uops', 'uops_MITE', 'uops_MS', 'divCycles',
                                   'ILD_stalls', 'complexDec', 'nAvailableSimpleDecoders', 'config', 'unblocked_ports', 'all_used_ports'])
|
|
|
|
# returns TPResult
# port usages are averages (when no ports are blocked by other instructions)
def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports):
    """Determine throughput, uop counts, and port usage of instrNode.

    If useIACA is set, the values are obtained by running IACA on the first config returned
    by getTPConfigs(); otherwise, they are measured with runExperiment() for all configs,
    with different numbers of independent instructions, with and without dependency-breaking
    instructions, with and without a loop, and (in some cases) with padding.

    A description of all experiments is appended to htmlReports.

    Returns a TPResult, or None if IACA failed or if no throughput could be determined.
    """
    configs = getTPConfigs(instrNode, useDistinctRegs, useIndexedAddr)

    minTP = sys.maxsize
    minTP_loop = sys.maxsize
    minTP_noLoop = sys.maxsize
    minTP_noDepBreaking_noLoop = sys.maxsize
    minTP_single = sys.maxsize
    ports_dict = {}
    all_used_ports = set()

    if useIACA:
        config = configs[0] # consider only first config as IACA does not seem to consider different values in registers

        instrList = [x.asm for x in config.independentInstrs]
        for ic in sorted(set([1, len(instrList)])):
            if len(instrList) > 1: htmlReports.append('<h3>With ' + str(ic) + ' independent instruction' + ('s' if ic>1 else '') + '</h3>\n')
            if ic > 1: htmlReports.append('<hr>\n')
            for useDepBreakingInstrs in [False, True]:
                if useDepBreakingInstrs:
                    if not config.depBreakingInstrs: continue
                    instrStr = ";".join([i+';'+config.depBreakingInstrs for i in instrList[0:ic]])
                    htmlReports.append('<h4>With additional dependency-breaking instructions</h4>\n')
                else:
                    instrStr = ";".join(instrList[0:ic])

                createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, instrStr)
                try:
                    subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o'])
                    iaca_out = subprocess.check_output(iacaCMDLine + ['/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT).decode()
                except subprocess.CalledProcessError as e:
                    logging.warning('Error: ' + e.output.decode())
                    htmlReports.append('<pre>' + e.output.decode() + '</pre>\n')
                    if minTP != sys.maxsize:
                        continue # on SNB, IACA 2.2 crashes on only some (larger) inputs
                    else:
                        return None

                htmlReports.append('<pre>' + iaca_out + '</pre>\n')

                if not iaca_out or ' ! ' in iaca_out or ' X ' in iaca_out or ' 0X ' in iaca_out or not 'Total Num Of Uops' in iaca_out:
                    print('IACA error')
                    return None

                # the throughput is the third whitespace-separated token on the fourth line of the IACA output
                cycles = float(iaca_out.split('\n')[3].split()[2])
                cycles = cycles/ic
                minTP = min(minTP, cycles)
                if not useDepBreakingInstrs: minTP_noDepBreaking_noLoop = min(minTP_noDepBreaking_noLoop, cycles)

                # NOTE(review): the uop and port values are taken from the run with a single instruction and without
                # dependency-breaking instructions; this nesting was reconstructed -- confirm against upstream
                if ic == 1 and not useDepBreakingInstrs:
                    minTP_single = min(minTP_single, cycles)

                    unfused_uops_line = iaca_out.split('\n')[-2]
                    unfused_uops = int(unfused_uops_line.split()[4])//ic

                    ports_line = iaca_out.split('\n')[-3]
                    fused_uops = '^' in ports_line.split()[1]

                    num_ports = re.search(r'\| Port \|.*', iaca_out).group(0).count('|')-2

                    for p in range(0, num_ports):
                        portCol = ports_line.split('|')[p+2].split()
                        if portCol:
                            usage = float(portCol[0])
                            ports_dict[str(p)] = usage
                            if usage > 0:
                                all_used_ports.add(str(p))
                        else:
                            ports_dict[str(p)] = 0.0

                    # a second value in the column of port 0 is interpreted as the number of divider cycles
                    port0 = ports_line.split('|')[2].split()
                    if len(port0)>1:
                        divCycles = int(float(port0[1]))
                    else:
                        divCycles = 0

        return TPResult(minTP, minTP, minTP, minTP_noDepBreaking_noLoop, minTP_single, unfused_uops, fused_uops, None, None, divCycles, 0, False, None, config,
                        ports_dict, all_used_ports)
    else:
        uops = None
        uopsFused = None
        uopsMITE = None
        uopsMS = None
        divCycles = None
        ILD_stalls = None
        complexDec = False
        # number of other instr. requiring the simple decoder that can be decoded in the same cycle; only applicable for instr. that require the complex decoder
        nAvailableSimpleDecoders = None
        # these instructions are only measured with a small unroll count and without a loop
        smallUnrollOnly = instrNode.attrib['iclass'] in ['RDRAND', 'RDSEED', 'WBINVD'] or instrNode.attrib['category'] in ['IO', 'IOSTRINGOP']
        for config in configs:
            if config.note: htmlReports.append('<h2>' + config.note + '</h2>\n')

            instrIList = config.independentInstrs
            instrLen = getCodeLength(instrIList[0].asm)
            for ic in sorted(set([1, min(4, len(instrIList)), min(8, len(instrIList)), len(instrIList)])):
                # for very slow instructions (> 100 cycles), only the experiments with a single instruction are performed
                if ic > 1 and minTP_noLoop < sys.maxsize and minTP_loop < sys.maxsize and minTP_noLoop > 100 and minTP_loop > 100: break

                if len(instrIList) > 1: htmlReports.append('<h3 style="margin-left: 25px">With ' + str(ic) + ' independent instruction' + ('s' if ic>1 else '') + '</h3>\n')
                htmlReports.append('<div style="margin-left: 50px">\n')

                init = list(chain.from_iterable(i.regMemInit for i in instrIList[0:ic])) + config.init

                for useDepBreakingInstrs in ([False, True] if config.depBreakingInstrs else [False]):
                    if ic > 1 and minTP_noLoop < sys.maxsize and minTP_loop < sys.maxsize and minTP_noLoop > 100 and minTP_loop > 100: break

                    depBreakingInstrs = ''
                    if useDepBreakingInstrs:
                        depBreakingInstrs = config.depBreakingInstrs
                        htmlReports.append('<h4>With additional dependency-breaking instructions</h4>\n')

                    for repType in ['unrollOnly', 'loopSmall', 'loopBig']:
                        if ic > 1 and minTP_noLoop < sys.maxsize and minTP_loop < sys.maxsize and minTP_noLoop > 100 and minTP_loop > 100: break

                        # '' stands for no padding; uopsMITE and uopsMS are known here only after an earlier 'unrollOnly' run has set them
                        paddingTypes = ['']
                        if ((repType != 'unrollOnly') and (uopsMITE is not None) and (not uopsMS) and (not 'RIP' in config.preInstrCode)
                                and ((math.ceil(32.0/instrLen) * uopsMITE > 18) or any(imm in instrNode.attrib['string'] for imm in ['I16', 'I32', 'I64']))):
                            if (instrNode.attrib.get('vex', '') != '') or (instrNode.attrib.get('evex', '') != '') or (instrNode.attrib.get('high8', '') != ''):
                                paddingTypes.append('long NOPs')
                            else:
                                paddingTypes.append('redundant prefixes')

                        for paddingType in paddingTypes:
                            # an lfence is added for measuring DIV_CYCLES accurately
                            for addLfence in ([False, True] if isDivOrSqrtInstr(instrNode) and (ic == 1) and (repType == 'unrollOnly') else [False]):
                                instrStr = ''
                                for i, instr in enumerate(instrIList[0:ic]):
                                    instrStr += depBreakingInstrs + ';' + config.preInstrCode + ';'
                                    if paddingType == 'redundant prefixes':
                                        nPrefixes = max(1, 8 - instrLen) if (not useDepBreakingInstrs and not config.preInstrCode) else (14 - instrLen)
                                        instrStr += '.byte ' + ','.join(['0x40'] * nPrefixes) + ';' # 'empty' REX prefixes
                                    instrStr += instr.asm + ';'
                                    if paddingType == 'long NOPs' and ((i % 4 == 3) or (i == ic - 1)):
                                        instrStr += '.byte 0x66,0x66,0x66,0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00;' # 15-Byte NOP

                                # addLfence can only be True if ic == 1, so there is exactly one instruction in instrStr in this case
                                if addLfence:
                                    instrStr += 'lfence;'

                                if repType == 'unrollOnly':
                                    unrollCount = int(round(500/ic+49, -2)) # should still fit in the icache
                                    if smallUnrollOnly:
                                        unrollCount = 10
                                    loopCount = 0
                                else:
                                    # we test with a small loop body so that uops may be delivered from the loop stream detector (LSD)
                                    # we also test with a larger loop body to minimize potential overhead from the loop itself
                                    if smallUnrollOnly:
                                        continue
                                    unrollCount = max(1, int(round(10.0/ic)))
                                    if repType == 'loopSmall':
                                        loopCount = 1000
                                    else:
                                        loopCount = 100
                                        unrollCount *= 10
                                    # NOTE(review): reconstructed as belonging to the loop case only -- confirm against upstream
                                    if minTP < sys.maxsize and minTP > 100:
                                        unrollCount = 1
                                        loopCount = 10

                                htmlReports.append('<h4>')
                                if loopCount > 0:
                                    htmlReports.append('With loop_count=' + str(loopCount) + (',' if paddingType else ' and') + ' unroll_count=' + str(unrollCount))
                                else:
                                    htmlReports.append('With unroll_count=' + str(unrollCount) + (',' if paddingType else ' and') + ' no inner loop')
                                if paddingType:
                                    htmlReports.append(', and padding (' + paddingType + ')')
                                htmlReports.append('</h4>\n')

                                htmlReports.append('<ul>\n')
                                result = runExperiment(instrNode, instrStr, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=(loopCount>0),
                                                       htmlReports=htmlReports)
                                htmlReports.append('</ul>\n')

                                cycles = fancyRound(result['Core cycles']/ic)

                                #invalid = False
                                #if any('PORT' in e for e in result):
                                #   maxPortUops = max(v/(len(e)-9) for e,v in result.items() if e.startswith('UOPS_PORT') and not '4' in e)
                                #   if maxPortUops * .98 > result['Core cycles']:
                                #      print('More uops on ports than cycles, uops: {}, cycles: {}'.format(maxPortUops, result['Core cycles']))
                                #      #invalid = True

                                if not addLfence:
                                    minTP = min(minTP, cycles)
                                    if repType == 'unrollOnly':
                                        minTP_noLoop = min(minTP_noLoop, cycles)
                                        if not useDepBreakingInstrs:
                                            minTP_noDepBreaking_noLoop = min(minTP_noDepBreaking_noLoop, cycles)
                                            # NOTE(review): reconstructed as only counting ports in runs without dependency-breaking
                                            # instructions -- confirm against upstream
                                            for p, i in result.items():
                                                if (i/ic > .1) and (('UOPS_PORT' in p) or ('FpuPipeAssignment.Total' in p)):
                                                    # strip the prefix of the counter name (10 and 23 characters, respectively)
                                                    all_used_ports.add(p[10:] if ('UOPS_PORT' in p) else p[23:])
                                    else:
                                        minTP_loop = min(minTP_loop, cycles)

                                    # the uop counts and port usages are taken from a run with a single instruction (without loop and
                                    # dependency-breaking instr.) that achieves the best throughput so far
                                    if ic == 1 and (minTP == sys.maxsize or cycles == minTP) and not useDepBreakingInstrs and repType == 'unrollOnly':
                                        minConfig = config
                                        minTP_single = min(minTP_single, cycles)

                                        if isIntelCPU():
                                            ports_dict = {p[10:]: i for p, i in result.items() if 'UOPS_PORT' in p}
                                        elif isAMDCPU() and not instrNode.attrib['extension'] == 'BASE':
                                            # We ignore BASE instructions, as they sometimes wrongly count floating point uops
                                            ports_dict = {p[23:]: i for p, i in result.items() if 'FpuPipeAssignment.Total' in p}

                                        uops = int(result['UOPS']+.2)
                                        if 'RETIRE_SLOTS' in result:
                                            uopsFused = int(result['RETIRE_SLOTS']+.2)

                                        if 'UOPS_MITE' in result:
                                            uopsMITE = int(result['UOPS_MITE']+.2)

                                        if 'UOPS_MS' in result:
                                            uopsMS = int(result['UOPS_MS']+.2)

                                        if 'ILD_STALL.LCP' in result:
                                            ILD_stalls = int(result['ILD_STALL.LCP'])

                                        if (not config.preInstrCode) and (((uopsMITE is not None) and (uopsMITE > 1)) or ((uopsMS is not None) and (uopsMS > 0)) or
                                                (result.get('INST_DECODED.DEC0', 0) > .05) or ((result.get('UOPS_MITE>=1', 0) > .95) and (not isBranchInstr(instrNode)))):
                                            # ToDo: preInstrs
                                            complexDec = True

                                        if complexDec and ('UOPS_MITE>=1' in result):
                                            # add NOPs until the 'UOPS_MITE>=1' counter increases by (about) one
                                            for nNops in count(1):
                                                nopStr = str(nNops) + ' NOP' + ('s' if nNops > 1 else '')
                                                htmlReports.append('<h4>With unroll_count=' + str(unrollCount) +', no inner loop, and ' + nopStr + '</h4>\n')
                                                htmlReports.append('<ul>\n')
                                                resultNops = runExperiment(instrNode, instrStr + ('; nop' * nNops), init=init, unrollCount=unrollCount, htmlReports=htmlReports)
                                                htmlReports.append('</ul>\n')
                                                if resultNops['UOPS_MITE>=1'] > result['UOPS_MITE>=1'] +.95:
                                                    nAvailableSimpleDecoders = nNops - 1
                                                    break
                                else:
                                    if 'DIV_CYCLES' in result:
                                        divCyclesTmp = int(result['DIV_CYCLES']+.2)
                                        divCycles = min(divCycles, divCyclesTmp) if (divCycles is not None) else divCyclesTmp

                # closes the <div> that was opened for this number of independent instructions
                htmlReports.append('</div>')

        if minTP < sys.maxsize:
            return TPResult(minTP, minTP_loop, minTP_noLoop, minTP_noDepBreaking_noLoop, minTP_single, uops, uopsFused, uopsMITE, uopsMS, divCycles, ILD_stalls,
                            complexDec, nAvailableSimpleDecoders, minConfig, ports_dict, all_used_ports)
|
|
|
|
|
|
def canMacroFuse(flagInstrNode, branchInstrNode, htmlReports):
    """Return True iff the measured number of retire slots for flagInstrNode followed by branchInstrNode is 1.

    The two instructions are executed back to back with runExperiment(); a description of the
    experiment is appended to htmlReports.
    """
    flagInstrInstance = getInstrInstanceFromNode(flagInstrNode)
    branchInstrInstance = getInstrInstanceFromNode(branchInstrNode)

    init = flagInstrInstance.regMemInit
    code = flagInstrInstance.asm + ';' + branchInstrInstance.asm

    htmlReports.append('<div style="margin-left: 25px">\n')
    htmlReports.append('<h3>With ' + branchInstrNode.attrib['string'] + '</h3>\n')
    htmlReports.append('<ul>\n')
    result = runExperiment(None, code, init=init, unrollCount=100, basicMode=True, htmlReports=htmlReports)
    htmlReports.append('</ul>\n')
    htmlReports.append('</div>\n')

    # if the two instructions are fused, they occupy only one retire slot
    uops = int(result['RETIRE_SLOTS']+.1)
    return (uops == 1)
|
|
|
|
|
|
# dict from a name of an instruction (variant) to its latency in cycles; filled by getBasicLatencies()
basicLatency = {}


def _coreCycles(result):
    # returns the measured number of core cycles as an int (adds .2 and truncates)
    return int(result['Core cycles'] + .2)


def getBasicLatencies(instrNodeList):
    """Measure the latencies of a number of basic instructions and store them in basicLatency.

    AVX and AVX-512 instructions are only measured if instrNodeList contains a node with
    iclass 'VANDPS' and a node with extension 'AVX512EVEX', respectively.

    The program is terminated (sys.exit()) if the latencies of TEST and SETcc cannot be
    determined, or if the masked VMOVUPS does not have exactly one uop.
    """
    andResult = runExperiment(instrNodeDict['AND_21 (R64, R64)'], 'AND RAX, RBX')
    basicLatency['AND'] = _coreCycles(andResult)

    orResult = runExperiment(instrNodeDict['OR_09 (R64, R64)'], 'OR RAX, RBX')
    basicLatency['OR'] = _coreCycles(orResult)

    xorResult = runExperiment(instrNodeDict['XOR_31 (R64, R64)'], 'XOR RAX, RBX')
    basicLatency['XOR'] = _coreCycles(xorResult)

    movR8hR8hResult = runExperiment(instrNodeDict['MOV_88 (R8h, R8h)'], 'MOV AH, AH')
    basicLatency['MOV_R8h_R8h'] = _coreCycles(movR8hR8hResult)

    movR8hResult = runExperiment(None, 'MOV AH, AL')
    basicLatency['MOV_R8h_R8l'] = max(1, _coreCycles(movR8hResult))

    for t in [16, 32, 64]:
        for s in [8,16,32]:
            if s >= t:
                continue
            movsxResult = runExperiment(None, 'MOVSX {}, {}'.format(regToSize('RAX', t), regToSize('RAX', s)))
            basicLatency['MOVSX_R{}_R{}'.format(t,s)] = _coreCycles(movsxResult)
        if t < 64:
            # the dependency chain is closed with a MOV from a low to a high 8-bit register, whose latency is then subtracted
            movsxResult = runExperiment(None, 'MOVSX {}, AH; MOV AH, BL'.format(regToSize('RBX', t)))
            basicLatency['MOVSX_R{}_R8h'.format(t)] = _coreCycles(movsxResult) - basicLatency['MOV_R8h_R8l']

    cmcResult = runExperiment(instrNodeDict['CMC'], 'CMC')
    basicLatency['CMC'] = _coreCycles(cmcResult)

    movqResult = runExperiment(instrNodeDict['MOVQ_0F6F (MM, MM)'], 'MOVQ MM0, MM0')
    basicLatency['MOVQ'] = _coreCycles(movqResult)

    for flag in STATUSFLAGS_noAF:
        testSetResult = runExperiment(None, 'TEST AL, AL; SET' + flag[0] + ' AL')
        # we additionally test with a nop, as the result may be higher than the actual latency (e.g., on ADL-P), probably due to non-optimal port assignments
        testSetResultNop = runExperiment(None, 'TEST AL, AL; SET' + flag[0] + ' AL; NOP')
        testSetCycles = min(_coreCycles(testSetResult), _coreCycles(testSetResultNop))

        if testSetCycles == 2:
            # a total of 2 cycles is only possible if both TEST and SETcc have a latency of 1
            basicLatency['TEST'] = 1
        elif arch in ['BNL', 'SLM', 'AMT', 'GLM', 'GLP', 'TRM']:
            # according to the Optimization Manual (June 2021)
            basicLatency['TEST'] = 1
        else:
            print('Latencies of TEST and SET' + flag[0] + ' could not be determined')
            sys.exit()

        basicLatency['SET' + flag[0]] = testSetCycles - basicLatency['TEST']

        testSetHigh8Result = runExperiment(None, 'TEST AH, AH; SET' + flag[0] + ' AH')
        testSetHigh8Cycles = _coreCycles(testSetHigh8Result)
        if testSetHigh8Cycles == 2:
            basicLatency['SET' + flag[0] + '_R8h'] = 1
            basicLatency['TEST_R8h_R8h'] = 1

        testCmovResult = runExperiment(None, 'TEST RAX, RAX; CMOV' + flag[0] + ' RAX, RAX')
        testCmovResultNop = runExperiment(None, 'TEST RAX, RAX; CMOV' + flag[0] + ' RAX, RAX; NOP')
        basicLatency['CMOV' + flag[0]] = min(_coreCycles(testCmovResult), _coreCycles(testCmovResultNop)) - basicLatency['TEST']

    for instr in ['ANDPS', 'ANDPD', 'ORPS', 'ORPD', 'PAND', 'POR']:
        result = runExperiment(instrNodeDict[instr + ' (XMM, XMM)'], instr + ' XMM1, XMM1')
        basicLatency[instr] = _coreCycles(result)

    for instr in ['PSHUFD', 'SHUFPD']:
        result = runExperiment(instrNodeDict[instr + ' (XMM, XMM, I8)'], instr + ' XMM1, XMM1, 0')
        basicLatency[instr] = _coreCycles(result)

    if any(x.findall('[@iclass="VANDPS"]') for x in instrNodeList):
        for instr in ['VANDPS', 'VANDPD', 'VORPS', 'VORPD', 'VPAND', 'VPOR']:
            result = runExperiment(instrNodeDict[instr + ' (XMM, XMM, XMM)'], instr + ' XMM1, XMM1, XMM1')
            basicLatency[instr] = _coreCycles(result)

        for instr in ['VSHUFPD']:
            result = runExperiment(instrNodeDict[instr + ' (XMM, XMM, XMM, I8)'], instr + ' XMM1, XMM1, XMM1, 0')
            basicLatency[instr] = _coreCycles(result)

        for instr in ['VPSHUFD']:
            result = runExperiment(instrNodeDict[instr + ' (XMM, XMM, I8)'], instr + ' XMM1, XMM1, 0')
            basicLatency[instr] = _coreCycles(result)

    if any(x.findall('[@extension="AVX512EVEX"]') for x in instrNodeList):
        kmovq_result = runExperiment(instrNodeDict['KMOVQ_VEX (K, K)'], 'KMOVQ K1, K1')
        basicLatency['KMOVQ'] = _coreCycles(kmovq_result)

        vpandd_result = runExperiment(instrNodeDict['VPANDD (ZMM, ZMM, ZMM)'], 'VPANDD ZMM0, ZMM0, ZMM0')
        basicLatency['VPANDD'] = _coreCycles(vpandd_result)

        for regType in ['XMM', 'YMM', 'ZMM']:
            vmovups_result = runExperiment(instrNodeDict['VMOVUPS ({0}, K, {0})'.format(regType)], 'VMOVUPS ' + regType + '1 {k1}, ' + regType + '1')
            vmovups_cycles = _coreCycles(vmovups_result)
            vmovups_uops = int(vmovups_result['UOPS'] + .2)
            basicLatency['VMOVUPS_' + regType + '_' + 'K'] = vmovups_cycles

            if not vmovups_uops == 1:
                print('VMOVUPS must have exactly 1 uop')
                sys.exit()

            # the masked VMOVUPS closes the dependency chain from the mask register back to the vector register; its latency is subtracted
            vpmovq2m_result = runExperiment(instrNodeDict['VPMOVQ2M (K, ' + regType + ')'],
                                            'VPMOVQ2M K1, ' + regType + '1; VMOVUPS ' + regType + '1 {k1}, ' + regType + '1')
            basicLatency['VPMOVQ2M_'+regType] = _coreCycles(vpmovq2m_result) - vmovups_cycles

            vptestnmq_result = runExperiment(instrNodeDict['VPTESTNMQ (K, K, {0}, {0})'.format(regType)],
                                             'VPTESTNMQ K1 {K1}, ' + regType + '1, ' + regType + '1; VMOVUPS ' + regType + '1 {k1}, ' + regType + '1')
            basicLatency['VPTESTNMQ_'+regType] = _coreCycles(vptestnmq_result) - vmovups_cycles

    for memWidth in [8, 16, 32, 64]:
        reg = regToSize('R12', memWidth)
        # latency of a load, followed by a chain of 10 MOVSX instructions, followed by a store to the same address
        mov_10movsx_mov_result = runExperiment(None, 'mov ' + reg + ', [r14];' + ';'.join(10*['MOVSX R12, R12w']) + '; mov [r14], ' + reg , unrollCount=100)
        basicLatency['MOV_10MOVSX_MOV_'+str(memWidth)] = _coreCycles(mov_10movsx_mov_result)

    print('Basic Latencies: ' + str(basicLatency))
|
|
|
|
# Returns a dict {opNode: instr}, s.t. opNode is both read and written (with the exception of ignoreOperand, if specified), and instr breaks the
# dependency
def getDependencyBreakingInstrs(instrNode, opRegDict, ignoreOperand = None):
    """Return a dict from operand nodes of instrNode to dependency-breaking instructions.

    Considered are register and memory operands that are both read and written, and flag
    operands that are written and either also read or only conditionally written.

    opRegDict maps operand indices to the registers used for these operands. Register
    operands that are neither in opRegDict nor suppressed are skipped, as their register is
    not known. ignoreOperand is an operand node for which no instruction should be generated.
    """
    depBreakingInstrs = dict()
    for opNode in instrNode.findall('./operand[@type="reg"][@r="1"][@w="1"]'):
        if opNode == ignoreOperand: continue

        xtype = opNode.attrib.get('xtype', '')
        opI = int(opNode.attrib['idx'])
        if opI in opRegDict:
            reg = opRegDict[opI]
        elif opNode.attrib.get('suppressed', '0') == '1':
            reg = opNode.text
        else:
            # the register is unknown; without this branch, reg would be undefined or still refer to the register of the previous operand
            continue
        regPrefix = re.sub(r'\d', '', reg)
        if reg in GPRegs:
            if reg not in globalDoNotWriteRegs|memRegs:
                depBreakingInstrs[opNode] = 'MOV ' + reg + ', 0' # don't use XOR as this would also break flag dependencies
            elif reg in ['RSP', 'RBP']:
                depBreakingInstrs[opNode] = 'MOV ' + reg + ', R14'
        elif xtype.startswith('f'):
            # floating-point registers are overwritten with the contents of register 15 of the same type
            if isAVXInstr(instrNode):
                depBreakingInstrs[opNode] = 'VMOVUPD ' + reg + ', ' + regPrefix + '15'
            else:
                depBreakingInstrs[opNode] = 'MOVUPD ' + reg + ', ' + regPrefix + '15'
        elif regPrefix in ['XMM', 'YMM', 'ZMM'] and isAVXInstr(instrNode):
            depBreakingInstrs[opNode] = 'VXORPS ' + reg + ', ' + reg + ', ' + reg
        elif 'MM' in regPrefix:
            depBreakingInstrs[opNode] = 'PXOR ' + reg + ', ' + reg
    for opNode in instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"]'):
        if opNode == ignoreOperand: continue

        memWidth = int(opNode.attrib['width'])
        if memWidth <= 64:
            depBreakingInstrs[opNode] = 'MOV ' + opNode.attrib['memory-prefix'] + ' [' + getAddrReg(instrNode, opNode) + '], 0'
        else:
            depBreakingInstrs[opNode] = 'MOVUPS [' + getAddrReg(instrNode, opNode) + '], XMM15'
    for opNode in instrNode.findall('./operand[@type="flags"][@w="1"]'):
        if opNode == ignoreOperand: continue
        if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue

        # only flag operands that contain at least one of the status flags in STATUSFLAGS_noAF are considered
        if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue
        depBreakingInstrs[opNode] = 'TEST R15, R15'

    return depBreakingInstrs
|
|
|
|
|
|
# Builds an assembler code string (instructions separated by ';') with dependency breaking instructions for suppressed
# operands (and operands with only one possible register) of instrNode.
# LEAVE gets no dependency breaking instructions; for XLAT, 'XOR RAX, RAX' is returned.
def getDependencyBreakingInstrsForSuppressedOperands(instrNode):
    iclass = instrNode.attrib['iclass']
    if iclass == 'LEAVE':
        return ''
    if iclass == 'XLAT':
        return 'XOR RAX, RAX'

    def hasFixedReg(node):
        # explicit operands that allow more than one register are not considered
        return not (node.attrib.get('suppressed', '0') == '0' and ',' in node.text)

    asmParts = []
    xorEmitted = False
    writtenRegNodes = instrNode.findall('./operand[@type="reg"][@w="1"]')

    for readNode in instrNode.findall('./operand[@type="reg"][@r="1"]'):
        if not hasFixedReg(readNode):
            continue

        reg = readNode.text
        if reg not in GPRegs or reg in globalDoNotWriteRegs|specialRegs|memRegs:
            continue

        # a dependency only needs to be broken if the instruction also writes (a part of) the same register
        if any(regTo64(reg) == regTo64(wNode.text) for wNode in writtenRegNodes if hasFixedReg(wNode)):
            # we use the corresponding 64-bit register, as dependency breaking doesn't seem to work for reg sizes <= 16
            asmParts.append('XOR ' + regTo64(reg) + ', ' + regTo64(reg))
            xorEmitted = True

    for memNode in instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"][@suppressed="1"]'):
        asmParts.append('MOV qword ptr [' + memNode.attrib['base'] + '], 0')

    # a TEST is only added if no XOR was emitted above
    if not xorEmitted:
        for flagsNode in instrNode.findall('./operand[@type="flags"][@w="1"]'):
            # on some CPUs, instructions that write flags conditionally also read the flags
            flagsAreRead = flagsNode.attrib.get('r', '') == '1' or flagsNode.attrib.get('conditionalWrite', '') == '1'
            if flagsAreRead and any(('flag_'+f in flagsNode.attrib) for f in STATUSFLAGS_noAF):
                asmParts.append('TEST R15, R15')

    return ';'.join(asmParts)
|
|
|
|
|
|
# Builds the latency measurement configurations for division and square root instructions.
#   instrNode: XML instruction node
#   opNode1:   operand node at which the measured dependency starts (getLatConfigLists passes its startNode)
#   opNode2:   operand node at which the measured dependency ends (getLatConfigLists passes its targetNode)
#   cRep:      number of repetitions of the filler instruction in the dependency chain
# Returns a list with two LatConfigLists: one in which the registers/memory are initialized with the "max" constants, and
# one in which they are initialized with the "min" constants. For iclasses that are not handled by any of the three
# branches below, the function falls through and implicitly returns None.
#constants are from Agner Fog's scripts
def getDivLatConfigLists(instrNode, opNode1, opNode2, cRep):
    if instrNode.attrib['iclass'] in ['DIV', 'IDIV']:
        # the last operand that is not suppressed is used as the divisor
        for op in instrNode.iter('operand'):
            if op.attrib.get('suppressed', '0') == '0':
                divisorNode = op
        memDivisor = divisorNode.attrib['type'] == 'mem'
        width = int(divisorNode.attrib['width'])

        if memDivisor:
            instrI = getInstrInstanceFromNode(instrNode)
        else:
            # register divisors are placed in (a part of) RBX, unless the operand allows BH
            divisorReg = 'BH' if ('BH' in divisorNode.text) else regToSize('RBX', width)
            instrI = getInstrInstanceFromNode(instrNode, opRegDict={int(divisorNode.attrib['idx']):divisorReg})

        # operand values for the configuration that is labeled 'slow division' below
        if width == 8:
            maxRAX = '13057'
            maxRDX = '0'
            maxDivisor = '123'
        elif width == 16:
            maxRAX = '133'
            maxRDX = '0x343a'
            maxDivisor = '0x75e6'
        elif width == 32:
            maxRAX = '133'
            maxRDX = '0x343a9ed7'
            maxDivisor = '0x75e6e44f'
        elif width == 64:
            maxRAX = '133'
            maxRDX = '0x343a9ed744556677'
            maxDivisor = '0x75e6e44fccddeeff'

        # operand values for the configuration that is labeled 'fast division' below
        minRAX = '0'
        minRDX = '0'
        minDivisor = '1'

        configLists = []
        for RAX, RDX, divisor in [(minRAX, minRDX, minDivisor), (maxRAX, maxRDX, maxDivisor)]:
            configList = LatConfigList()
            configLists.append(configList)

            config = LatConfig(instrI)

            if RAX == maxRAX:
                config.notes.append('slow division')
            else:
                config.notes.append('fast division')

            # registers that hold the constants, so that the chain instructions can restore the operand values
            immReg = {'RAX': 'R8', 'RDX': 'R9', 'divisor': 'RCX'}
            config.init = ['MOV {}, {}'.format(immReg['RAX'], RAX),
                           'MOV {}, {}'.format(immReg['RDX'], RDX),
                           'MOV {}, {}'.format(immReg['divisor'], divisor)]

            if memDivisor:
                config.init += ['MOV [R14], ' + immReg['divisor']]
            else:
                config.init += ['MOV {}, {}'.format(divisorReg, regToSize(immReg['divisor'], width))]
            config.init += ['MOV RAX, ' + immReg['RAX'],
                            'MOV RDX, ' + immReg['RDX']]

            chainInstrs = ''
            chainLatency = 0
            immInstr = ''
            for opNode in instrNode.iter('operand'):
                if opNode.attrib['type'] == 'flags': continue
                if opNode == opNode1:
                    # opNode is the start operand: build a chain from opNode2 back to it that ends with the constant value
                    if opNode == divisorNode:
                        if memDivisor:
                            reg2Size = min(getRegSize(opNode2.text), 32)
                            chainInstrs = 'MOVSX R12, ' + regToSize(opNode2.text, reg2Size) + '; '
                            chainInstrs += ('XOR R14, R12; ') * cRep # cRep is even (see getLatConfigLists), so the XORs cancel each other out
                            chainLatency = basicLatency['MOVSX_R64_R' + str(reg2Size)] + basicLatency['XOR'] * cRep
                        else:
                            chainInstrs = 'AND {0}, {1}; AND {0}, {2}; OR {0}, {2}; '.format(divisorReg, regToSize(opNode2.text, getRegSize(divisorReg)),
                                                                                               regToSize(immReg['divisor'], getRegSize(divisorReg)))
                            chainInstrs += 'OR {0}, {0}; '.format(divisorReg) * cRep
                            chainLatency = basicLatency['AND'] * 2 + basicLatency['OR'] * (cRep+1)
                    else:
                        chainInstrs = 'AND {0}, {1}; OR {0}, {1}; '.format(opNode.text, regToSize(immReg[regTo64(opNode.text)], getRegSize(opNode.text)))
                        chainInstrs += 'OR {0}, {0}; '.format(opNode.text) * cRep
                        chainLatency = basicLatency['AND'] + basicLatency['OR'] * (cRep+1)

                    if opNode != opNode2:
                        # make the start operand depend on the target operand
                        chainInstrs = 'AND ' + opNode.text + ', ' + opNode2.text + '; ' + chainInstrs
                        chainLatency += basicLatency['AND']

                elif opNode != divisorNode:
                    # operands that are not part of the chain are reset to their constant values
                    immInstr += 'MOV ' + opNode.text + ', ' + regToSize(immReg[regTo64(opNode.text)], getRegSize(opNode.text)) + ';'

            config.chainInstrs = chainInstrs + '; ' + immInstr
            config.chainLatency = chainLatency
            configList.append(config)
        return configLists
    elif instrNode.attrib['iclass'] in ['DIVSS', 'DIVPS', 'DIVSD', 'DIVPD', 'VDIVSS', 'VDIVPS', 'VDIVSD', 'VDIVPD', 'VDIVSH', 'VDIVPH']:
        memDivisor = len(instrNode.findall('./operand[@type="mem"]'))>0
        # last character of the iclass: 'S' (single), 'D' (double), or 'H' (half)
        dataType = instrNode.attrib['iclass'][-1]
        # the chain instructions only exist with the suffixes 'S' and 'D'; 'D' is also used for 'H'
        chainDataType = ('S' if (dataType == 'S') else 'D')

        if dataType == 'D':
            maxDividend = '0x429da724b687da66' # 8.1509281715106E12
            maxDivisor = '0x3ff33e97f934078b' # 1.20278165192619
            minDividend = '0x3ff0000000000000' # 1.0
            minDivisor = '0x3ff0000000000000' # 1.0
        elif dataType == 'S':
            maxDividend = '0x54ed392654ed3926' # 8.15093E12 in high and low 32-bit
            maxDivisor = '0x3f99f4c03f99f4c0' # 1.20278 in high and low 32-bit
            minDividend = '0x3f8000003f800000' # 1.0 in high and low 32-bit
            minDivisor = '0x3f8000003f800000' # 1.0 in high and low 32-bit
        else: # dataType == 'H'
            maxDividend = '0x2769276927692769' # 0.02895 in all 4 16-bit blocks
            maxDivisor = '0x3CCF3CCF3CCF3CCF' # 1.203 in all 4 16-bit blocks
            minDividend = '0x3C003C003C003C00' # 1.0 in all 4 16-bit blocks
            minDivisor = '0x3C003C003C003C00' # 1.0 in all 4 16-bit blocks

        configLists = []
        for dividend, divisor in [(maxDividend, maxDivisor), (minDividend, minDivisor)]:
            configList = LatConfigList()
            configLists.append(configList)

            regType = 'XMM'
            if 'YMM' in instrNode.attrib['iform']: regType = 'YMM'
            if 'ZMM' in instrNode.attrib['iform']: regType = 'ZMM'

            # the divisor pattern is stored at [R14], and the dividend pattern at [R14+64], in 8-byte steps
            # (presumably getRegSize returns the size in bits, so that //8 yields the number of bytes -- TODO confirm)
            init = ['MOV RAX, ' + dividend]
            init += ['MOV RBX, ' + divisor]
            for i in range(0, getRegSize(regType)//8, 8): init += ['MOV [R14+' + str(i) + '], RBX']
            for i in range(64, 64+getRegSize(regType)//8, 8): init += ['MOV [R14+' + str(i) + '], RAX']

            if instrNode.attrib['iclass'] in ['DIVSS', 'DIVPS', 'DIVSD', 'DIVPD']:
                # XMM1/XMM2 keep copies of the dividend/divisor; XMM3/XMM4 are the operands of the instruction
                init += ['MOVUP' + chainDataType + ' XMM1, [R14+64]']
                init += ['MOVUP' + chainDataType + ' XMM2, [R14]']
                init += ['MOVUP' + chainDataType + ' XMM3, [R14+64]']
                init += ['MOVUP' + chainDataType + ' XMM4, [R14]']

                instrI = getInstrInstanceFromNode(instrNode, opRegDict={1:'XMM3', 2:'XMM4'})

                if opNode1 == opNode2:
                    if dividend == minDividend:
                        # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                        instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={1:'XMM3', 2:'XMM3'})
                        config = LatConfig(instrISameReg, init=init)
                        configList.append(config)

                    config = LatConfig(instrI, init=init)
                    if dividend == maxDividend:
                        config.chainInstrs = 'ORP{0} XMM3, XMM1; ANDP{0} XMM3, XMM1; '.format(chainDataType)
                        config.chainLatency = basicLatency['ORP' + chainDataType] + basicLatency['ANDP' + chainDataType]
                    config.chainInstrs += 'ORP{} XMM3, XMM3;'.format(chainDataType) * cRep
                    config.chainLatency += basicLatency['ORP' + chainDataType] * cRep
                    configList.append(config)
                    configList.isUpperBound = True
                else:
                    if memDivisor:
                        configList.isUpperBound = True
                        # find all other instrs from XMM3 to R12
                        for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'XMM3', 'R12'):
                            if dividend == maxDividend:
                                chainInstrs = chainInstrI.asm + '; MOVUP' + chainDataType + ' XMM3, XMM1; '
                            else:
                                chainInstrs = chainInstrI.asm + '; '
                            chainInstrs += ('XOR R14, R12; ') * cRep
                            # the chain instruction itself is accounted for with a latency of 1
                            chainLatency = 1 + basicLatency['XOR'] * cRep
                            configList.append(LatConfig(instrI, init=init, chainInstrs=chainInstrs, chainLatency=chainLatency))
                    else:
                        if dividend == minDividend:
                            # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                            instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={1:'XMM3', 2:'XMM3'})
                            config = LatConfig(instrISameReg, init=init)
                            configList.append(config)

                        config = LatConfig(instrI, init=init)
                        # NOTE(review): dataType is used here instead of chainDataType; both are equal for the iclasses of this branch
                        if dividend == maxDividend:
                            config.chainInstrs = 'ANDP{0} XMM4, XMM3; MOVUP{0} XMM3, XMM1; ANDP{0} XMM4, XMM2; ORP{0} XMM4, XMM2; '.format(dataType)
                        else:
                            config.chainInstrs = 'ANDP{0} XMM4, XMM3; ANDP{0} XMM4, XMM2; ORP{0} XMM4, XMM2; '.format(dataType)
                        config.chainInstrs += 'ORP{} XMM4, XMM4; '.format(chainDataType) * cRep
                        config.chainLatency = basicLatency['ANDP' + chainDataType] * 2 + basicLatency['ORP' + chainDataType] * (cRep+1)
                        configList.append(config)
                        configList.isUpperBound = True
            else: # instrNode.attrib['iclass'] in ['VDIVSS', 'VDIVPS', 'VDIVSD', 'VDIVPD', 'VDIVSH', 'VDIVPH']:
                nOperands = len(instrNode.findall('./operand'))

                # the *BaseReg registers keep copies of the initial values that are used by the chain instructions
                targetReg = regType + '0'
                dividendBaseReg = regType + '1'
                dividendReg = regType + '2'
                divisorBaseReg = regType + '3'
                divisorReg = regType + '4'

                init += ['VMOVUP' + chainDataType + ' ' + dividendBaseReg + ', [R14+64]']
                init += ['VMOVUP' + chainDataType + ' ' + dividendReg + ', [R14+64]']
                init += ['VMOVUP' + chainDataType + ' ' + divisorBaseReg + ', [R14]']
                init += ['VMOVUP' + chainDataType + ' ' + divisorReg + ', [R14]']

                # operand 1 is the target; the last two operands are the dividend and the divisor
                instrI = getInstrInstanceFromNode(instrNode, opRegDict={1:targetReg, (nOperands-1):dividendReg, nOperands:divisorReg})

                if int(opNode1.attrib['idx']) == nOperands - 1: #dividend
                    if dividend == minDividend:
                        # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                        instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={1:dividendReg, (nOperands-1):dividendReg, nOperands:divisorReg})
                        config = LatConfig(instrISameReg, init=init)
                        configList.append(config)

                    config = LatConfig(instrI, init=init)
                    config.chainInstrs = 'VORP{0} {1}, {2}, {2}; VORP{0} {1}, {1}, {3}; VANDP{0} {1}, {1}, {3}; '.format(chainDataType, dividendReg, targetReg, dividendBaseReg)
                    config.chainInstrs += 'VORP{0} {1}, {1}, {1}; '.format(chainDataType, dividendReg) * cRep
                    config.chainLatency = basicLatency['VORP' + chainDataType] * (cRep+2) + basicLatency['VANDP' + chainDataType]
                    configList.append(config)
                    configList.isUpperBound = True
                else: # divisor
                    if memDivisor:
                        configList.isUpperBound = True
                        # find all other instrs from targetReg to R12
                        for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, targetReg, 'R12'):
                            chainInstrs = chainInstrI.asm + '; ' + ('XOR R14, R12; ') * cRep
                            chainLatency = 1 + basicLatency['XOR'] * cRep
                            configList.append(LatConfig(instrI, init=init, chainInstrs=chainInstrs, chainLatency=chainLatency))
                    else:
                        if divisor == minDivisor:
                            # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                            instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={1:divisorReg, (nOperands-1):dividendReg, nOperands:divisorReg})
                            config = LatConfig(instrISameReg, init=init)
                            configList.append(config)

                        config = LatConfig(instrI, init=init)
                        config.chainInstrs = 'VORP{0} {1}, {2}, {2}; VORP{0} {1}, {1}, {3}; VANDP{0} {1}, {1}, {3}; '.format(chainDataType, divisorReg, targetReg, divisorBaseReg)
                        config.chainInstrs += 'VORP{0} {1}, {1}, {1}; '.format(chainDataType, divisorReg) * cRep
                        config.chainLatency = basicLatency['VORP' + chainDataType] * (cRep+2) + basicLatency['VANDP' + chainDataType]
                        configList.append(config)
                        configList.isUpperBound = True
        return configLists
    elif instrNode.attrib['iclass'] in ['SQRTSS', 'SQRTPS', 'SQRTSD', 'SQRTPD', 'RSQRTSS', 'RSQRTPS', 'VSQRTSS', 'VSQRTPS', 'VSQRTSD', 'VSQRTPD','VRSQRTSS',
                                        'VRSQRTPS', 'VRSQRT14PD', 'VRSQRT14PS', 'VRSQRT14SD', 'VRSQRT14SS', 'VSQRTSH', 'VSQRTPH', 'VRSQRTSH', 'VRSQRTPH']:
        # last character of the iclass: 'S' (single), 'D' (double), or 'H' (half)
        dataType = instrNode.attrib['iclass'][-1]
        chainDataType = ('S' if (dataType == 'S') else 'D')

        if dataType == 'D':
            maxArg = '0x465a61fe1acdc21c' # 8.3610378602352937E30
            minArg = '0x3ff0000000000000' # 1.0
        elif dataType == 'S':
            maxArg = '0x72d30ff172d30ff1' # 8.36104E30 in high and low 32-bit
            minArg = '0x3f8000003f800000' # 1.0 in high and low 32-bit
        else: # dataType == 'H'
            maxArg = '0x1698169816981698' # 0.00161 in all 4 16-bit blocks
            minArg = '0x3C003C003C003C00' # 1.0 in all 4 16-bit blocks

        # the init and chain instructions get a 'V' prefix if the iclass starts with 'V'
        instrPrefix = ''
        if instrNode.attrib['iclass'].startswith('V'): instrPrefix = 'V'

        configLists = []

        for arg in [maxArg, minArg]:
            configList = LatConfigList()
            configLists.append(configList)

            regType = 'XMM'
            if 'YMM' in instrNode.attrib['iform']: regType = 'YMM'
            if 'ZMM' in instrNode.attrib['iform']: regType = 'ZMM'

            # the argument pattern is stored at [R14] in 8-byte steps
            init = ['MOV RAX, ' + arg]
            for i in range(0, getRegSize(regType)//8, 8): init += ['MOV [R14+' + str(i) + '], RAX']

            # sourceBaseReg keeps a copy of the initial argument that is used by the chain instructions
            targetReg = regType + '0'
            sourceBaseReg = regType + '1'
            sourceReg = regType + '2'

            init += [instrPrefix + 'MOVUP' + chainDataType + ' ' + sourceReg + ', [R14]']
            init += [instrPrefix + 'MOVUP' + chainDataType + ' ' + sourceBaseReg + ', [R14]']

            # opNode2 is mapped to the target register, and opNode1 to the source register
            instrI = getInstrInstanceFromNode(instrNode, opRegDict={int(opNode2.attrib['idx']):targetReg, int(opNode1.attrib['idx']): sourceReg})

            if opNode1.attrib['type'] == 'mem':
                configList.isUpperBound = True
                # find all other instrs from targetReg to R12
                for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, targetReg, 'R12'):
                    chainInstrs = chainInstrI.asm + '; ' + ('XOR R14, R12; ') * cRep
                    # the chain instruction itself is accounted for with a latency of 1
                    chainLatency = 1 + basicLatency['XOR'] * cRep
                    configList.append(LatConfig(instrI, init=init, chainInstrs=chainInstrs, chainLatency=chainLatency))
            else:
                if arg == minArg:
                    # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                    instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={int(opNode2.attrib['idx']):sourceReg, int(opNode1.attrib['idx']): sourceReg})
                    config = LatConfig(instrISameReg, init=init)
                    configList.append(config)

                config = LatConfig(instrI, init=init)
                if instrPrefix == 'V':
                    config.chainInstrs = 'VORP{0} {1}, {2}, {2}; VORP{0} {1}, {1}, {3}; VANDP{0} {1}, {1}, {3}; '.format(chainDataType, sourceReg, targetReg, sourceBaseReg)
                    config.chainInstrs += 'VORP{0} {1}, {1}, {1}; '.format(chainDataType, sourceReg) * cRep
                    config.chainLatency = basicLatency['VORP' + chainDataType] * (cRep+2) + basicLatency['VANDP' + chainDataType]
                else:
                    config.chainInstrs = 'ORP{0} {1}, {2}; ORP{0} {1}, {3}; ANDP{0} {1}, {3}; '.format(chainDataType, sourceReg, targetReg, sourceBaseReg)
                    config.chainInstrs += 'ORP{0} {1}, {1}; '.format(chainDataType, sourceReg) * cRep
                    config.chainLatency = basicLatency['ORP' + chainDataType] * (cRep+2) + basicLatency['ANDP' + chainDataType]
                configList.append(config)
                configList.isUpperBound = True
        return configLists
|
|
|
|
# Collects, from instrNodeList, instances of all instructions that can be used as chain instructions from startReg to
# targetReg, i.e., that read startReg and write targetReg (including cases where only part of a reg is read/written).
# Only instructions with the same 'vex' and 'evex' attributes as instrNode are considered. If the iclass of instrNode ends
# with a floating-point data type suffix, chain instructions with a different floating-point suffix are skipped.
def getAllChainInstrsFromRegToReg(instrNode, startReg, targetReg):
    fpSuffixes = ['PD', 'PS', 'SD', 'SS', 'PH', 'SH']
    fpSuffix = instrNode.attrib['iclass'][-2:]
    if fpSuffix not in fpSuffixes:
        fpSuffix = ''

    def isCompatible(candidate, wanted):
        # GP registers are compatible if they belong to the same 64-bit register; other registers are compatible if their
        # names agree after the first character, and the candidate is not larger than the wanted register
        if candidate in GPRegs:
            return wanted in GPRegs and regTo64(wanted) == regTo64(candidate)
        return wanted[1:] == candidate[1:] and getRegSize(candidate) <= getRegSize(wanted)

    chainInstrInstances = []
    for candNode in instrNodeList:
        if any(instrNode.attrib.get(enc, '0') != candNode.attrib.get(enc, '0') for enc in ('vex', 'evex')):
            continue

        candClass = candNode.attrib['iclass']
        if fpSuffix and any((d in candClass) for d in fpSuffixes) and fpSuffix not in candClass:
            continue

        for readNode in candNode.findall('./operand[@type="reg"][@r="1"]'):
            readCandidates = [r for r in readNode.text.split(',') if isCompatible(r, startReg)]
            if not readCandidates:
                continue
            readReg = readCandidates[0]

            for writeNode in candNode.findall('./operand[@type="reg"][@w="1"]'):
                writeCandidates = [r for r in writeNode.text.split(',') if r != readReg and isCompatible(r, targetReg)]
                if not writeCandidates:
                    continue
                writeReg = writeCandidates[0]

                regAssignment = {int(readNode.attrib['idx']): readReg, int(writeNode.attrib['idx']): writeReg}
                chainInstrInstances.append(getInstrInstanceFromNode(candNode, [readReg, writeReg], [readReg, writeReg], True, regAssignment))
    return chainInstrInstances
|
|
|
|
|
|
# Returns a list of LatConfigs for instrI whose chain instructions load from the memory location [addrReg] into targetReg.
#   - MMX target registers: a single MOVQ is used
#   - vector target registers: a load into XMM13 is used, followed by a chain to the XMM register with the number of targetReg
#     (one config with floating-point and, unless the target is a YMM register of an AVX instruction, one with integer
#     chain instructions)
#   - other target registers: all instructions in instrNodeList with a matching memory operand that write targetReg are used
def getLatConfigsFromMemToReg(instrNode, instrI, memOpNode, targetReg, addrReg, cRep):
    if targetReg.startswith('MM'):
        return [LatConfig(instrI, chainInstrs='MOVQ ' + targetReg + ', [' + addrReg + '];', chainLatency=1)]

    configs = []

    if 'MM' in targetReg:
        memWidth = int(memOpNode.attrib['width'])
        # (floating-point load, integer load) depending on the width of the memory operand
        loadFP, loadInt = {32: ('MOVSS', 'MOVD'), 64: ('MOVSD', 'MOVQ')}.get(memWidth, ('MOVUPD', 'MOVDQU'))

        if isAVXInstr(instrNode):
            loadFP = 'V' + loadFP
            loadInt = 'V' + loadInt

        loadOperands = ' XMM13, [' + addrReg + '];'
        xmmTarget = 'XMM' + targetReg[3:]

        fillFP, fillLatFP = getChainInstrForVectorRegs(instrNode, 'XMM13', xmmTarget, cRep, 'FP')
        configs.append(LatConfig(instrI, chainInstrs=loadFP + loadOperands + fillFP, chainLatency=1+fillLatFP))

        # integers in YMM registers are only supported by AVX>=2
        if not (targetReg[0:3] == 'YMM' and instrNode.attrib['extension'] == 'AVX'):
            fillInt, fillLatInt = getChainInstrForVectorRegs(instrNode, 'XMM13', xmmTarget, cRep, 'Int')
            configs.append(LatConfig(instrI, chainInstrs=loadInt + loadOperands + fillInt, chainLatency=1+fillLatInt))
        return configs

    for candNode in instrNodeList:
        if any(instrNode.attrib.get(enc, '0') != candNode.attrib.get(enc, '0') for enc in ('vex', 'evex')):
            continue

        for candMemNode in candNode.findall('./operand[@type="mem"][@r="1"]'):
            if candMemNode.attrib.get('suppressed', '0') == '1':
                continue
            if memOpNode.attrib['width'] != candMemNode.attrib['width']:
                continue
            if memOpNode.attrib.get('VSIB', '') != candMemNode.attrib.get('VSIB', ''):
                continue

            writesTarget = [w for w in candNode.findall('./operand[@type="reg"][@w="1"]') if targetReg in w.text.split(',')]
            for candRegNode in writesTarget:
                if candRegNode.attrib.get('optional', '') == '1':
                    continue
                chainInstance = getInstrInstanceFromNode(candNode, [targetReg], [targetReg], True, {int(candRegNode.attrib['idx']):targetReg})
                configs.append(LatConfig(instrI, chainInstrs=chainInstance.asm, chainLatency=1))
    return configs
|
|
|
|
# Returns a list of LatConfigs for instrI whose chain instructions store reg to the memory location [addrReg].
#   - MMX registers: a single MOVQ is used
#   - vector registers: a chain from reg to register 13 of the register class that corresponds to memWidth is used, followed
#     by a store of that register (one config with floating-point and, unless it is a YMM register of an AVX instruction,
#     one with integer chain instructions)
#   - other registers: not implemented yet; an empty list is returned
def getLatConfigsFromRegToMem(instrNode, instrI, reg, addrReg, memWidth, cRep):
    configs = []

    if reg.startswith('MM'):
        configs.append(LatConfig(instrI, chainInstrs='MOVQ [' + addrReg + '], ' + reg + ';', chainLatency=1))
        return configs

    if 'MM' not in reg:
        # ToDo
        return configs

    # store instructions (floating-point, integer) and register class for the given memory width
    if memWidth <= 32:
        storeFP, storeInt, regClass = 'MOVSS', 'MOVD', 'XMM'
    elif memWidth == 64:
        storeFP, storeInt, regClass = 'MOVSD', 'MOVQ', 'XMM'
    elif memWidth == 128:
        storeFP, storeInt, regClass = 'MOVUPD', 'MOVDQU', 'XMM'
    elif memWidth == 256:
        storeFP, storeInt, regClass = 'MOVUPD', 'MOVDQU', 'YMM'
    elif memWidth == 512:
        storeFP, storeInt, regClass = 'MOVUPD', 'MOVDQU64', 'ZMM'

    if isAVXInstr(instrNode):
        storeFP = 'V' + storeFP
        storeInt = 'V' + storeInt

    chainStart = regClass + reg[3:]
    chainEnd = regClass + '13'
    storeOperands = ' [' + addrReg + '], ' + chainEnd + ';'

    fillFP, fillLatFP = getChainInstrForVectorRegs(instrNode, chainStart, chainEnd, cRep, 'FP')
    configs.append(LatConfig(instrI, chainInstrs=fillFP + storeFP + storeOperands, chainLatency=1+fillLatFP))

    # integers in YMM registers are only supported by AVX>=2
    if not (regClass == 'YMM' and instrNode.attrib['extension'] == 'AVX'):
        fillInt, fillLatInt = getChainInstrForVectorRegs(instrNode, chainStart, chainEnd, cRep, 'Int')
        configs.append(LatConfig(instrI, chainInstrs=fillInt + storeInt + storeOperands, chainLatency=1+fillLatInt))

    return configs
|
|
|
|
# Returns a tuple (asm, latency) for a dependency chain between vector registers: one instruction that reads startReg and
# writes targetReg, followed by cRep instructions that read and write targetReg. The latency is computed from basicLatency.
#   cType: 'FP' for floating-point chain instructions; for any other value, integer chain instructions are used
def getChainInstrForVectorRegs(instrNode, startReg, targetReg, cRep, cType):
    useAVX = isAVXInstr(instrNode)

    if cType == 'FP':
        # We use (V)SHUFPD instead of (V)MOV*PD because the latter is a 0-latency operation on some CPUs in some cases
        if arch in ['ZEN+', 'ZEN2', 'ZEN3', 'ZEN4', 'ZEN5']:
            # on ZEN, all shuffles are integer operations
            mnemonic = 'VANDPD'
            firstInstr = 'VANDPD {0}, {1}, {1};'
            repeatedInstr = 'VANDPD {0}, {0}, {0};'
        elif useAVX:
            mnemonic = 'VSHUFPD'
            firstInstr = 'VSHUFPD {0}, {1}, {1}, 0;'
            repeatedInstr = 'VSHUFPD {0}, {0}, {0}, 0;'
        else:
            mnemonic = 'SHUFPD'
            firstInstr = 'SHUFPD {0}, {1}, 0;'
            repeatedInstr = 'SHUFPD {0}, {0}, 0;'

        asmFP = firstInstr.format(targetReg, startReg) + repeatedInstr.format(targetReg) * cRep
        return (asmFP, basicLatency[mnemonic] * (cRep+1))

    # We use (V)PAND instead of shuffles, because they can use more ports (https://github.com/andreas-abel/nanoBench/issues/23)
    if useAVX:
        mnemonic = 'VPAND'
        if 'ZMM' in targetReg:
            mnemonic = 'VPANDD'
        asmInt = '{0} {1}, {2}, {2};'.format(mnemonic, targetReg, startReg)
        asmInt += '{0} {1}, {1}, {1};'.format(mnemonic, targetReg) * cRep
        latInt = basicLatency[mnemonic] * (cRep+1)
    else:
        # we use one shuffle to avoid a read dependency on the target register
        asmInt = 'PSHUFD {}, {}, 0;'.format(targetReg, startReg) + 'PAND {0}, {0};'.format(targetReg) * cRep
        latInt = basicLatency['PSHUFD'] + basicLatency['PAND'] * cRep
    return (asmInt, latInt)
|
|
|
|
|
|
# A single latency measurement configuration.
#   instrI:       instance of the instruction that is measured
#   chainInstrs:  assembler code of the chain instructions
#   chainLatency: latency of the chain instructions
#   init:         list of assembler instructions for the initialization (a new empty list if None)
#   basicMode:    stored as is; it is not interpreted by this class
#   notes:        list of notes (a new empty list if None)
class LatConfig:
    def __init__(self, instrI, chainInstrs='', chainLatency=0, init=None, basicMode=False, notes=None):
        # None is used as the default so that instances do not share the same list objects
        if init is None:
            init = []
        if notes is None:
            notes = []

        self.instrI = instrI
        self.chainInstrs = chainInstrs
        self.chainLatency = chainLatency
        self.init = init
        self.basicMode = basicMode
        self.notes = notes
|
|
|
|
# A list of LatConfigs, together with a flag that indicates whether the result is an upper bound, and a list of notes.
# The sameReg parameter of the constructor is accepted, but it is not stored.
class LatConfigList:
    def __init__(self, latConfigs=None, sameReg = False, isUpperBound=False, notes=None):
        # None is used as the default so that instances do not share the same list objects
        if latConfigs is None:
            latConfigs = []
        if notes is None:
            notes = []

        self.latConfigs = latConfigs
        self.isUpperBound = isUpperBound
        self.notes = notes

    # adds a single LatConfig
    def append(self, latConfig):
        self.latConfigs.append(latConfig)

    # adds all LatConfigs from an iterable
    def extend(self, latConfigs):
        self.latConfigs.extend(latConfigs)
|
|
|
|
# Record type for latency results with the fields minLat, maxLat, lat_sameReg, and isUpperBound.
LatResult = namedtuple('LatResult', 'minLat maxLat lat_sameReg isUpperBound')
|
|
|
|
def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem, tpDict):
    """Build the latency experiments for the dependency startNode -> targetNode of instrNode.

    For each supported combination of operand types (reg, flags, mem/agen) a
    chain of helper instructions is constructed that feeds the target operand
    back into the start operand; the latency of this chain (taken from
    basicLatency) is stored next to the chain so that it can be subtracted
    from the measurement later.

    Parameters:
        instrNode: XML node of the instruction.
        startNode, targetNode: XML operand nodes (source and destination of the dependency).
        useDistinctRegs: forwarded to getInstrInstanceFromNode; for reg -> reg it also
            controls whether the two operands may use the same register.
        addrMem: for memory start operands, selects the address ('addr', 'addr_index',
            'addr_VSIB') or the memory content (any other value, e.g. 'mem').
        tpDict: maps instrNode to an object with a TP_single attribute; used to scale
            the number of chain-instruction repetitions.

    Returns:
        None if the combination is not supported or no configuration could be created;
        the result of getDivLatConfigLists for division/square-root instructions;
        otherwise a one-element list containing a LatConfigList.
    """
    cRep = min(100, 2 + 2 * int(math.ceil(tpDict[instrNode].TP_single / 2))) # must be a multiple of 2

    if isDivOrSqrtInstr(instrNode):
        # divisions and square roots are handled by a dedicated helper; unsupported cases are filtered here
        if not useDistinctRegs: return None
        if targetNode.attrib['type'] == 'flags': return None
        if addrMem == 'mem': return None
        if startNode.attrib.get('opmask', '') == '1' or targetNode.attrib.get('opmask', '') == '1': return None
        if instrNode.attrib.get('mask', '') == '1' and (startNode == targetNode): return None
        return getDivLatConfigLists(instrNode, startNode, targetNode, cRep)

    startNodeIdx = int(startNode.attrib['idx'])
    targetNodeIdx = int(targetNode.attrib['idx'])

    suppressedStart = startNode.attrib.get('suppressed', '0') == '1'
    suppressedTarget = targetNode.attrib.get('suppressed', '0') == '1'

    instrReadsFlags = len(instrNode.findall('./operand[@type="flags"][@r="1"]')) > 0

    configList = LatConfigList()

    if instrNode.attrib['iclass'] == 'LEAVE':
        if startNode.text and targetNode.text and 'BP' in startNode.text and 'BP' in targetNode.text:
            chainInstrs = 'MOVSX RBP, BP; ' * cRep
            chainInstrs += 'AND RBP, R14; OR RBP, R14; '
            chainLatency = basicLatency['MOVSX_R64_R8'] * cRep + basicLatency['AND'] + basicLatency['OR']
            configList.append(LatConfig(getInstrInstanceFromNode(instrNode), chainInstrs=chainInstrs, chainLatency=chainLatency))
        elif startNode.text and targetNode.text and 'BP' in startNode.text and 'SP' in targetNode.text:
            chainInstrs = 'MOVSX RBP, SP; '
            chainInstrs += 'MOVSX RBP, BP; ' * cRep
            chainInstrs += 'AND RBP, R14; OR RBP, R14; '
            chainLatency = basicLatency['MOVSX_R64_R8'] + basicLatency['MOVSX_R64_R8'] * cRep + basicLatency['AND'] + basicLatency['OR']
            configList.append(LatConfig(getInstrInstanceFromNode(instrNode), chainInstrs=chainInstrs, chainLatency=chainLatency))
        else:
            return None
    elif instrNode.attrib['iclass'] == 'MOVDIR64B':
        if (startNodeIdx == 1) and (targetNodeIdx == 3):
            # the instance with operand 1 pinned to RSI is the one that matches the chain below
            instrI = getInstrInstanceFromNode(instrNode, opRegDict={1: 'RSI'})
            chainInstrs = 'MOV RSI, [RSI]'
            configList.isUpperBound = True
            # init must be a list: consumers concatenate it with other lists
            configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=1, init=['MOV [R14], RSI']))
        else:
            return None
    elif instrNode.attrib['iclass'] == 'XGETBV':
        if startNode.text == 'ECX':
            chainInstrs = 'MOVSX ECX, {}; '.format(regTo16(targetNode.text))
            chainInstrs += 'MOVSX ECX, CX; ' * cRep
            chainInstrs += 'AND ECX, 0; '
            chainLatency = basicLatency['MOVSX_R32_R16'] * (cRep + 1) + basicLatency['AND']
            configList.append(LatConfig(getInstrInstanceFromNode(instrNode), chainInstrs=chainInstrs, chainLatency=chainLatency))
        else:
            return None
    elif startNode.text == 'RSP' or targetNode.text == 'RSP':
        # we ignore operands that modify the stack pointer, as these are usually handled by the stack engine in the issue stage of the pipeline, and
        # thus would not lead to meaningful results
        return None
    elif (startNode.text and 'RIP' in startNode.text) or (targetNode.text and 'RIP' in targetNode.text) or 'R' in instrNode.attrib.get('agen', ''):
        return None
    elif startNode.attrib['type'] == 'reg':
        #################
        # reg -> ...
        #################
        regs1 = set(startNode.text.split(","))-globalDoNotWriteRegs-specialRegs-memRegs

        if not regs1: return None

        if targetNode.attrib['type'] == 'reg':
            #################
            # reg -> reg
            #################
            regs2 = set(targetNode.text.split(","))-globalDoNotWriteRegs-specialRegs-memRegs

            if not regs2:
                return None

            if startNode == targetNode:
                reg1 = sortRegs(regs1)[0]
                reg2 = reg1
            else:
                if len(regs2) == 1:
                    # the target register is fixed; try to find a start register that does not alias it
                    reg2 = sortRegs(regs2)[0]
                    otherRegs = [x for x in regs1 if getCanonicalReg(x) != getCanonicalReg(reg2)]
                    if otherRegs:
                        reg1 = sortRegs(otherRegs)[0]
                    else:
                        reg1 = sortRegs(regs1)[0]
                else:
                    reg1 = sortRegs(regs1)[0]
                    reg2 = sortRegs(regs2)[0]
                    if not useDistinctRegs:
                        # prefer a target register that is (or aliases) the start register
                        if reg1 in regs2:
                            reg2 = reg1
                        else:
                            for r in regs2:
                                if getCanonicalReg(r) == getCanonicalReg(reg1):
                                    reg2 = r
                                    break
                    else:
                        otherRegs = [x for x in regs2 if getCanonicalReg(x) != getCanonicalReg(reg1)]
                        if otherRegs:
                            reg2 = sortRegs(otherRegs)[0]

            instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs, opRegDict={startNodeIdx:reg1, targetNodeIdx:reg2})

            if reg1 == reg2:
                # the instruction depends on itself; also measure it without any chain instructions
                configList.append(LatConfig(instrI))

            reg1Prefix = re.sub(r'\d', '', reg1)
            reg2Prefix = re.sub(r'\d', '', reg2)

            if reg1 in GPRegs and reg2 in GPRegs:
                if reg1 in High8Regs:
                    if reg2 in High8Regs:
                        chainInstrs = 'MOV {}, {};'.format(reg1, reg2)
                        chainInstrs += 'MOV {}, {};'.format(reg1, reg1) * cRep
                        configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=basicLatency['MOV_R8h_R8h']*(cRep+1)))
                    elif reg2 in Low8Regs:
                        chainInstrs = 'MOV {}, {};'.format(reg1, reg2)
                        chainInstrs += 'MOV {}, {};'.format(reg1, reg1) * cRep
                        configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=basicLatency['MOV_R8h_R8l'] + basicLatency['MOV_R8h_R8h']*cRep))
                    else:
                        reg2Size = min(32, getRegSize(reg2))
                        chainInstrs = 'MOVSX {}, {};'.format(regTo64(reg1), regToSize(reg2, reg2Size))
                        chainInstrs += 'MOV {}, {};'.format(reg1, regTo8(reg1))
                        chainInstrs += 'MOV {}, {};'.format(reg1, reg1) * cRep
                        configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=basicLatency['MOVSX_R64_R'+str(reg2Size)]
                                                    + basicLatency['MOV_R8h_R8l'] + basicLatency['MOV_R8h_R8h']*cRep))
                else:
                    # MOVSX avoids partial reg stalls and cannot be eliminated by "move elimination"
                    if reg2 in High8Regs:
                        chainInstrs = 'MOVSX {}, {};'.format(regTo32(reg1), reg2)
                        chainInstrs += 'MOVSX {}, {};'.format(regTo64(reg1), regTo32(reg1)) * cRep
                        configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=basicLatency['MOVSX_R32_R8h'] + basicLatency['MOVSX_R64_R32']*cRep))
                    else:
                        reg2Size = min(32, getRegSize(reg2))
                        chainInstrs = 'MOVSX {}, {};'.format(regTo64(reg1), regToSize(reg2, reg2Size))
                        chainInstrs += 'MOVSX {}, {};'.format(regTo64(reg1), regTo32(reg1)) * cRep
                        configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=basicLatency['MOVSX_R64_R'+str(reg2Size)]
                                                    + basicLatency['MOVSX_R64_R32']*cRep))
            elif reg1Prefix == 'K' and reg2Prefix == 'K':
                chainInstr = 'KMOVQ {}, {};'.format(reg1, reg2)
                chainInstr += 'KMOVQ {0}, {0};'.format(reg1) * cRep
                configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=basicLatency['KMOVQ']*(cRep+1)))
            elif reg1Prefix == 'K' and reg2Prefix[1:] == 'MM':
                # we test with both VPMOVQ2M and VPTESTNMQ (as, e.g., VPMAXUB ZMM has a higher latency with the former for some unknown reason)
                chainInstr1 = 'VPMOVQ2M ' + reg1 + ', ' + reg2 + ';'
                configList.append(LatConfig(instrI, chainInstrs=chainInstr1, chainLatency=basicLatency['VPMOVQ2M_'+reg2Prefix]))
                chainInstr2 = 'VPTESTNMQ ' + reg1 + ' {' + reg1 + '}, ' + reg2 + ', ' + reg2 + ';'
                configList.append(LatConfig(instrI, chainInstrs=chainInstr2, chainLatency=basicLatency['VPTESTNMQ_'+reg2Prefix]))
            elif reg1Prefix[1:] == 'MM' and reg2Prefix == 'K':
                chainInstr = 'VMOVUPS ' + reg1 + ' {' + reg2 + '}, ' + reg1Prefix + '14;'
                configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=basicLatency['VMOVUPS_'+reg1Prefix+'_K']))
            elif reg1Prefix[1:] == reg2Prefix[1:]:
                # if the registers have different widths, use the same width for both in the chain;
                # min() selects the lexicographically smaller prefix ('XMM' < 'YMM' < 'ZMM'), i.e., the narrower register
                reg1 = reg1.replace(reg1Prefix, min(reg1Prefix, reg2Prefix))
                reg2 = reg2.replace(reg2Prefix, min(reg1Prefix, reg2Prefix))

                if reg1Prefix =='MM':
                    chainInstr = 'MOVQ {}, {};'.format(reg1, reg2)
                    chainInstr += 'MOVQ {0}, {0};'.format(reg1) * cRep
                    configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=basicLatency['MOVQ']*(cRep+1)))
                elif reg1Prefix in ['XMM', 'YMM', 'ZMM']:
                    chainInstrFP, chainLatencyFP = getChainInstrForVectorRegs(instrNode, reg2, reg1, cRep, 'FP')
                    configList.append(LatConfig(instrI, chainInstrs=chainInstrFP, chainLatency=chainLatencyFP))

                    if not (reg1Prefix == 'YMM' and instrNode.attrib['extension'] == 'AVX'): # integers in YMM registers are only supported by AVX>=2
                        chainInstrInt, chainLatencyInt = getChainInstrForVectorRegs(instrNode, reg2, reg1, cRep, 'Int')
                        configList.append(LatConfig(instrI, chainInstrs=chainInstrInt, chainLatency=chainLatencyInt))
                else:
                    print('invalid reg prefix: ' + reg1Prefix)
                    return None
            else:
                configList.isUpperBound = True
                # find all other instrs from reg2 to reg1
                for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, reg2, reg1):
                    configList.append(LatConfig(instrI, chainInstrs=chainInstrI.asm, chainLatency=1))
        elif targetNode.attrib['type'] == 'flags':
            #################
            # reg -> flags
            #################

            reg = sortRegs(regs1)[0]

            # one configuration per status flag that the instruction writes
            for flag in STATUSFLAGS_noAF:
                if not ('flag_'+flag) in targetNode.attrib: continue
                if not 'w' in targetNode.attrib[('flag_'+flag)]: continue

                if reg in GPRegs:
                    regSize = getRegSize(reg)
                    if regSize == 8:
                        chainInstr = 'SET{} {};'.format(flag[0], reg)
                        if reg in High8Regs:
                            if 'SET' + flag[0] + '_R8h' in basicLatency:
                                chainLatency = basicLatency['SET' + flag[0] + '_R8h']
                            else:
                                chainLatency = 1
                                configList.isUpperBound = True
                        else:
                            chainLatency = basicLatency['SET' + flag[0]]
                    else:
                        chainInstr = 'CMOV{} {}, {};'.format(flag[0], regToSize('R15', regSize), regToSize('R15', regSize))
                        r15Size = min(32, regSize)
                        chainInstr += 'MOVSX {}, {};'.format(regTo64(reg), regToSize('R15', r15Size))
                        chainLatency = basicLatency['CMOV' + flag[0]] + basicLatency['MOVSX_R64_R'+str(r15Size)]
                    instrI = getInstrInstanceFromNode(instrNode, ['R15'], ['R15'], useDistinctRegs, {startNodeIdx:reg})

                    if reg in High8Regs:
                        movInstr = 'MOV {}, {};'.format(reg, reg)
                        chainInstrs = chainInstr + movInstr * cRep
                        chainLatency = chainLatency + basicLatency['MOV_R8h_R8h'] * cRep
                    else:
                        reg2Size = min(32, regSize)
                        movsxInstr = 'MOVSX {}, {};'.format(regTo64(reg), regToSize(reg, reg2Size))
                        chainInstrs = chainInstr + movsxInstr * cRep
                        chainLatency = chainLatency + basicLatency['MOVSX_R64_R'+str(reg2Size)] * cRep

                    configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
                elif 'MM' in reg:
                    instrI = getInstrInstanceFromNode(instrNode, ['R12', 'R15'], ['R12', 'R15'], True, {startNodeIdx:reg})
                    configList.isUpperBound = True
                    for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'R12', reg):
                        chainInstrs = 'CMOV' + flag[0] + ' R12, R15; ' + chainInstrI.asm
                        chainLatency = basicLatency['CMOV' + flag[0]] + 1
                        configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
        elif targetNode.attrib['type'] == 'mem':
            #################
            # reg -> mem
            #################

            reg = sortRegs(regs1)[0]
            addrReg = getAddrReg(instrNode, targetNode)

            if reg in GPRegs:
                instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs, opRegDict={startNodeIdx:reg})

                configList.isUpperBound = True
                chainInstrs = 'MOV {}, [{}];'.format(reg, addrReg)

                if reg in High8Regs:
                    chainInstrs += 'MOV {}, {};'.format(reg, reg) * cRep
                    chainLatency = basicLatency['MOV_R8h_R8h'] * cRep
                else:
                    reg2Size = min(32, getRegSize(reg))
                    chainInstrs += 'MOVSX {}, {};'.format(regTo64(reg), regToSize(reg, reg2Size)) * cRep
                    chainLatency = basicLatency['MOVSX_R64_R'+str(reg2Size)] * cRep
                chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(getRegSize(reg))] >= 12) # 0 if CPU supports zero-latency store forwarding

                if re.search('BT.*MEMv_GPRv', instrNode.attrib['iform']):
                    chainInstrs += 'AND ' + reg + ', 0;'
                    chainLatency += basicLatency['AND']

                configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
            elif 'MM' in reg:
                if suppressedTarget:
                    # ToDo: only happens in the case of maskmovdqu
                    pass
                else:
                    instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=True, opRegDict={startNodeIdx:reg})
                    configList.isUpperBound = True
                    configList.extend(getLatConfigsFromMemToReg(instrNode, instrI, targetNode, reg, addrReg, cRep))
            else:
                # ToDo
                print('unsupported reg to mem')
                return None
    elif startNode.attrib['type'] == 'flags':
        #################
        # flags -> ...
        #################
        if targetNode.attrib['type'] == 'reg':
            #################
            # flags -> reg
            #################
            regs = set(targetNode.text.split(','))-globalDoNotWriteRegs-specialRegs-memRegs
            if not regs: return None

            reg = sortRegs(regs)[0]

            if reg in GPRegs:
                instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs, opRegDict={targetNodeIdx:reg})
                chainInstrs = 'TEST {0}, {0};'.format(reg)
                if reg in High8Regs:
                    if 'TEST_R8h_R8h' in basicLatency:
                        chainLatency = basicLatency['TEST_R8h_R8h']
                    else:
                        chainLatency = 1
                        configList.isUpperBound = True
                else:
                    chainLatency = basicLatency['TEST']
                # first configuration: only the TEST instruction as chain
                configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))

                # second configuration: additional register-to-register moves before the TEST
                if reg in High8Regs:
                    chainInstrs = 'MOV {}, {};'.format(reg, reg) * cRep + chainInstrs
                    chainLatency += basicLatency['MOV_R8h_R8h'] * cRep
                else:
                    reg2Size = min(32, getRegSize(reg))
                    chainInstrs = 'MOVSX {}, {};'.format(regTo64(reg), regToSize(reg, reg2Size)) * cRep + chainInstrs
                    chainLatency += basicLatency['MOVSX_R64_R'+str(reg2Size)] * cRep
                configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
            else:
                # ToDo: there is no instruction from flag to vector reg; the only non-GPR that is possible are ST(0) and X87STATUS
                return None
        elif targetNode.attrib['type'] == 'flags':
            #################
            # flags -> flag
            #################

            instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs)
            configList.append(LatConfig(instrI))

            cfModifiers = startNode.attrib.get('flag_CF', '')
            if ('r' in cfModifiers and 'w' in cfModifiers) or ('cw' in cfModifiers):
                chainInstrs = 'CMC;'*cRep
                configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=basicLatency['CMC']*cRep))
        elif targetNode.attrib['type'] == 'mem':
            #################
            # flags -> mem
            #################
            instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs)
            chainInstr = 'TEST ' + targetNode.attrib['memory-prefix'] + ' [' + getAddrReg(instrNode, targetNode) + '], 1'
            configList.isUpperBound = True
            # we use basicMode, as the measurements for these benchmarks are often not very stable, in particular on, e.g., HSW
            configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=1, basicMode=True))
    elif startNode.attrib['type'] in ['agen', 'mem']:
        #################
        # mem -> ...
        #################
        if startNode.attrib.get('r', '0') == '0' and targetNode != startNode:
            # for memory writes, only the dependency address -> memory is interesting
            return None

        addrReg = getAddrReg(instrNode, startNode)
        indexReg = getIndexReg(instrNode, startNode)
        memWidth = int(startNode.attrib.get('width', 0))

        if targetNode.attrib['type'] == 'reg':
            #################
            # mem -> reg
            #################
            regs = set(targetNode.text.split(","))
            if not suppressedTarget: regs -= globalDoNotWriteRegs | specialRegs | memRegs
            if not regs: return None
            reg = sortRegs(regs)[0]
            regSize = getRegSize(reg)

            if suppressedStart:
                if not regs.issubset(GPRegs):
                    print('read from suppressed mem to non-GPR reg not yet supported')
                    return None

            instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs, {targetNodeIdx:reg},
                                              useIndexedAddr=(addrMem=='addr_index'))

            if reg in GPRegs:
                if addrMem in ['addr', 'addr_index']:
                    # addr -> reg
                    chainReg = (addrReg if addrMem == 'addr' else indexReg)
                    reg2Size = min(32, regSize)
                    chainInstrs = 'MOVSX ' + regTo64(reg) + ', ' + regToSize(reg, reg2Size) + ';'
                    chainLatency = basicLatency['MOVSX_R64_R'+str(reg2Size)]
                    if chainReg != regTo64(reg):
                        chainInstrs += 'XOR {}, {};'.format(chainReg, regTo64(reg)) * cRep + ('TEST R15, R15;' if instrReadsFlags else '') # cRep is a multiple of 2
                        chainLatency += basicLatency['XOR'] * cRep
                else:
                    # mem -> reg
                    configList = LatConfigList()
                    configList.isUpperBound = True
                    reg2Size = min(32, regSize)
                    chainInstrs = 'MOVSX R12, {};'.format(regToSize(reg, reg2Size))
                    chainInstrs += 'MOVSX R12, R12d;' * (cRep-1)
                    chainInstrs += 'mov [{}], {};'.format(addrReg, regToSize('R12', memWidth))
                    chainLatency = basicLatency['MOVSX_R64_R'+str(reg2Size)] + basicLatency['MOVSX_R64_R32'] * (cRep-1)
                    chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(regSize)] >= 12) # 0 if CPU supports zero-latency store forwarding
                if reg in High8Regs:
                    chainInstrs = 'MOVSX {}, {};'.format(regTo32(reg), reg) + chainInstrs
                    chainInstrs += 'MOV {}, {}'.format(reg, reg) # 'clean' reg again; this is not on the critical path
                    chainLatency += basicLatency['MOVSX_R32_R8h']
                configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
            elif 'MM' in reg:
                if addrMem in ['addr', 'addr_index']:
                    # addr -> reg
                    configList.isUpperBound = True
                    chainReg = (addrReg if addrMem == 'addr' else indexReg)
                    chainInstrs = 'MOVQ R12, {};'.format(getCanonicalReg(reg))
                    if isAVXInstr(instrNode):
                        chainInstrs = 'V' + chainInstrs
                    chainInstrs += 'XOR {}, {};'.format(chainReg, 'R12') * cRep + ('TEST R15, R15;' if instrReadsFlags else '') # cRep is a multiple of 2
                    chainLatency = 1 + basicLatency['XOR'] * cRep
                    configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
                elif addrMem == 'addr_VSIB':
                    # addr_VSIB -> reg
                    configList.isUpperBound = True
                    chainInstrs = 'VANDPD {0}14, {0}14, {0}{1};'.format(startNode.attrib['VSIB'], reg[3:]) * cRep
                    chainLatency = basicLatency['VANDPD'] * cRep
                    configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
                else:
                    # mem -> reg
                    configList.isUpperBound = True
                    configList.extend(getLatConfigsFromRegToMem(instrNode, instrI, reg, addrReg, memWidth, cRep))
        elif targetNode.attrib['type'] == 'flags':
            #################
            # mem -> flags
            #################
            for flag in STATUSFLAGS_noAF:
                if not ('flag_'+flag) in targetNode.attrib: continue
                if not 'w' in targetNode.attrib[('flag_'+flag)]: continue

                instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs,
                                                  useIndexedAddr=(addrMem=='addr_index'))

                if addrMem in ['addr', 'addr_index']:
                    # addr -> flag
                    chainReg = (addrReg if addrMem == 'addr' else indexReg)
                    chainInstr = 'CMOV' + flag[0] + ' ' + chainReg + ', ' + chainReg
                    chainLatency = basicLatency['CMOV' + flag[0]]
                    configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=chainLatency))
                else:
                    # mem -> flag
                    if memWidth <= 64:
                        configList.isUpperBound = True
                        chainInstrs = 'CMOV' + flag[0] + ' R12, R12;'
                        chainInstrs += 'MOVSX R12, R12d;' * cRep
                        chainInstrs += 'mov [' + addrReg + '], ' + regToSize('R12', memWidth)
                        chainLatency = basicLatency['CMOV' + flag[0]] + basicLatency['MOVSX_R64_R32'] * cRep
                        chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(memWidth)] >= 12) # 0 if CPU supports zero-latency store forwarding
                        configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
                    else:
                        # ToDo
                        pass
        elif targetNode.attrib['type'] == 'mem':
            #################
            # mem -> mem
            #################
            if startNode == targetNode:
                instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs=useDistinctRegs,
                                                  useIndexedAddr=(addrMem=='addr_index'))

                if addrMem in ['addr', 'addr_index']:
                    # addr -> mem
                    configList.isUpperBound = True
                    chainReg = (addrReg if addrMem == 'addr' else indexReg)
                    memStr = addrReg + ('+'+indexReg if addrMem == 'addr_index' else '')
                    chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + memStr + '];'
                    reg2Size = min(32, memWidth)
                    chainInstrs += ('MOVSX R12, ' + regToSize('R12', reg2Size) + ';') * cRep
                    chainInstrs += 'XOR ' + chainReg + ', R12; XOR ' + chainReg + ', R12;' + ('TEST R15, R15;' if instrReadsFlags else '')
                    chainLatency = basicLatency['MOVSX_R64_R'+str(reg2Size)] * cRep + 2*basicLatency['XOR']
                    chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding
                    # we use basicMode, as the measurements for these benchmarks are often not very stable, in particular on, e.g., HSW
                    configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency, basicMode=True))
                    # on some microarch. (e.g., HSW), an additional nop instr. can sometimes lead to a better port scheduling
                    configList.append(LatConfig(instrI, chainInstrs=chainInstrs + 'nop;', chainLatency=chainLatency, basicMode=True, notes=['with additional nop']))
                else:
                    # mem -> mem
                    if startNode.attrib.get('r','0')=='1':
                        configList = LatConfigList()
                        # we use basicMode, as the measurements for these benchmarks are often not very stable, in particular on, e.g., HSW
                        configList.append(LatConfig(instrI, basicMode=True))

                        if memWidth <= 64:
                            chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + addrReg + '];'
                            chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';')*10
                            chainInstrs += ('MOV [' + addrReg + '], ' + regToSize('R12', min(64, memWidth)))
                            chainLatency = basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))]
                            configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency, basicMode=True))
                        else:
                            # ToDo
                            pass
            else:
                # ToDo
                return None

    if not configList.latConfigs: return None
    return [configList]
|
|
|
|
|
|
def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports):
    """Determine the latencies between the operands of instrNode.

    With useIACA set, the latency is parsed from the output of IACA (only for
    iacaVersion '2.1'). Otherwise, for every pair of an input and an output
    operand, the experiments returned by getLatConfigLists are run, the chain
    latency is subtracted from the measured core cycles, and the results are
    stored as attributes of a 'latency' XML element below the 'measurement'
    node of the current architecture.

    Parameters:
        instrNode: XML node of the instruction.
        instrNodeList: not used by this function.
        tpDict: forwarded to getLatConfigLists; tpDict[instrNode].TP_single is also used
            to decide whether dependency-breaking variants are created.
        tpDictSameReg: if it contains instrNode, the experiments are additionally run
            with the same register for different operands.
        htmlReports: list to which HTML fragments describing the experiments are appended.

    Returns:
        In IACA mode: the latency as reported by IACA (a string), or None on errors and
        for IACA versions other than '2.1'.
        Otherwise: None for instructions that are excluded from latency measurements,
        else the largest latency recorded for any operand pair (0 if none was recorded).
    """
    if useIACA:
        createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode).asm)

        if iacaVersion == '2.1':
            try:
                subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o'])
                iaca_lat = subprocess.check_output(iacaCMDLine + ['-analysis', 'LATENCY', '/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT).decode()
            except subprocess.CalledProcessError as e:
                logging.warning('Error: ' + e.output.decode())
                htmlReports.append('<pre>' + e.output.decode() + '</pre>\n')
                return None

            htmlReports.append('<pre>' + iaca_lat + '</pre>\n')

            if '!' in iaca_lat or not 'Latency' in iaca_lat:
                print('IACA error')
                return None

            latency = iaca_lat.split('\n')[3].split()[1]
            return latency
    else:
        if instrNode.attrib['iclass'] in ['CALL_NEAR', 'CALL_NEAR_MEMv', 'CLZERO', 'JMP', 'JMP_MEMv', 'MOVDIR64B', 'RET_NEAR', 'RET_NEAR_IMMw', 'RDMSR', 'WRMSR',
                                          'RDPMC', 'CPUID', 'POPF', 'POPFQ']:
            return None
        if 'XSAVE' in instrNode.attrib['iclass']:
            return None
        if 'REP' in instrNode.attrib['iclass']:
            return None
        if instrNode.attrib['category'] in ['IO', 'IOSTRINGOP', 'PKU']:
            return None

        inputOpnds = []
        outputOpnds = []

        for opNode in instrNode.iter('operand'):
            if opNode.attrib['type'] == 'flags' and not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF):
                continue

            if opNode.attrib.get('r', '0') == '1':
                inputOpnds.append(opNode)
            if opNode.attrib.get('w', '0') == '1':
                outputOpnds.append(opNode)
                # some write-only operands are nevertheless treated as inputs
                if opNode.attrib.get('r', '0') == '0':
                    if opNode.attrib['type'] == 'mem':
                        if 'moffs' not in opNode.attrib:
                            inputOpnds.append(opNode) # address of memory write
                    elif opNode.attrib.get('conditionalWrite', '0') == '1':
                        inputOpnds.append(opNode)
                    elif opNode.attrib['type'] == 'reg':
                        if opNode.attrib.get('width', '') in ['8', '16'] and opNode.text.split(',')[0] in GPRegs:
                            inputOpnds.append(opNode)
                        elif instrNode.attrib['iclass'] in ['POPCNT', 'LZCNT', 'TZCNT']:
                            # these instructions have a false dependency on the first operand on some microarchitectures;
                            # see also https://stackoverflow.com/questions/21390165/why-does-breaking-the-output-dependency-of-lzcnt-matter
                            inputOpnds.append(opNode)

        archNode = instrNode.find('./architecture[@name="' + arch + '"]')
        measurementNode = archNode.find('./measurement')

        overallMaxLat = 0

        htmlHead = []
        htmlBottom = []

        for opNode1 in inputOpnds:
            opNode1Idx = int(opNode1.attrib['idx'])

            for opNode2 in outputOpnds:
                opNode2Idx = int(opNode2.attrib['idx'])
                latencyNode = None

                # for memory operands, the address and the memory content are measured separately
                addrMemList = ['']
                if opNode1.attrib['type'] == 'mem':
                    if 'moffs' not in opNode1.attrib:
                        addrMemList = ['addr']
                        if 'VSIB' in opNode1.attrib:
                            addrMemList.append('addr_VSIB')
                        elif (opNode1.attrib.get('suppressed', '') != '1') or ('index' in opNode1.attrib):
                            addrMemList.append('addr_index')
                    addrMemList.append('mem') # mem added last; order is relevant for html output
                elif opNode1.attrib['type'] == 'agen':
                    addrMemList = []
                    if 'B' in instrNode.attrib['agen']:
                        addrMemList.append('addr')
                    if 'I' in instrNode.attrib['agen']:
                        addrMemList.append('addr_index')

                for addrMem in addrMemList:
                    minLatDistinctRegs = 0
                    maxLatDistinctRegs = 0

                    configI = 0
                    for useDistinctRegs in ([True, False] if instrNode in tpDictSameReg else [True]):
                        latConfigLists = getLatConfigLists(instrNode, opNode1, opNode2, useDistinctRegs, addrMem, tpDict)
                        if latConfigLists is None: continue

                        minLat = sys.maxsize
                        maxLat = 0

                        minLatIsUpperBound = False
                        maxLatIsUpperBound = False

                        configHtmlReports = []

                        for latConfigList in latConfigLists:
                            minLatForCurList = sys.maxsize

                            if not any((latConfig.init or latConfig.instrI.regMemInit) for latConfig in latConfigList.latConfigs):
                                # Test different register values for read-only registers
                                for readOnlyRegOpNode in instrNode.findall('./operand[@type="reg"][@r="1"]'):
                                    if readOnlyRegOpNode == opNode1: continue
                                    if readOnlyRegOpNode.attrib.get('w', '') == '1': continue
                                    readOnlyRegOpNodeIdx = int(readOnlyRegOpNode.attrib['idx'])
                                    for latConfig in list(latConfigList.latConfigs):
                                        if not readOnlyRegOpNodeIdx in latConfig.instrI.opRegDict:
                                            print('readOnlyRegOpNodeIdx not found in opRegDict')
                                            continue
                                        reg = latConfig.instrI.opRegDict[readOnlyRegOpNodeIdx]
                                        if (not reg in GPRegs) or (reg in High8Regs) or (reg in globalDoNotWriteRegs|specialRegs|memRegs): continue
                                        if any((opNode is not None) for opNode in instrNode.findall('./operand[@type="reg"][@w="1"]')
                                               if regTo64(latConfig.instrI.opRegDict[int(opNode.attrib['idx'])]) == regTo64(reg)): continue

                                        # replace the configuration by one copy per tested register value
                                        latConfigList.latConfigs.remove(latConfig)
                                        for regVal in ['0', '1', '2']:
                                            newlatConfig = copy.deepcopy(latConfig)
                                            newlatConfig.init += ['MOV ' + reg + ', ' + regVal]
                                            newlatConfig.notes.append('with ' + reg + '=' + regVal)
                                            latConfigList.latConfigs.append(newlatConfig)

                            # some SSE/AVX instr. (e.g., VORPS (on SKL, CLX), VAESDEC) incur a penalty (?) if a source was not written by an instr. of a similar kind,
                            # some other instructions (e.g., VPDPWSSD on ICL) incur a penalty if the source was written by an instr. of the same kind;
                            # therefore, we create configurations for both scenarios
                            if (isSSEInstr(instrNode) or isAVXInstr(instrNode)) and not isDivOrSqrtInstr(instrNode):
                                for latConfig in list(latConfigList.latConfigs):
                                    regInit = []
                                    for opNode in instrNode.findall('./operand[@r="1"][@type="reg"]'):
                                        reg = latConfig.instrI.opRegDict[int(opNode.attrib['idx'])]
                                        regPrefix = re.sub(r'\d', '', reg)
                                        if (regPrefix in ['XMM', 'YMM', 'ZMM']) and (reg not in globalDoNotWriteRegs|memRegs):
                                            for initOp in instrNode.findall('./operand[@w="1"][@type="reg"]'):
                                                if initOp.text != opNode.text: continue
                                                regInit += [getInstrInstanceFromNode(instrNode, opRegDict={int(initOp.attrib['idx']):reg}, computeRegMemInit=False).asm]
                                                break
                                    if regInit:
                                        newlatConfig = copy.deepcopy(latConfig)
                                        newlatConfig.instrI.regMemInit.extend(regInit)
                                        newlatConfig.notes.append('source registers initialized by an instruction of the same kind')
                                        latConfigList.latConfigs.append(newlatConfig)

                            # Create a copy of each experiment with dependency-breaking instructions for all dependencies other than the dependency from opNode2 to
                            # opNode1 if there aren't sufficiently many fill instructions in the chain
                            if (not isDivOrSqrtInstr(instrNode) and not 'GATHER' in instrNode.attrib['category'] and not 'SCATTER' in instrNode.attrib['category']):
                                for latConfig in list(latConfigList.latConfigs):
                                    if not isAVXInstr(instrNode) and latConfig.chainLatency > tpDict[instrNode].TP_single:
                                        continue

                                    depBreakingInstrs = getDependencyBreakingInstrs(instrNode, latConfig.instrI.opRegDict)
                                    if not depBreakingInstrs: continue

                                    newlatConfig = copy.deepcopy(latConfig)
                                    depBreakingAdded = False
                                    for depOpNode in depBreakingInstrs:
                                        depOpNodeIdx = int(depOpNode.attrib['idx'])
                                        if (depOpNodeIdx in latConfig.instrI.opRegDict and opNode1Idx in latConfig.instrI.opRegDict
                                                and latConfig.instrI.opRegDict[depOpNodeIdx] == latConfig.instrI.opRegDict[opNode1Idx]):
                                            continue
                                        elif depOpNode == opNode1 and opNode1 == opNode2:
                                            continue
                                        elif opNode1.attrib['type'] == 'flags' and depOpNode.attrib['type'] == 'flags':
                                            continue
                                        else:
                                            if not latConfig.chainInstrs.endswith(depBreakingInstrs[depOpNode]):
                                                # NOTE(review): every iteration starts again from latConfig.chainInstrs, so if several
                                                # dependency-breaking instructions apply, only the last one ends up in the new chain;
                                                # confirm whether they were meant to accumulate
                                                newlatConfig.chainInstrs = latConfig.chainInstrs + ';' + depBreakingInstrs[depOpNode]
                                                depBreakingAdded = True
                                    if depBreakingAdded:
                                        latConfigList.latConfigs.remove(latConfig)
                                        latConfigList.latConfigs.append(latConfig) # order ...
                                        newlatConfig.notes.append('with dependency-breaking instructions')
                                        latConfigList.latConfigs.append(newlatConfig)

                            # make sure that the mask for gather/scatter instruction is never empty
                            if instrNode.attrib['extension'] == 'AVX2GATHER':
                                for latConfig in latConfigList.latConfigs:
                                    maskReg = latConfig.instrI.opRegDict[3]
                                    if opNode1Idx == 3:
                                        latConfig.chainInstrs += 'VPCMPEQD {0}, {0}, {0};'.format(maskReg[0:3] + '13')
                                        if 'VSHUFPD' in latConfig.chainInstrs:
                                            orInstr = 'VORPD'
                                        else:
                                            orInstr = 'VPOR'
                                        latConfig.chainInstrs += '{0} {1}, {1}, {2};'.format(orInstr, maskReg, maskReg[0:3] + '13')
                                        latConfig.chainLatency += basicLatency[orInstr]
                                    else:
                                        latConfig.chainInstrs += 'VPCMPEQD {0}, {0}, {0};'.format(maskReg)
                            elif instrNode.attrib['extension'] == 'AVX512EVEX' and ('GATHER' in instrNode.attrib['category'] or 'SCATTER' in instrNode.attrib['category']):
                                for latConfig in latConfigList.latConfigs:
                                    maskReg = latConfig.instrI.opRegDict[2]
                                    if opNode1Idx == 2:
                                        # ToDo
                                        pass
                                    else:
                                        latConfig.chainInstrs += 'VPCMPD {0}, {1}, {1}, 7;'.format(maskReg, 'XMM15')

                            for latConfig in latConfigList.latConfigs:
                                configI += 1
                                configHtmlReports.append('<h3>Experiment ' + str(configI))
                                if latConfig.notes or not useDistinctRegs:
                                    configHtmlReports.append(' (' + ', '.join(latConfig.notes +
                                                             (['with the same register for different operands'] if not useDistinctRegs else [])) + ')')
                                configHtmlReports.append('</h3>\n')

                                configHtmlReports.append('<ul>\n')
                                configHtmlReports.append('<li>Instruction: <code>' + latConfig.instrI.asm + '</code></li>\n')
                                if latConfig.chainInstrs:
                                    chainIStr = latConfig.chainInstrs.strip(';')
                                    configHtmlReports.append('<li>Chain instruction' + ('s' if ';' in chainIStr else '') + ': <code>' + chainIStr + '</code></li>\n')
                                    if latConfig.chainLatency:
                                        configHtmlReports.append('<li>Chain latency: ' + ('≥' if latConfigList.isUpperBound else '') + str(latConfig.chainLatency) + '</li>\n')

                                init = latConfig.instrI.regMemInit + latConfig.init
                                measurementResult = runExperiment(instrNode, latConfig.instrI.asm + ';' + latConfig.chainInstrs, init=init,
                                                                  basicMode=latConfig.basicMode, htmlReports=configHtmlReports, unrollCount=100)
                                configHtmlReports.append('</ul>\n')

                                if not measurementResult:
                                    print('no result found')
                                    continue

                                cycles = measurementResult['Core cycles']

                                cycles = int(cycles+.2)

                                if latConfig.chainLatency:
                                    cycles -= latConfig.chainLatency

                                cycles = max(0, cycles) # for dep. breaking instructions (like XOR), cycles might be negative after subtracting chainLatency

                                minLatForCurList = min(minLatForCurList, cycles)

                            if minLatForCurList == sys.maxsize:
                                # none of the experiments of this list produced a result; the sentinel value must not
                                # be recorded as a latency
                                continue

                            if minLatForCurList < minLat:
                                minLat = minLatForCurList
                                minLatIsUpperBound = latConfigList.isUpperBound

                            if minLatForCurList > maxLat:
                                maxLat = minLatForCurList
                                maxLatIsUpperBound = latConfigList.isUpperBound

                        # still at the initial values: there was no usable result for this configuration
                        if minLat > maxLat: continue

                        if useDistinctRegs:
                            minLatDistinctRegs = minLat
                            maxLatDistinctRegs = maxLat
                        else:
                            if minLatDistinctRegs == minLat and maxLatDistinctRegs == maxLat:
                                # same result as with distinct registers: only keep the experiment details
                                htmlBottom.append('<div style="margin-left: 50px">')
                                htmlBottom += configHtmlReports
                                htmlBottom.append('</div>')
                                continue

                        overallMaxLat = max(overallMaxLat, maxLat)

                        if latencyNode is None:
                            latencyNode = SubElement(measurementNode, 'latency')
                            latencyNode.attrib['start_op'] = str(opNode1.attrib['idx'])
                            latencyNode.attrib['target_op'] = str(opNode2.attrib['idx'])

                        suffix = ('_'+addrMem.replace('VSIB', 'index') if addrMem else '') + ('_same_reg' if not useDistinctRegs else '')
                        if minLat == maxLat:
                            latencyNode.attrib['cycles'+suffix] = str(minLat)
                            if minLatIsUpperBound:
                                latencyNode.attrib['cycles'+suffix+'_is_upper_bound'] = '1'
                        else:
                            latencyNode.attrib['min_cycles'+suffix] = str(minLat)
                            if minLatIsUpperBound:
                                latencyNode.attrib['min_cycles'+suffix+'_is_upper_bound'] = '1'
                            latencyNode.attrib['max_cycles'+suffix] = str(maxLat)
                            if maxLatIsUpperBound:
                                latencyNode.attrib['max_cycles'+suffix+'_is_upper_bound'] = '1'

                        summaryLine = latencyNodeToStr(latencyNode, not useDistinctRegs, addrMem.replace('VSIB', 'index'))

                        h2ID = 'lat' + str(opNode1Idx) + '->' + str(opNode2Idx) + suffix
                        htmlHead.append('<a href="#' + h2ID + '"><h3>' + summaryLine + '</h3></a>')
                        if useDistinctRegs: htmlBottom.append('<hr>')
                        htmlBottom.append('<h2 id="' + h2ID + '">' + summaryLine + '</h2>')
                        htmlBottom.append('<div style="margin-left: 50px">')
                        htmlBottom += configHtmlReports
                        htmlBottom.append('</div>')

        addHTMLCodeForOperands(instrNode, htmlReports)
        htmlReports.append('<hr>')
        htmlReports += htmlHead
        htmlReports += htmlBottom

        return overallMaxLat
|
|
|
|
|
|
def isSSEInstr(instrNode):
    """Return True if instrNode is classified as an SSE instruction.

    An instruction counts as SSE when its 'extension' attribute contains
    'SSE', or when its 'string' attribute mentions 'XMM' and isAVXInstr()
    does not classify it as an AVX instruction.
    """
    if 'SSE' in instrNode.attrib['extension']:
        return True
    if 'XMM' not in instrNode.attrib['string']:
        return False
    # XMM operands are shared by SSE and AVX encodings; only the non-AVX ones are SSE
    return not isAVXInstr(instrNode)
|
|
|
|
def isAVXInstr(instrNode):
    """Return True if instrNode carries a 'vex' or an 'evex' attribute."""
    attributes = instrNode.attrib
    return any(encoding in attributes for encoding in ('vex', 'evex'))
|
|
|
|
def isDivOrSqrtInstr(instrNode):
    """Return True if the 'iclass' attribute of instrNode contains 'DIV' or 'SQRT'."""
    iclass = instrNode.attrib['iclass']
    return any(mnemonicPart in iclass for mnemonicPart in ('DIV', 'SQRT'))
|
|
|
|
def isBranchInstr(instrNode):
    """Return True if instrNode has a written register operand whose text is 'RIP'.

    Only direct 'operand' children with type="reg" and w="1" are considered.
    """
    for writtenRegOp in instrNode.findall('./operand[@type="reg"][@w="1"]'):
        if writtenRegOp.text == 'RIP':
            return True
    return False
|
|
|
|
def writeHtmlFile(folder, instrNode, title, body):
    """Write an HTML report for instrNode to /tmp/cpu-html/<folder>/.

    The file name is the canonicalized instruction string, followed by
    '-IACA<version>' when IACA is used (global useIACA) or '-Measurements'
    otherwise, and the '.html' extension.

    folder:    sub-directory below /tmp/cpu-html/ (created if missing)
    instrNode: XML instruction node; its 'string' attribute names the file
    title:     text placed in the <title> element (inserted verbatim)
    body:      HTML placed in the <body> element (inserted verbatim)
    """
    filename = canonicalizeInstrString(instrNode.attrib['string'])
    filename += ('-IACA' + iacaVersion) if useIACA else '-Measurements'
    filename += '.html'

    folder = '/tmp/cpu-html/' + folder
    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(folder, exist_ok=True)

    htmlFilename = os.path.join(folder, filename)
    with open(htmlFilename, "w") as f:
        f.write('<html>\n'
                '<head>\n'
                '<title>' + title + '</title>\n'
                '</head>\n'
                '<body>\n'
                + body +
                '</body>\n'
                '</html>\n')
|
|
|
|
|
|
def filterInstructions(XMLRoot):
    """Return the list of XML instruction nodes that should be benchmarked.

    All 'instruction' nodes below XMLRoot are collected; nodes are then
    discarded in two passes:

    1. Static filters: extensions that are not supported by the assembler or
       by any available CPU and, when IACA is used (global useIACA),
       extensions that are filtered out for IACA runs. With IACA, the
       function returns after this pass.
    2. CPUID-based filters: instructions whose feature bit is not reported by
       the CPU this script runs on, followed by instruction groups that are
       always excluded (virtualization, SMX, SGX, RTM, WAITPKG, x87, system
       and undefined instructions).

    Side effect: the global supportsAVX is set to True if at least one
    instruction with extension 'AVX' is seen and CPUID reports AVX support.

    The returned list is built from a set, so its order is unspecified.
    """
    global supportsAVX

    allInstrs = list(XMLRoot.iter('instruction'))

    instrSet = set(allInstrs)
    for XMLInstr in allInstrs:
        extension = XMLInstr.attrib['extension']
        isaSet = XMLInstr.attrib['isa-set']

        # Future instruction set extensions
        if extension in ['AMD_INVLPGB', 'CET', 'RDPRU', 'TDX', 'TSX_LDTRK']: instrSet.discard(XMLInstr)

        # Not supported by assembler
        if XMLInstr.attrib['iclass'] == 'NOP' and len(XMLInstr.findall('operand')) > 1:
            instrSet.discard(XMLInstr)
        if extension in ['MCOMMIT', 'WBNOINVD']: instrSet.discard(XMLInstr)

        # Not supported any more by gnu as 2.36.1
        if (isaSet == 'MPX') and ('R' in XMLInstr.attrib.get('agen', '')):
            instrSet.discard(XMLInstr)

        # Only supported by VIA
        if 'VIA_' in extension:
            instrSet.discard(XMLInstr)

        # "no CPU available today has PTWRITE support" (https://software.intel.com/en-us/forums/intel-isa-extensions/topic/704356)
        if extension in ['PTWRITE']:
            instrSet.discard(XMLInstr)

        if useIACA:
            if extension in ['AVX512VEX', 'AVX512EVEX'] and arch != 'SKX': instrSet.discard(XMLInstr)
            # AMD
            if extension in ['3DNOW', 'CLZERO', 'FMA4', 'MONITORX', 'SSE4a', 'SVM', 'TBM', 'XOP']: instrSet.discard(XMLInstr)
            # Future instruction set extensions
            if extension in ['CLDEMOTE', 'ENQCMD', 'HRESET', 'MOVDIR', 'PCONFIG', 'SERIALIZE', 'SNP', 'UINTR', 'WAITPKG']: instrSet.discard(XMLInstr)
            if extension in ['AVX512EVEX'] and any(x in isaSet for x in ['4FMAPS', '4VNNIW', 'ER', 'PF']): instrSet.discard(XMLInstr)
            if any(x in isaSet for x in ['AMX', 'BF16', 'BITALG', 'GFNI', 'VAES', 'VBMI2', 'VNNI', 'VP2INTERSECT', 'VPCLMULQDQ', 'VPOPCNTDQ']):
                instrSet.discard(XMLInstr)

    # IACA results do not depend on the features of the CPU this script runs on
    if useIACA: return list(instrSet)

    cpu = cpuid.CPUID()

    _, _, ecx1, edx1 = cpu(0x01)
    _, ebx7, ecx7, edx7 = cpu(0x07)
    eax7_1, ebx7_1, ecx7_1, edx7_1 = cpu(0x07, 0x01)
    eaxD_1, _, _, _ = cpu(0x0D, 0x01)
    _, ebx19, _, _ = cpu(0x19)
    eax1E_1, _, _, _ = cpu(0x1E, 0x01)
    _, _, ecx8_1, edx8_1 = cpu(0x80000001)
    _, ebx8_8, _, _ = cpu(0x80000008)

    # AVX10: version and supported vector lengths do not depend on the instruction, so they are queried once
    avx10Enabled = cpuid.get_bit(edx7_1, 19)
    avx10Version = -1
    avx10VectorLengths = set()
    if avx10Enabled:
        _, ebx24, _, _ = cpu(0x24)
        avx10Version = cpuid.get_bytes(ebx24)[0]
        if cpuid.get_bit(ebx24, 16):
            avx10VectorLengths.add(128)
        if cpuid.get_bit(ebx24, 17):
            avx10VectorLengths.add(256)
        if cpuid.get_bit(ebx24, 18):
            avx10VectorLengths.add(512)

    for XMLInstr in allInstrs:
        iclass = XMLInstr.attrib['iclass']
        extension = XMLInstr.attrib['extension']
        isaSet = XMLInstr.attrib['isa-set']
        category = XMLInstr.attrib['category']

        # CPUID leaf 0x01
        if extension == 'SSE3' and not cpuid.get_bit(ecx1, 0): instrSet.discard(XMLInstr)
        if extension == 'PCLMULQDQ' and not cpuid.get_bit(ecx1, 1): instrSet.discard(XMLInstr)
        if extension == 'SSSE3' and not cpuid.get_bit(ecx1, 9): instrSet.discard(XMLInstr)
        if extension == 'FMA' and not cpuid.get_bit(ecx1, 12): instrSet.discard(XMLInstr)
        if extension == 'SSE4' and not cpuid.get_bit(ecx1, 19): instrSet.discard(XMLInstr)
        if isaSet == 'SSE42' and not cpuid.get_bit(ecx1, 20): instrSet.discard(XMLInstr)
        if extension == 'MOVBE' and not cpuid.get_bit(ecx1, 22): instrSet.discard(XMLInstr)
        if isaSet == 'POPCNT' and not cpuid.get_bit(ecx1, 23): instrSet.discard(XMLInstr)
        if extension == 'AES' and not cpuid.get_bit(ecx1, 25): instrSet.discard(XMLInstr)
        if extension == 'AVX':
            if not cpuid.get_bit(ecx1, 28):
                instrSet.discard(XMLInstr)
            else:
                supportsAVX = True
        if extension == 'AVXAES' and not (cpuid.get_bit(ecx1, 25) and cpuid.get_bit(ecx1, 28)): instrSet.discard(XMLInstr)
        if extension in ['XSAVE', 'XSAVEC', 'XSAVEOPT', 'XSAVES'] and not cpuid.get_bit(ecx1, 27): instrSet.discard(XMLInstr)
        if extension == 'F16C' and not cpuid.get_bit(ecx1, 29): instrSet.discard(XMLInstr)
        if extension == 'RDRAND' and not cpuid.get_bit(ecx1, 30): instrSet.discard(XMLInstr)
        if extension == 'MMX' and not cpuid.get_bit(edx1, 23): instrSet.discard(XMLInstr)
        if extension == 'SSE' and not cpuid.get_bit(edx1, 25): instrSet.discard(XMLInstr)
        if extension == 'SSE2' and not cpuid.get_bit(edx1, 26): instrSet.discard(XMLInstr)

        # CPUID leaf 0x07, sub-leaf 0: EBX
        if extension == 'BMI1' and not cpuid.get_bit(ebx7, 3): instrSet.discard(XMLInstr)
        if extension in ['AVX2', 'AVX2GATHER'] and not cpuid.get_bit(ebx7, 5): instrSet.discard(XMLInstr)
        if extension == 'BMI2' and not cpuid.get_bit(ebx7, 8): instrSet.discard(XMLInstr)
        # NOTE(review): leaf 0x07 EBX bit 9 is documented as ERMS, while WBNOINVD is reported in leaf 0x80000008 EBX bit 9;
        # this has no effect as long as WBNOINVD is discarded unconditionally in the first pass - confirm before relying on it
        if extension == 'WBNOINVD' and not cpuid.get_bit(ebx7, 9): instrSet.discard(XMLInstr)
        if extension == 'MPX' and not cpuid.get_bit(ebx7, 14): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512F') and not cpuid.get_bit(ebx7, 16): instrSet.discard(XMLInstr)
        # AVX512DQ is enumerated by EBX bit 17 (bit 16 is AVX512F)
        if isaSet.startswith('AVX512DQ') and not cpuid.get_bit(ebx7, 17): instrSet.discard(XMLInstr)
        if extension == 'RDSEED' and not cpuid.get_bit(ebx7, 18): instrSet.discard(XMLInstr)
        if extension == 'ADOX_ADCX' and not cpuid.get_bit(ebx7, 19): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_IFMA') and not cpuid.get_bit(ebx7, 21): instrSet.discard(XMLInstr)
        if extension == 'CLFLUSHOPT' and not cpuid.get_bit(ebx7, 23): instrSet.discard(XMLInstr)
        if extension == 'CLWB' and not cpuid.get_bit(ebx7, 24): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512PF') and not cpuid.get_bit(ebx7, 26): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512ER') and not cpuid.get_bit(ebx7, 27): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512CD') and not cpuid.get_bit(ebx7, 28): instrSet.discard(XMLInstr)
        if extension == 'SHA' and not cpuid.get_bit(ebx7, 29): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512BW') and not cpuid.get_bit(ebx7, 30): instrSet.discard(XMLInstr)

        # CPUID leaf 0x07, sub-leaf 0: ECX
        if extension == 'PREFETCHWT1' and not cpuid.get_bit(ecx7, 0): instrSet.discard(XMLInstr)
        if category == 'AVX512_VBMI' and not cpuid.get_bit(ecx7, 1): instrSet.discard(XMLInstr)
        if extension == 'PKU' and not cpuid.get_bit(ecx7, 4): instrSet.discard(XMLInstr)
        if extension == 'WAITPKG' and not cpuid.get_bit(ecx7, 5): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_VBMI2') and not cpuid.get_bit(ecx7, 6): instrSet.discard(XMLInstr)
        if category == 'GFNI':
            if not cpuid.get_bit(ecx7, 8):
                instrSet.discard(XMLInstr)
            elif ('AVX' in isaSet) and (not cpuid.get_bit(ecx1, 28)):
                instrSet.discard(XMLInstr)
            elif ('AVX512' in isaSet) and (not cpuid.get_bit(ebx7, 31)):
                instrSet.discard(XMLInstr)
        if 'VAES' in isaSet:
            if not cpuid.get_bit(ecx7, 9):
                instrSet.discard(XMLInstr)
            elif 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31):
                instrSet.discard(XMLInstr)
        if 'VPCLMULQDQ' in isaSet:
            if not cpuid.get_bit(ecx7, 10):
                instrSet.discard(XMLInstr)
            elif 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31):
                instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_VNNI') and not cpuid.get_bit(ecx7, 11): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_BITALG') and not cpuid.get_bit(ecx7, 12): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_VPOPCNTDQ') and not cpuid.get_bit(ecx7, 14): instrSet.discard(XMLInstr)
        if extension == 'RDPID' and not cpuid.get_bit(ecx7, 22): instrSet.discard(XMLInstr)
        if extension.startswith('KEYLOCKER'):
            if not cpuid.get_bit(ecx7, 23) or not cpuid.get_bit(ebx19, 0): instrSet.discard(XMLInstr)
            if 'WIDE' in extension and not cpuid.get_bit(ebx19, 2): instrSet.discard(XMLInstr)
        if extension == 'CLDEMOTE' and not cpuid.get_bit(ecx7, 25): instrSet.discard(XMLInstr)
        if iclass == 'MOVDIRI' and not cpuid.get_bit(ecx7, 27): instrSet.discard(XMLInstr)
        if iclass == 'MOVDIR64B' and not cpuid.get_bit(ecx7, 28): instrSet.discard(XMLInstr)
        if extension == 'ENQCMD' and not cpuid.get_bit(ecx7, 29): instrSet.discard(XMLInstr)

        # CPUID leaf 0x07, sub-leaf 0: EDX
        if isaSet.startswith('AVX512_4VNNI') and not cpuid.get_bit(edx7, 2): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_4FMAPS') and not cpuid.get_bit(edx7, 3): instrSet.discard(XMLInstr)
        if extension == 'UINTR' and not cpuid.get_bit(edx7, 5): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_VP2INTERSECT') and not cpuid.get_bit(edx7, 8): instrSet.discard(XMLInstr)
        if extension == 'SERIALIZE' and not cpuid.get_bit(edx7, 14): instrSet.discard(XMLInstr)
        if extension == 'PCONFIG' and not cpuid.get_bit(edx7, 18): instrSet.discard(XMLInstr)
        if extension == 'AMX_BF16' and not cpuid.get_bit(edx7, 22): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_FP16') and not cpuid.get_bit(edx7, 23): instrSet.discard(XMLInstr)
        if extension == 'AMX_TILE' and not cpuid.get_bit(edx7, 24): instrSet.discard(XMLInstr)
        if extension == 'AMX_INT8' and not cpuid.get_bit(edx7, 25): instrSet.discard(XMLInstr)

        # CPUID leaf 0x07, sub-leaf 1
        if extension == 'SHA512' and not cpuid.get_bit(eax7_1, 0): instrSet.discard(XMLInstr)
        if extension == 'SM3' and not cpuid.get_bit(eax7_1, 1): instrSet.discard(XMLInstr)
        if extension == 'SM4' and not cpuid.get_bit(eax7_1, 2): instrSet.discard(XMLInstr)
        if extension == 'RAO_INT' and not cpuid.get_bit(eax7_1, 3): instrSet.discard(XMLInstr)
        if extension == 'AVX_VNNI' and not cpuid.get_bit(eax7_1, 4): instrSet.discard(XMLInstr)
        if isaSet.startswith('AVX512_BF16') and not cpuid.get_bit(eax7_1, 5): instrSet.discard(XMLInstr)
        if extension == 'CMPCCXADD' and not cpuid.get_bit(eax7_1, 7): instrSet.discard(XMLInstr)
        if extension == 'FRED' and not cpuid.get_bit(eax7_1, 17): instrSet.discard(XMLInstr)
        if extension == 'LKGS' and not cpuid.get_bit(eax7_1, 18): instrSet.discard(XMLInstr)
        if extension == 'WRMSRNS' and not cpuid.get_bit(eax7_1, 19): instrSet.discard(XMLInstr)
        if extension == 'AMX_FP16' and not cpuid.get_bit(eax7_1, 21): instrSet.discard(XMLInstr)
        if extension == 'HRESET' and not cpuid.get_bit(eax7_1, 22): instrSet.discard(XMLInstr)
        if extension == 'AVX_IFMA' and not cpuid.get_bit(eax7_1, 23): instrSet.discard(XMLInstr)
        if extension == 'MSRLIST' and not cpuid.get_bit(eax7_1, 27): instrSet.discard(XMLInstr)
        if ('MOVRS' in isaSet) and ('AMX' not in isaSet) and not cpuid.get_bit(eax7_1, 31): instrSet.discard(XMLInstr)
        if extension == 'PBNDKB' and not cpuid.get_bit(ebx7_1, 1): instrSet.discard(XMLInstr)
        if extension == 'MSR_IMM' and not cpuid.get_bit(ecx7_1, 5): instrSet.discard(XMLInstr)
        if extension == 'AVX_VNNI_INT8' and not cpuid.get_bit(edx7_1, 4): instrSet.discard(XMLInstr)
        if extension == 'AVX_NE_CONVERT' and not cpuid.get_bit(edx7_1, 5): instrSet.discard(XMLInstr)
        if isaSet == 'AMX_COMPLEX' and not cpuid.get_bit(edx7_1, 8): instrSet.discard(XMLInstr)
        if extension == 'AVX_VNNI_INT16' and not cpuid.get_bit(edx7_1, 10): instrSet.discard(XMLInstr)
        if extension == 'ICACHE_PREFETCH' and not cpuid.get_bit(edx7_1, 14): instrSet.discard(XMLInstr)
        if extension == 'USER_MSR' and not cpuid.get_bit(edx7_1, 15): instrSet.discard(XMLInstr)
        if (extension in ['APXEVEX', 'APXLEGACY']) and not cpuid.get_bit(edx7_1, 21): instrSet.discard(XMLInstr)

        # CPUID leaf 0x0D, sub-leaf 1
        if extension == 'XSAVEOPT' and not cpuid.get_bit(eaxD_1, 0): instrSet.discard(XMLInstr)
        if extension == 'XSAVEC' and not cpuid.get_bit(eaxD_1, 1): instrSet.discard(XMLInstr)
        if extension == 'XSAVES' and not cpuid.get_bit(eaxD_1, 3): instrSet.discard(XMLInstr)

        # CPUID leaf 0x1E, sub-leaf 1
        if isaSet.startswith('AMX_'):
            if '_FP8' in isaSet and not cpuid.get_bit(eax1E_1, 4): instrSet.discard(XMLInstr)
            if '_TRANSPOSE' in isaSet and not cpuid.get_bit(eax1E_1, 5): instrSet.discard(XMLInstr)
            if '_TF32' in isaSet and not cpuid.get_bit(eax1E_1, 6): instrSet.discard(XMLInstr)
            if '_MOVRS' in isaSet and not cpuid.get_bit(eax1E_1, 8): instrSet.discard(XMLInstr)
            if '_TRANSPOSE_COMPLEX' in isaSet and not cpuid.get_bit(eax1E_1, 2): instrSet.discard(XMLInstr)
            if '_TRANSPOSE_BF16' in isaSet and not cpuid.get_bit(eax1E_1, 1): instrSet.discard(XMLInstr)
            if '_TRANSPOSE_FP16' in isaSet and not cpuid.get_bit(eax1E_1, 3): instrSet.discard(XMLInstr)

        # CPUID leaves 0x80000001 and 0x80000008
        if extension == 'SSE4a' and not cpuid.get_bit(ecx8_1, 6): instrSet.discard(XMLInstr)
        if extension == 'XOP' and not cpuid.get_bit(ecx8_1, 11): instrSet.discard(XMLInstr)
        if extension == 'FMA4' and not cpuid.get_bit(ecx8_1, 16): instrSet.discard(XMLInstr)
        if extension == 'TBM' and not cpuid.get_bit(ecx8_1, 21): instrSet.discard(XMLInstr)
        if extension == 'RDTSCP' and not cpuid.get_bit(edx8_1, 27): instrSet.discard(XMLInstr)
        if extension == '3DNOW' and not cpuid.get_bit(edx8_1, 31): instrSet.discard(XMLInstr)
        if extension == 'CLZERO' and not cpuid.get_bit(ebx8_8, 0): instrSet.discard(XMLInstr)
        #if extension == 'MCOMMIT' and not cpuid.get_bit(ebx8_8, 8): instrSet.discard(XMLInstr)

        # AVX10
        # NOTE(review): the vector-length checks are assumed to apply only when AVX10 is enabled (otherwise every
        # *_128/_256/_512 isa-set would be dropped on CPUs without AVX10) - confirm against the original indentation
        if avx10Enabled:
            if isaSet.endswith('_128') and 128 not in avx10VectorLengths:
                instrSet.discard(XMLInstr)
            if isaSet.endswith('_256') and 256 not in avx10VectorLengths:
                instrSet.discard(XMLInstr)
            if isaSet.endswith('_512') and 512 not in avx10VectorLengths:
                instrSet.discard(XMLInstr)

        if avx10Version < 2 and (isaSet in ['AVX512_COM_EF_SCALAR']
                                 or any(isaSet.startswith(x) for x in ['AVX10_2_', 'AVX512_FP8_CONVERT_', 'AVX512_FP16_CONVERT_', 'AVX512_MEDIAX_', 'AVX512_MINMAX_',
                                                                       'AVX512_MOVZXC_', 'AVX512_SAT_CVT_', 'AVX512_VNNI_FP16', 'AVX512_VNNI_INT8_', 'AVX512_VNNI_INT16', 'SM4_'])):
            instrSet.discard(XMLInstr)

        if isaSet == 'AMX_AVX512' and ((avx10Version < 2) or (512 not in avx10VectorLengths) or not cpuid.get_bit(eax1E_1, 7)):
            instrSet.discard(XMLInstr)

        # Virtualization instructions
        if extension in ['SVM', 'VMFUNC', 'VTX']: instrSet.discard(XMLInstr)

        # Safer Mode Extensions
        if extension in ['SMX']: instrSet.discard(XMLInstr)

        # Software Guard Extensions
        if extension in ['SGX', 'SGX_ENCLV']: instrSet.discard(XMLInstr)

        # Transactional Synchronization Extensions
        if extension in ['RTM']: instrSet.discard(XMLInstr)

        # WAITPKG
        if extension in ['WAITPKG']: instrSet.discard(XMLInstr)

        # X87 instructions:
        if extension in ['X87']: instrSet.discard(XMLInstr)
        if category in ['X87_ALU']: instrSet.discard(XMLInstr)

        # System instructions
        if extension in ['HRESET', 'INVPCID', 'MONITOR', 'MONITORX', 'PCONFIG', 'RDWRFSGS', 'SMAP', 'SNP', 'UINTR']: instrSet.discard(XMLInstr)
        if category in ['INTERRUPT', 'SEGOP', 'SYSCALL', 'SYSRET']: instrSet.discard(XMLInstr)
        if iclass in ['CALL_FAR', 'HLT', 'INVD', 'IRET', 'IRETD', 'IRETQ', 'JMP_FAR', 'LTR', 'RET_FAR', 'UD2']: instrSet.discard(XMLInstr)
        if 'XRSTOR' in iclass: instrSet.discard(XMLInstr)
        if XMLInstr.attrib['iform'] in ['POP_FS', 'POP_GS', 'MOV_CR_CR_GPR64', 'MOV_SEG_MEMw', 'MOV_SEG_GPR16', 'SWAPGS']: instrSet.discard(XMLInstr)

        # Undefined instructions
        if iclass.startswith('UD'): instrSet.discard(XMLInstr)

    return list(instrSet)
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='CPU Benchmarks')
|
|
parser.add_argument("-iaca", help="IACA command line; if not specified, perf. ctrs. are used")
|
|
parser.add_argument("-input", help="Instructions XML file", required=True)
|
|
parser.add_argument("-output", help="Output XML file")
|
|
parser.add_argument("-arch", help="Architecture, Supported: [NHM, ...]")
|
|
parser.add_argument("-noPretty", help="Disable pretty printing XML file", action='store_true')
|
|
parser.add_argument("-noPorts", help="Don't measure port usage", action='store_true')
|
|
parser.add_argument("-tpInput", help=".pickle file with TP data")
|
|
parser.add_argument("-latInput", help=".pickle file with latency data")
|
|
parser.add_argument("-debug", help="Debug output", action='store_true')
|
|
|
|
args = parser.parse_args()
|
|
|
|
global arch
|
|
if args.arch is not None:
|
|
arch = args.arch
|
|
else:
|
|
cpu = cpuid.CPUID()
|
|
arch = cpuid.micro_arch(cpu)
|
|
print(cpuid.get_basic_info(cpu))
|
|
if arch == 'unknown':
|
|
exit(1)
|
|
|
|
global debugOutput
|
|
debugOutput = args.debug
|
|
|
|
global useIACA
|
|
if args.iaca:
|
|
useIACA = True
|
|
|
|
try:
|
|
versionString = subprocess.check_output([args.iaca], stderr=subprocess.STDOUT)
|
|
except subprocess.CalledProcessError as e:
|
|
versionString = e.output
|
|
global iacaVersion
|
|
iacaVersion = re.search(r'\d\.\d', versionString.decode()).group(0)
|
|
global iacaCMDLine
|
|
iacaCMDLine = [args.iaca, '-reduceout', '-arch', arch]
|
|
if iacaVersion == '2.1':
|
|
iacaCMDLine.append('-64')
|
|
else:
|
|
useIACA = False
|
|
|
|
resetNanoBench()
|
|
|
|
if arch in ['ZEN+', 'ZEN2', 'ZEN3', 'ZEN4', 'ZEN5']:
|
|
configurePFCs(['UOPS','FpuPipeAssignment.Total0', 'FpuPipeAssignment.Total1', 'FpuPipeAssignment.Total2', 'FpuPipeAssignment.Total3',
|
|
'FpuPipeAssignment.Total4', 'FpuPipeAssignment.Total5', 'DIV_CYCLES'])
|
|
else:
|
|
configurePFCs(['UOPS', 'RETIRE_SLOTS', 'UOPS_MITE', 'UOPS_MS', 'UOPS_PORT_0', 'UOPS_PORT_1', 'UOPS_PORT_2', 'UOPS_PORT_3', 'UOPS_PORT_4',
|
|
'UOPS_PORT_5', 'UOPS_PORT_6', 'UOPS_PORT_7', 'UOPS_PORT_23', 'UOPS_PORT_49', 'UOPS_PORT_78', 'UOPS_PORT_5B', 'UOPS_PORT_5B>=2',
|
|
'UOPS_PORT_23A', 'DIV_CYCLES', 'ILD_STALL.LCP', 'INST_DECODED.DEC0', 'UOPS_MITE>=1'])
|
|
|
|
try:
|
|
subprocess.check_output('mkdir -p /tmp/ramdisk; sudo mount -t tmpfs -o size=100M none /tmp/ramdisk/', shell=True)
|
|
except subprocess.CalledProcessError as e:
|
|
print('Could not create ramdisk ' + e.output)
|
|
exit(1)
|
|
|
|
XMLRoot = ET.parse(args.input).getroot()
|
|
XMLRoot.attrib['date'] = str(datetime.date.today())
|
|
|
|
global instrNodeList
|
|
instrNodeList = filterInstructions(XMLRoot)
|
|
|
|
global instrNodeDict
|
|
instrNodeDict = {instrNode.attrib['string']: instrNode for instrNode in instrNodeList}
|
|
|
|
# move instructions that need a preInstr to the end, as their throughput can only be determined after the throughput of the instructions included in the
|
|
# preInstr has been measured
|
|
#instrRequiringPreInstr = []
|
|
#if not useIACA:
|
|
# instrRequiringPreInstr = [x for x in instrNodeList if isDivOrSqrtInstr(x) or getPreInstr(x)[0]]
|
|
instrNodeList.sort(key=lambda x: x.attrib['string'])
|
|
|
|
condBrInstr = [i for i in instrNodeList if i.attrib['category'] == 'COND_BR' and i.attrib['isa-set'] == 'I86' and not 'LOOP' in i.attrib['iclass']]
|
|
|
|
for instrNode in instrNodeList:
|
|
archNode = instrNode.find('./architecture[@name="' + arch + '"]')
|
|
if archNode is None:
|
|
archNode = SubElement(instrNode, "architecture")
|
|
archNode.attrib['name'] = arch
|
|
if not useIACA:
|
|
measurementNode = archNode.find('./measurement')
|
|
if measurementNode is None:
|
|
measurementNode = SubElement(archNode, "measurement")
|
|
|
|
########################
|
|
# Througput and Uops
|
|
########################
|
|
|
|
tpDict = {}
|
|
tpDictSameReg = {}
|
|
tpDictIndexedAddr = {}
|
|
tpDictNoInteriteration = {}
|
|
macroFusionDict = {}
|
|
|
|
if args.tpInput is not None:
|
|
with open(args.tpInput, 'rb') as f:
|
|
pTpDict, pTpDictSameReg, pTpDictIndexedAddr, pTpDictNoInteriteration = pickle.load(f)
|
|
tpDict = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDict.items()}
|
|
tpDictSameReg = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictSameReg.items()}
|
|
tpDictIndexedAddr = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictIndexedAddr.items()}
|
|
tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()}
|
|
else:
|
|
for i, instrNode in enumerate(instrNodeList):
|
|
#if not 'POP (R64)' in instrNode.attrib['string']: continue
|
|
print('Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')')
|
|
|
|
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
|
|
|
|
hasCommonReg = hasCommonRegister(instrNode)
|
|
if hasCommonReg: htmlReports.append('<h2 id="distinctRegs">With different registers for different operands</h2>\n')
|
|
|
|
hasExplMemOp = hasExplicitNonVSIBMemOperand(instrNode)
|
|
if hasExplMemOp: htmlReports.append('<h2 id="nonIndexedAddr">With a non-indexed addressing mode</h2>\n')
|
|
|
|
tpResult = getThroughputAndUops(instrNode, True, False, htmlReports)
|
|
print(instrNode.attrib['string'] + " - tp: " + str(tpResult))
|
|
|
|
if tpResult is None:
|
|
continue
|
|
|
|
tpDict[instrNode] = tpResult
|
|
|
|
if hasCommonReg:
|
|
htmlReports.append('<hr><h2 id="sameReg">With the same register for for different operands</h2>\n')
|
|
tpResultSR = getThroughputAndUops(instrNode, False, False, htmlReports)
|
|
if tpResultSR and (tpResult.uops != tpResultSR.uops or tpResult.fused_uops != tpResultSR.fused_uops or tpResult.uops_MITE != tpResultSR.uops_MITE
|
|
or abs(sum(tpResult.unblocked_ports.values()) - sum(tpResultSR.unblocked_ports.values())) > .8
|
|
or tpResultSR.TP_single < .95 * tpResult.TP_single):
|
|
tpDictSameReg[instrNode] = tpResultSR
|
|
|
|
if hasExplMemOp:
|
|
htmlReports.append('<hr><h2 id="indexedAddr">With an indexed addressing mode</h2>\n')
|
|
tpResultIndexed = getThroughputAndUops(instrNode, True, True, htmlReports)
|
|
if tpResultIndexed:
|
|
tpDictIndexedAddr[instrNode] = tpResultIndexed
|
|
|
|
# Macro-Fusion
|
|
if (not useIACA) and tpResult.fused_uops == 1 and (instrNode.find('./operand[@type="flags"][@w="1"]') is not None):
|
|
htmlReports.append('<hr><h2 id="macroFusion">Tests for macro-fusion</h2>\n')
|
|
fusibleInstrList = []
|
|
for brInstr in condBrInstr:
|
|
if canMacroFuse(instrNode, brInstr, htmlReports):
|
|
fusibleInstrList.append(brInstr)
|
|
if fusibleInstrList: macroFusionDict[instrNode] = fusibleInstrList
|
|
|
|
if useIACA and iacaVersion in ['2.1', '2.2']:
|
|
htmlReports.append('<hr><h2 id="noInteriteration">With the -no_interiteration flag</h2>\n')
|
|
tp = getThroughputIacaNoInteriteration(instrNode, htmlReports)
|
|
if tp: tpDictNoInteriteration[instrNode] = tp
|
|
|
|
if tpResult: writeHtmlFile('html-tp/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
|
|
with open('/tmp/tp_' + arch + '.pickle', 'wb') as f:
|
|
pickle.dump((tpDict, tpDictSameReg, tpDictIndexedAddr, tpDictNoInteriteration), f)
|
|
|
|
########################
|
|
# Latency
|
|
########################
|
|
|
|
if not useIACA:
|
|
configurePFCs(['UOPS'])
|
|
getBasicLatencies(instrNodeList)
|
|
|
|
latencyDict = {}
|
|
|
|
if args.latInput is not None:
|
|
with open(args.latInput, 'rb') as f:
|
|
latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()}
|
|
elif not useIACA or iacaVersion == '2.1':
|
|
for i, instrNode in enumerate(instrNodeList):
|
|
#if not 'DIV' in instrNode.attrib['string']: continue
|
|
print('Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')')
|
|
|
|
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
|
|
lat = getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports)
|
|
|
|
if lat is not None:
|
|
if debugOutput: print(instrNode.attrib['iform'] + ': ' + str(lat))
|
|
latencyDict[instrNode] = lat
|
|
writeHtmlFile('html-lat/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
|
|
with open('/tmp/lat_' + arch + '.pickle', 'wb') as f:
|
|
pickle.dump(latencyDict, f)
|
|
|
|
########################
|
|
# Ports
|
|
########################
|
|
|
|
if not useIACA:
|
|
configurePFCs(['UOPS'])
|
|
|
|
# the elements of this set are sets of ports that either have the same functional units, or that cannot be used independently
|
|
portCombinationsResultDict = {}
|
|
portCombinationsResultDictSameReg = {}
|
|
portCombinationsResultDictIndexedAddr = {}
|
|
|
|
if not args.noPorts:
|
|
for instr, tpResult in tpDict.items():
|
|
up = tpResult.unblocked_ports
|
|
usedPorts = tpResult.all_used_ports
|
|
if '5B>=2' in up:
|
|
if '5B' in usedPorts:
|
|
usedPorts.add('5')
|
|
if '5B>=2' not in usedPorts:
|
|
up['5'] = up['5B']
|
|
else:
|
|
up['5'] = up['B'] = up['5B'] / 2
|
|
usedPorts.add('B')
|
|
up.pop('5B', None)
|
|
up.pop('5B>=2', None)
|
|
usedPorts.discard('5B')
|
|
usedPorts.discard('5B>=2')
|
|
|
|
# iforms of instructions that are potentially zero-latency instructions
|
|
# we consider all MOVZX instructions to be potentially zero-latency instr.; the descr. in the manual is not accurate as, e.g., MOVZX RSI, CL can be
|
|
# eliminated, but MOVZX RSI, DIL cannot (at least on Coffee Lake)
|
|
zeroLatencyMovIforms = set(x.attrib['iform'] for x in instrNodeList
|
|
if x.attrib['iform'].startswith(('MOV_', 'MOVZX_', 'NOP', 'MOVUPD_', 'MOVAPD_', 'MOVUPS_', 'MOVAPS_', 'MOVDQA_', 'MOVDQU_',
|
|
'VMOVUPD_', 'VMOVAPD_', 'VMOVUPS_', 'VMOVAPS_', 'VMOVDQA_', 'VMOVDQU_'))
|
|
and len(x.findall('./operand[@type="reg"]')) >= 2 and not 'MEM' in x.attrib['iform'])
|
|
# iforms of instructions that change the control flow based on a register, flag, or memory location
|
|
branchInstrs = set(instr for instr in instrNodeList if isBranchInstr(instr))
|
|
disallowedBlockingInstrs = set(instr for instr in tpDict
|
|
if instr.attrib['iform'] in (zeroLatencyMovIforms | serializingInstructions | set(['PAUSE']))
|
|
or (instr in branchInstrs and not instr.attrib['iform'] == 'JMP_RELBRb')
|
|
or (instr.find('./operand[@base="RSP"]') is not None)
|
|
or (instr.find('./operand[@conditionalWrite="1"]') is not None)
|
|
or instr.attrib['category'] == 'SYSTEM'
|
|
or instr.attrib['extension'] == 'X87'
|
|
or '_AL_' in instr.attrib['iform'] or '_OrAX_' in instr.attrib['iform']
|
|
or instr.attrib.get('high8')
|
|
or tpDict[instr].TP_noDepBreaking_noLoop - .2 > max([uops for _, uops in tpDict[instr].unblocked_ports.items()] or [0])
|
|
or '512' in instr.attrib['isa-set']) # on SKX, some AVX-512 instructions can 'shut down' vector units on port 1
|
|
|
|
if isAMDCPU():
|
|
disallowedBlockingInstrs |= set(instr for instr in instrNodeList for op in instr.findall('./operand[@type="mem"]'))
|
|
# combining SHA instr. with other instr. leads to wrong port counts
|
|
disallowedBlockingInstrs |= set(instr for instr in instrNodeList if instr.attrib['extension'] == 'SHA')
|
|
# combining FP with non-FP instr. can lead to wrong port counts
|
|
#disallowedBlockingInstrs |= set(instr for instr in instrNodeList if instr.attrib['category'] in ['LOGICAL_FP'] or
|
|
# any(not 'f' in o.attrib.get('xtype','f') for o in instr.findall('./operand')))
|
|
if arch in ['ZEN3', 'ZEN4', 'ZEN5']:
|
|
# we need one instruction with 1*FP45;
|
|
# their throughput is limited to 1 per cycle; thus, they are disallowed by the TP_noDepBreaking_noLoop check above
|
|
disallowedBlockingInstrs.remove(instrNodeDict['MOVD (R32, XMM)'])
|
|
disallowedBlockingInstrs.remove(instrNodeDict['VMOVD (R32, XMM)'])
|
|
|
|
print('disallowedBlockingInstrs')
|
|
for instrNode in disallowedBlockingInstrs:
|
|
print(' ' + str(instrNode.attrib['string']))
|
|
|
|
print('tpDict')
|
|
for instr, tpResult in tpDict.items():
|
|
print(' ' + str(instr.attrib['string']) + ' ' + str(tpResult.unblocked_ports))
|
|
|
|
# we cannot start higher than .79 as IACA has .2 uops on each port for a port usage of, e.g., 1*p1256
|
|
# using uops_dict instead can be problematic because in IACA the uops on the individual ports do not always add up to this value
|
|
oneUopInstrs = [instr for instr, tpResult in tpDict.items() if instr not in disallowedBlockingInstrs and .79 < sum([v for v in tpResult.unblocked_ports.values() if v>.1]) < 1.11]
|
|
|
|
print('oneUopInstrs')
|
|
for instrNode in oneUopInstrs:
|
|
print(' ' + str(instrNode.attrib['string']))
|
|
# dicts from port combination to a set of instructions (either not containing AVX or SSE instructions bec. of transition penalty) that always uses these ports
|
|
blockingInstructionsDictNonAVX_set = {}
|
|
blockingInstructionsDictNonSSE_set = {}
|
|
|
|
for instrNode in oneUopInstrs:
|
|
usedPorts = frozenset(tpDict[instrNode].all_used_ports)
|
|
if usedPorts:
|
|
print(instrNode.attrib['iform'] + ': ' + str(usedPorts) + ' ' + str(len(instrNode.findall('./operand[@suppressed="1"]'))))
|
|
|
|
if not isSSEInstr(instrNode):
|
|
if not usedPorts in blockingInstructionsDictNonSSE_set: blockingInstructionsDictNonSSE_set[usedPorts] = set()
|
|
blockingInstructionsDictNonSSE_set[usedPorts].add(instrNode)
|
|
if not isAVXInstr(instrNode):
|
|
if not usedPorts in blockingInstructionsDictNonAVX_set: blockingInstructionsDictNonAVX_set[usedPorts] = set()
|
|
blockingInstructionsDictNonAVX_set[usedPorts].add(instrNode)
|
|
|
|
# choose instruction with lowest throughput value; prefer non-control flow instructions, instr. that do not need decoder 0, and instr. with as few as
|
|
# possible implicit operands that are read
|
|
sort_key = lambda x:(x in branchInstrs, tpDict[x].complexDec, len(x.findall('./operand[@suppressed="1"]')), tpDict[x].TP_noDepBreaking_noLoop, x.attrib['string'])
|
|
blockingInstructionsDictNonAVX = {comb: next(iter(sorted(instr_set, key=sort_key))) for comb, instr_set in blockingInstructionsDictNonAVX_set.items()}
|
|
blockingInstructionsDictNonSSE = {comb: next(iter(sorted(instr_set, key=sort_key))) for comb, instr_set in blockingInstructionsDictNonSSE_set.items()}
|
|
|
|
#for comb, instr_set in blockingInstructionsDictNonAVX_set.items():
|
|
# print(comb)
|
|
# print([x.attrib['string'] for x in sorted(instr_set, key=sort_key)])
|
|
|
|
#print(str(blockingInstructionsDictNonAVX.items()))
|
|
|
|
if isIntelCPU():
|
|
# mov to mem has always two uops: store address and store data; there is no instruction that uses just one of them
|
|
movMemInstrNode = instrNodeDict['MOV (M64, R64)']
|
|
|
|
if arch in ['ICL', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P']:
|
|
storeDataPort = '49'
|
|
else:
|
|
storeDataPort = '4'
|
|
blockingInstructionsDictNonAVX[frozenset({storeDataPort})] = movMemInstrNode
|
|
blockingInstructionsDictNonSSE[frozenset({storeDataPort})] = movMemInstrNode
|
|
|
|
storeAddressPorts = frozenset({p for p in tpDict[movMemInstrNode].all_used_ports if not p == storeDataPort})
|
|
if storeAddressPorts not in blockingInstructionsDictNonAVX: blockingInstructionsDictNonAVX[storeAddressPorts] = movMemInstrNode
|
|
if storeAddressPorts not in blockingInstructionsDictNonSSE: blockingInstructionsDictNonSSE[storeAddressPorts] = movMemInstrNode
|
|
|
|
print('Non-AVX:')
|
|
for k,v in blockingInstructionsDictNonAVX.items():
|
|
print(str(k) + ': ' + v.attrib['string'])
|
|
print('Non-SSE:')
|
|
for k,v in blockingInstructionsDictNonSSE.items():
|
|
print(str(k) + ': ' + v.attrib['string'])
|
|
|
|
sortedPortCombinationsNonAVX = sorted(blockingInstructionsDictNonAVX.keys(), key=lambda x:(len(x), sorted(x)))
|
|
sortedPortCombinationsNonSSE = sorted(blockingInstructionsDictNonSSE.keys(), key=lambda x:(len(x), sorted(x)))
|
|
print('sortedPortCombinations: ' + str(sortedPortCombinationsNonAVX))
|
|
|
|
for i, instrNode in enumerate(sorted(tpDict.keys(), key=lambda x: (len(tpDict[x].config.preInstrNodes), x.attrib['string']))):
|
|
#if not 'CVTPD2PI' in instrNode.attrib['string']: continue
|
|
|
|
print('Measuring port usage for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')')
|
|
|
|
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Port Usage' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>']
|
|
|
|
for useDistinctRegs in ([True, False] if instrNode in tpDictSameReg else [True]):
|
|
for useIndexedAddr in ([False, True] if useDistinctRegs and (instrNode in tpDictIndexedAddr) else [False]):
|
|
tpResult = None
|
|
|
|
if not useDistinctRegs:
|
|
tpResult = tpDictSameReg[instrNode]
|
|
htmlReports.append('<hr><h2>With the same register for different operands</h2>')
|
|
elif useIndexedAddr:
|
|
tpResult = tpDictIndexedAddr[instrNode]
|
|
htmlReports.append('<hr><h2>With an indexed addressing mode</h2>')
|
|
else:
|
|
tpResult = tpDict[instrNode]
|
|
|
|
rem_uops = max(tpResult.uops, int(sum(x for p, x in tpResult.unblocked_ports.items() if x>0) + .2))
|
|
|
|
if not useIACA and tpResult.config.preInstrNodes:
|
|
rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes)
|
|
|
|
used_ports = tpResult.all_used_ports
|
|
if debugOutput: print(instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports))
|
|
|
|
if not isAVXInstr(instrNode):
|
|
blockingInstrs = blockingInstructionsDictNonAVX
|
|
sortedPortCombinations = sortedPortCombinationsNonAVX
|
|
else:
|
|
blockingInstrs = blockingInstructionsDictNonSSE
|
|
sortedPortCombinations = sortedPortCombinationsNonSSE
|
|
|
|
uopsCombinationList = []
|
|
|
|
if not used_ports:
|
|
htmlReports.append('No uops')
|
|
elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (tpResult.ILD_stalls in [None, 0]):
|
|
# one uop instruction
|
|
for combination in sortedPortCombinations:
|
|
if used_ports.issubset(combination):
|
|
uopsCombinationList = [(combination, 1)]
|
|
htmlReports.append('<hr>Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in combination))
|
|
break
|
|
elif (rem_uops > 0) and (arch not in ['ZEN+', 'ZEN2']):
|
|
for combination in sortedPortCombinations:
|
|
if not combination.intersection(used_ports): continue
|
|
|
|
prevUopsOnCombination = 0
|
|
for prev_combination, prev_uops in uopsCombinationList:
|
|
if prev_combination.issubset(combination):
|
|
prevUopsOnCombination += prev_uops
|
|
|
|
uopsOnCombinationUnblocked = sum(x for p, x in tpResult.unblocked_ports.items() if p in combination)
|
|
if uopsOnCombinationUnblocked - prevUopsOnCombination < .8:
|
|
continue
|
|
|
|
if not useIACA:
|
|
if tpResult.config.preInstrNodes:
|
|
for preInstrNode in tpResult.config.preInstrNodes:
|
|
for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]:
|
|
if pre_comb.issubset(frozenset(''.join(combination))):
|
|
prevUopsOnCombination += pre_uops
|
|
|
|
nPortsInComb = sum(len(str(x)) for x in combination)
|
|
blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10)
|
|
blockInstrRep = min(blockInstrRep, 100)
|
|
uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports)
|
|
if uopsOnBlockedPorts is None:
|
|
#print('no uops on blocked ports: ' + str(combination))
|
|
continue
|
|
|
|
uopsOnBlockedPorts -= prevUopsOnCombination
|
|
|
|
if rem_uops < uopsOnBlockedPorts:
|
|
print('More uops on ports than total uops, combination: ' + str(combination) + ', ' + str(uopsOnBlockedPorts))
|
|
|
|
if uopsOnBlockedPorts <= 0: continue
|
|
|
|
if isIntelCPU() and (combination == {storeDataPort}) and (instrNode.attrib.get('locked', '') == '1'):
|
|
# for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which
|
|
# seems to lead to replays of the store data uops
|
|
uopsOnBlockedPorts = 1
|
|
|
|
uopsCombinationList.append((combination, uopsOnBlockedPorts))
|
|
|
|
htmlReports.append('<strong>⇨ ' +
|
|
((str(uopsOnBlockedPorts) + ' μops') if (uopsOnBlockedPorts > 1) else 'One μop') +
|
|
' that can only use port' +
|
|
('s {' if len(combination)>1 else ' ') +
|
|
str(sorted(combination))[1:-1] +
|
|
('}' if len(combination)>1 else '') + '</strong>')
|
|
|
|
rem_uops -= uopsOnBlockedPorts
|
|
if rem_uops <= 0: break
|
|
|
|
# on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports
|
|
# we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate
|
|
uopsCombinationList = [(frozenset(''.join(comb)), uops) for comb, uops in uopsCombinationList]
|
|
|
|
if not useDistinctRegs:
|
|
portCombinationsResultDictSameReg[instrNode] = uopsCombinationList
|
|
elif useIndexedAddr:
|
|
portCombinationsResultDictIndexedAddr[instrNode] = uopsCombinationList
|
|
else:
|
|
portCombinationsResultDict[instrNode] = uopsCombinationList
|
|
|
|
writeHtmlFile('html-ports/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
|
|
|
|
########################
|
|
# Write XML File
|
|
########################
|
|
|
|
for instrNode in tpDict:
|
|
archNode = instrNode.find('./architecture[@name="' + arch + '"]')
|
|
if useIACA:
|
|
resultNode = SubElement(archNode, "IACA")
|
|
resultNode.attrib['version'] = iacaVersion
|
|
else:
|
|
resultNode = archNode.find('./measurement')
|
|
|
|
applicableResults = [(tpDict[instrNode], portCombinationsResultDict.get(instrNode, None), '')]
|
|
for otherTPDict, otherPCDict, suffix in [(tpDictSameReg, portCombinationsResultDictSameReg, '_same_reg'),
|
|
(tpDictIndexedAddr, portCombinationsResultDictIndexedAddr, '_indexed')]:
|
|
if instrNode in otherTPDict:
|
|
t1 = tpDict[instrNode]
|
|
t2 = otherTPDict[instrNode]
|
|
p1 = portCombinationsResultDict.get(instrNode, None)
|
|
p2 = otherPCDict.get(instrNode, None)
|
|
if (t1.uops != t2.uops or t1.fused_uops != t2.fused_uops or t1.uops_MITE != t2.uops_MITE or ((p2 is not None) and (p1 != p2))):
|
|
applicableResults.append((t2, p2, suffix))
|
|
|
|
for tpResult, portUsageList, suffix in applicableResults:
|
|
uops = tpResult.uops
|
|
uopsFused = tpResult.fused_uops
|
|
uopsMITE = tpResult.uops_MITE
|
|
uopsMS = tpResult.uops_MS
|
|
if useIACA:
|
|
resultNode.attrib['TP'+suffix] = "%.2f" % tpResult.TP_loop
|
|
if uopsFused:
|
|
resultNode.attrib['fusion_occurred'] = '1'
|
|
if instrNode in latencyDict:
|
|
resultNode.attrib['latency'] = str(latencyDict[instrNode])
|
|
if instrNode in tpDictNoInteriteration:
|
|
resultNode.attrib['TP_no_interiteration'] = "%.2f" % tpDictNoInteriteration[instrNode]
|
|
else:
|
|
if tpResult.config.preInstrNodes:
|
|
uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes)
|
|
if uopsFused is not None:
|
|
uopsFused -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].fused_uops for preInstrNode in tpResult.config.preInstrNodes)
|
|
if uopsMITE is not None:
|
|
uopsMITE -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops_MITE for preInstrNode in tpResult.config.preInstrNodes)
|
|
if uopsMS is not None:
|
|
uopsMS -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops_MS for preInstrNode in tpResult.config.preInstrNodes)
|
|
if uopsFused is not None:
|
|
resultNode.attrib['uops_retire_slots'+suffix] = str(uopsFused)
|
|
if uopsMITE is not None:
|
|
resultNode.attrib['uops_MITE'+suffix] = str(uopsMITE)
|
|
if uopsMS is not None:
|
|
resultNode.attrib['uops_MS'+suffix] = str(uopsMS)
|
|
if tpResult.complexDec:
|
|
resultNode.attrib['complex_decoder'+suffix] = '1'
|
|
if tpResult.nAvailableSimpleDecoders is not None:
|
|
resultNode.attrib['available_simple_decoders'+suffix] = str(tpResult.nAvailableSimpleDecoders)
|
|
resultNode.attrib['TP_unrolled'+suffix] = "%.2f" % tpResult.TP_noLoop
|
|
resultNode.attrib['TP_loop'+suffix] = "%.2f" % tpResult.TP_loop
|
|
|
|
resultNode.attrib['uops'+suffix] = str(uops)
|
|
|
|
if instrNode in macroFusionDict:
|
|
resultNode.attrib['macro_fusible'] = ';'.join(sorted(m.attrib['string'] for m in macroFusionDict[instrNode]))
|
|
|
|
divCycles = tpResult.divCycles
|
|
if divCycles: resultNode.attrib['div_cycles'+suffix] = str(divCycles)
|
|
|
|
portPrefix = ('p' if isIntelCPU() else 'FP')
|
|
computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(p for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0])))
|
|
if portUsageList:
|
|
resultNode.attrib['ports'+suffix] = computePortStr(portUsageList)
|
|
try:
|
|
resultNode.attrib['TP_ports'+suffix] = "%.2f" % getTP_LP(portUsageList)
|
|
except ValueError as err:
|
|
print('Could not solve LP for ' + instrNode.attrib['string'] + ':')
|
|
print(err)
|
|
|
|
with open(args.output or 'result_'+arch+(('_IACA_' + iacaVersion) if useIACA else '_measured')+'.xml' , "w") as f:
|
|
reparsed = XMLRoot
|
|
if not args.noPretty:
|
|
rough_string = ET.tostring(XMLRoot, 'utf-8')
|
|
reparsed = minidom.parseString(rough_string)
|
|
f.write('\n'.join([line for line in reparsed.toprettyxml(indent=' ').split('\n') if line.strip()]))
|
|
|
|
tarFilename = 'genhtml-' + arch + (('-IACA' + iacaVersion) if useIACA else '-Measurements') + '.tar.gz'
|
|
with tarfile.open(tarFilename, "w:gz") as tar:
|
|
tar.add('/tmp/cpu-html/', arcname=os.path.sep)
|
|
|
|
shutil.rmtree('/tmp/cpu-html/')
|
|
|
|
try:
|
|
subprocess.check_output('umount /tmp/ramdisk/', shell=True)
|
|
except subprocess.CalledProcessError:
|
|
exit(1)
|
|
|
|
print('Total number of microbenchmarks: ' + str(nExperiments))
|
|
|
|
|
|
# Script entry point: call main() only when this file is executed directly
# (i.e. __name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()
|