#!/usr/bin/env python3 import ctypes import sys import time import textwrap import itertools import random import collections import pprint import math import llvmlite.binding as llvm import psutil # TODOs # * API to create test scenarios # * DSL? # * Test cases: # * Instructions: # * [x] arithmetics \w reg and/or imm. # * scalar # * packed # * [x] lea # * [x] LOAD / mov \w mem # * [TODO] STORE / mov to mem # * [x] Single Latency # * [x] Single Throughput # * [TODO] Combined Throughput # * [TODO] Random Throughput # * [TODO] Automated TP, Lat, #pipeline analysis # * [TODO] IACA marked binary output generation # * [TODO] Fuzzing algorithm # * [TODO] CLI # * C based timing routine? As an extension? # * make sanity checks during runtime, check for fixed frequency and pinning def floor_harmonic_fraction(n, error=0.1): ''' Finds closest floored integer or inverse integer and returns error. (numerator, denominator, relative error) where either numerator or denominator is exactly one. ''' floor_n = math.floor(n) if floor_n > 0: return (floor_n, 1, 1 - floor_n/n) else: i = 2 while (1/i) > n: i += 1 return (1, i, 1 - (1/i)/n) class Benchmark: def __init__(self, parallel=1, serial=5): self._function_ctype = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64) self.parallel = parallel self.serial = serial # Do interesting work self._loop_body = textwrap.dedent('''\ %"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"] %"checksum.1" = call i64 asm sideeffect " add $1, $0", "=r,i,r" (i64 1, i64 %"checksum")\ ''') def __repr__(self): return '{}({})'.format( self.__class__.__name__, ', '.join(['{}={!r}'.format(k,v) for k,v in self.__dict__.items() if not k.startswith('_')])) def get_ir(self): # FP add loop - may have issues #return textwrap.dedent('''\ # define i64 @"test"(i64 %"N") # {{ # entry: # %"N.fp" = sitofp i64 %"N" to double # %"loop_cond" = fcmp olt double 0.0, %"N.fp" # br i1 %"loop_cond", label %"loop", label %"end" # # loop: # %"loop_counter" = phi double [0.0, %"entry"], [%"loop_counter.1", %"loop"] # {loop_body} # %"loop_counter.1" = fadd double %"loop_counter", 1.0 # %"loop_cond.1" = fcmp olt double %"loop_counter.1", %"N.fp" # br i1 %"loop_cond.1", label %"loop", label %"end" # # end: # %"ret.fp" = phi double [0.0, %"entry"], [%"loop_counter", %"loop"] # %"ret" = fptosi double %"ret.fp" to i64 # ret i64 %"ret" # }} # ''').format( # loop_body=textwrap.indent(self._loop_body, ' ')) return textwrap.dedent('''\ define i64 @"test"(i64 %"N") {{ entry: %"loop_cond" = icmp slt i64 0, %"N" br i1 %"loop_cond", label %"loop", label %"end" loop: %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"] {loop_body} %"loop_counter.1" = add i64 %"loop_counter", 1 %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N" br i1 %"loop_cond.1", label %"loop", label %"end" end: %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"] ret i64 %"ret" }} ''').format( loop_body=textwrap.indent(self._loop_body, ' ')) def prepare_arguments(self, previous_args=None, time_factor=1.0): '''Build argument tuple, to be passed to low level function.''' if previous_args is None: return (100,) else: return (int(previous_args[0]*time_factor),) def get_iterations(self, args): '''Return number of iterations performed, based on lower level function arguments.''' return args[0] def get_llvm_module(self): '''Build and return LLVM module from LLVM IR code.''' if not hasattr(self, '_llvm_module'): self._llvm_module = llvm.parse_assembly(self.get_ir()) self._llvm_module.verify() return self._llvm_module def get_target_machine(self): '''Instantiate and return target machine.''' if not hasattr(self, '_llvm_module'): features=llvm.get_host_cpu_features().flatten() cpu=llvm.get_host_cpu_name() self._tm = llvm.Target.from_default_triple().create_target_machine( cpu=cpu, features=features, opt=1) return self._tm def get_assembly(self): '''Compile and return assembly from LLVM module.''' tm = self.get_target_machine() tm.set_asm_verbosity(0) return tm.emit_assembly(self.get_llvm_module()) def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3): # Compile the module to machine code using MCJIT tm = self.get_target_machine() runtimes = [] args = self.prepare_arguments() with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee: ee.finalize_object() # Obtain a pointer to the compiled 'sum' - it's the address of its JITed # code in memory. cfptr = ee.get_function_address('test') # To convert an address to an actual callable thing we have to use # CFUNCTYPE, and specify the arguments & return type. cfunc = self._function_ctype(cfptr) # Now 'cfunc' is an actual callable we can invoke # TODO replace time.clock with a C implemententation for less overhead # TODO return result in machine readable format fixed_args = False for i in range(repeat): while True: start = time.perf_counter() res = cfunc(*args) end = time.perf_counter() elapsed = end-start if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed): target_elapsed = 2/3*min_elapsed+1/3*max_elapsed factor = target_elapsed / elapsed args = self.prepare_arguments(previous_args=args, time_factor=factor) continue else: # After we have the right argument choice, we keep it. fixed_args = True break runtimes.append(elapsed) return {'iterations': self.get_iterations(args), 'arguments': args, 'runtimes': runtimes, 'frequency': psutil.cpu_freq().current*1e6} @classmethod def get_latency(cls, max_serial=6, print_table=False, **kwargs): if print_table: print(' s |'+''.join([' {:^5}'.format(i) for i in range(1, max_serial)])) print(' | ', end='') serial_runs = [] for s in range(1, max_serial): m = cls(serial=s, parallel=1, **kwargs) r = m.build_and_execute(repeat=1) cy_per_it = min(r['runtimes'])*r['frequency']/(r['iterations']*m.parallel*m.serial) if print_table: print('{:.3f} '.format(cy_per_it), end='') sys.stdout.flush() serial_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m)) if print_table: print() print('LAT: {lat[0]}/{lat[1]}cy (min. error {lat[2]:.1%})'.format( lat=min(serial_runs)[1])) return min(serial_runs)[1] @classmethod def get_throughput(cls, max_serial=6, max_parallel=17, print_table=False, **kwargs): if print_table: print('s\p |'+''.join([' {:^5}'.format(i) for i in range(2, max_parallel)])) parallel_runs = [] for s in range(1, max_serial): if print_table: print('{:>3} | '.format(s), end='') for p in range(2, max_parallel): m = cls(serial=s, parallel=p, **kwargs) r = m.build_and_execute(repeat=1) cy_per_it = min(r['runtimes'])*r['frequency']/(r['iterations']*m.parallel*m.serial) if print_table: print('{:.3f} '.format(cy_per_it), end='') sys.stdout.flush() parallel_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m)) if print_table: print() if print_table: print('TP: {tp[0]}/{tp[1]}cy (min. error {tp[2]:.1%});'.format( tp=min(parallel_runs)[1])) return min(parallel_runs)[1] class InstructionBenchmark(Benchmark): def __init__(self, instruction='addq $1, $0', dst_operands=(), dstsrc_operands=(('r','i64', '0'),), src_operands=(('i','i64', '1'),), parallel=10, serial=4): ''' Build LLVM IR for arithmetic instruction benchmark without memory references. Currently only one destination (dst) or combined destination and source (dstsrc) operand is allowed. Only instruction's operands ($N) refer to the order of opernads found in dst + dstsrc + src. ''' Benchmark.__init__(self, parallel=parallel, serial=serial) self.instruction = instruction self.dst_operands = dst_operands self.dstsrc_operands = dstsrc_operands self.src_operands = src_operands self._loop_body = '' if len(dst_operands) + len(dstsrc_operands) != 1: raise NotImplemented("Must have exactly one dst or dstsrc operand.") if not all([op[0] in 'irx' for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)]): raise NotImplemented("This class only supports register and immediate operands.") # Part 1: PHI functions and initializations for i, dstsrc_op in enumerate(dstsrc_operands): # constraint code, llvm type string, initial value if dstsrc_op[0] in 'rx': # register operand for p in range(self.parallel): self._loop_body += ( '%"dstsrc{index}_{p}" = phi {type} ' '[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n').format( index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p) else: raise NotImplemented("Operand type in {!r} is not yet supported.".format(dstsrc_op)) # Part 2: Inline ASM call # Build constraint string from operands constraints = ','.join( ['='+dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] + [sop[0] for sop in itertools.chain(src_operands, dstsrc_operands)]) for i, dstsrc_op in enumerate(dstsrc_operands): # Build instruction from instruction and operands # TODO support multiple dstsrc operands # TODO support dst and dstsrc operands at the same time for p in range(self.parallel): operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands] for i, dop in enumerate(dstsrc_operands): operands.append('{type} %dstsrc{index}_{p}'.format(type=dop[1], index=i, p=p)) args = ', '.join(operands) self._loop_body += ( '%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect' ' "{instruction}", "{constraints}" ({args})\n').format( index=i, dst_type=dstsrc_op[1], instruction='\n'.join([instruction]*self.serial), constraints=constraints, args=args, p=p) for i, dst_op in enumerate(dst_operands): # Build instruction from instruction and operands # TODO support multiple dst operands # TODO support dst and dstsrc operands at the same time if self.serial != 1: raise NotImplemented("Serial > 1 and dst operand is not supported.") for p in range(self.parallel): operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands] args = ', '.join(operands) self._loop_body += ( '%"dst{index}_{p}.out" = call {dst_type} asm sideeffect' ' "{instruction}", "{constraints}" ({args})\n').format( index=i, dst_type=dst_op[1], instruction=instruction, constraints=constraints, args=args, p=p) class AddressGenerationBenchmark(Benchmark): def __init__(self, offset=('i', 'i64', '0x42'), base=('r', 'i64', '0'), index=('r', 'i64', '0'), width=('i', None, '4'), destination='base', parallel=10, serial=4): ''' Benchmark for address generation modes. Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r' (register) or 'i' (immediate) and initial_value a string. E.g., ('r', 'i64', '0') or ('i', None, '4') +--------------------------------+-----------------------------+ | Mode | AT&T | +--------------------------------+-----------------------------+ | Offset | leal 0x0100, %eax | <- no latency support | Base | leal (%esi), %eax | | Offset + Base | leal -8(%ebp), %eax | | Offset + Index*Width | leal 0x100(,%ebx,4), %eax | | Offset + Base + Index*Width | leal 0x8(%edx,%ebx,4), %eax | +--------------------------------+-----------------------------+ OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width offset: immediate integer (+/-) base: register index: register width: immediate 1,2,4 or 8 ''' Benchmark.__init__(self, parallel=parallel, serial=serial) self.offset = offset self.base = base self.index = index self.width = width self.destination = destination self.parallel = parallel # Sanity checks: if bool(index) ^ bool(width): raise ValueError("Index and width both need to be set, or be None.") elif index and width: if width[0] != 'i' or int(width[2]) not in [1,2,4,8]: raise ValueError("Width may only be immediate 1,2,4 or 8.") if index[0] != 'r': raise ValueError("Index must be a register.") if offset and offset[0] != 'i': raise ValueError("Offset must be an immediate.") if base and base[0] != 'r': raise ValueError("Offset must be a register.") if not index and not width and not offset and not base: raise ValueError("Must provide at least an offset or base.") if destination == 'base' and not base: raise ValueError("Destination may only be set to 'base' if base is set.") elif destination == 'index' and not index: raise ValueError("Destination may only be set to 'index' if index is set.") elif destination not in ['base', 'index']: raise ValueError("Destination must be set to 'base' or 'index'.") if not base and not index: raise ValueError("Either base or index must be set for latency test to work.") if serial != 1 and not (base or index): raise ValueError("Serial > 1 only works with index and/or base in use.") self._loop_body = '' ops = '' if offset: ops += offset[2] if base: ops += '($0' if width and index: ops += ',$1,{}'.format(width[2]) ops += ')' if destination == 'base': ops += ', $0' else: # destination == 'index' ops += ', $1' else: if width and index: ops += '(,$0,{}), $0'.format(width[2]) ops += ' ' if destination == 'base': destination_reg = base else: # destination == 'index' destination_reg = index # Part 1: PHI function for destination for p in range(parallel): self._loop_body += ( '%"{name}_{p}.0" = ' 'phi {type} [{initial}, %"entry"], [%"{name}_{p}.{s}", %"loop"]\n').format( name=destination, type=destination_reg[1], initial=destination_reg[2], p=p, s=self.serial) for p in range(parallel): for s in range(self.serial): constraints = '=r,r' if base and index: constraints += ',r' if destination == 'base': args = '{base_type} %"{base_name}_{p}.{s_in}", {index_type} {index_value}'.format( base_type=base[1], base_name=destination, index_type=index[1], index_value=index[2], p=p, s_in=s) else: # destination == 'index': args ='{base_type} {base_value}, {index_type} %"{index_name}_{p}.{s_in}"'.format( base_type=base[1], base_value=base[2], index_type=index[1], index_name=destination, p=p, s_in=s) else: args = '{type} %"{name}_{p}.{s_in}"'.format( type=destination_reg[1], name=destination, p=p, s_in=s) self._loop_body += ( '%"{name}_{p}.{s_out}" = call {type} asm sideeffect' ' "lea {ops}", "{constraints}" ({args})\n').format( name=destination, type=destination_reg[1], ops=ops, constraints=constraints, args=args, p=p, s_out=s+1) class LoadBenchmark(Benchmark): def __init__(self, chain_length=2048, structure='linear', parallel=6, serial=4): ''' Benchmark for L1 load using pointer chasing. *chain_length* is the number of pointers to place in memory. *structure* may be 'linear' (1-offsets) or 'random'. ''' Benchmark.__init__(self, parallel=parallel, serial=serial) self._loop_body = '' element_type = ctypes.POINTER(ctypes.c_int) self._function_ctype = ctypes.CFUNCTYPE( ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int) self.chain_length = chain_length self.parallel = parallel self.structure = structure self._pointer_field = (element_type * chain_length)() if chain_length % serial != 0: raise ValueError( "chain_length ({}) needs to be divisible by serial factor ({}).".format( chain_length, serial)) # Initialize pointer field # Field must represent a ring of pointers if structure == 'linear': for i in range(chain_length): self._pointer_field[i] = ctypes.cast( ctypes.pointer(self._pointer_field[(i+1)%chain_length]), element_type) elif structure == 'random': shuffled_indices = list(range(chain_length)) random.shuffle(shuffled_indices) for i in range(chain_length): self._pointer_field[shuffled_indices[i]] = ctypes.cast( ctypes.pointer(self._pointer_field[shuffled_indices[(i+1)%chain_length]]), element_type) else: raise ValueError("Given structure is not supported. Supported are: " "linear and random.") def prepare_arguments(self, previous_args=None, time_factor=1.0): '''Build argument tuple, to be passed to low level function.''' if previous_args is None: return (self._pointer_field, 100) else: return (previous_args[0], int(previous_args[1]*time_factor)) def get_iterations(self, args): '''Return number of iterations performed, based on lower level function arguments.''' return self.chain_length*args[1] def get_ir(self): ''' Return LLVM IR equivalent of (in case of parallel == 1 and serial == 1): int test(int** ptrf, int repeat) { int** p0 = (int**)ptrf[0]; int i = 0; while(i < N) { int** p = (int**)*p0; while(p != p0) { p = (int**)*p; } i++; } return i; } ''' ret = textwrap.dedent(''' define i32 @test(i32** %"ptrf_0", i32 %"repeats") { entry: ''') # Load pointer to ptrf[p] and p0 for p in range(self.parallel): if p > 0: ret += ' %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(p=p) ret += ( ' %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n' ' %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p) ret += textwrap.dedent(''' %"cmp.entry" = icmp sgt i32 %"repeats", 0 br i1 %"cmp.entry", label %"loop0", label %"end" loop0: br label %"loop1" loop1: %"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ] br label %"loop2" loop2:\n''') for p in range(self.parallel): ret += (' %"p_{p}.0" = phi i32** ' '[ %"p0_{p}", %"loop1" ], [ %"p_{p}.{s_max}", %"loop2" ]\n').format( p=p, s_max=self.serial) # load p, compare to p0 and or-combine results for p in range(self.parallel): for s in range(self.serial): ret += (' %"pp_{p}.{s}" = bitcast i32** %"p_{p}.{s_prev}" to i32***\n' ' %"p_{p}.{s}" = load i32**, i32*** %"pp_{p}.{s}", align 8\n').format( p=p, s=s+1, s_prev=s) # Compare is needed for all registers, for llvm not to remove unused # instructions: ret += ' %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.{s_max}", %"p0_{p}"\n'.format( p=p, s_max=self.serial) # TODO tree reduce cmp to make use of all cmp_* values # It is sufficient to use only one compare, all others will be eliminated ret += ' br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n' ret += textwrap.dedent(''' loop3: %"i.1" = add i32 %"i", 1 %"cmp.loop3" = icmp eq i32 %"i.1", %"repeats" br i1 %"cmp.loop3", label %"end", label %"loop1" end: %"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ] ret i32 %"ret" }''') return ret if __name__ == '__main__': llvm.initialize() llvm.initialize_native_target() llvm.initialize_native_asmprinter() llvm.initialize_native_asmparser() modules = collections.OrderedDict() # immediate source modules['add i64 r64 LAT'] = InstructionBenchmark( instruction='addq $1, $0', dst_operands=(), dstsrc_operands=(('r','i64', '0'),), src_operands=(('i','i64', '1'),), parallel=1, serial=5) # register source modules['add r64 r64 LAT'] = InstructionBenchmark( instruction='addq $1, $0', dst_operands=(), dstsrc_operands=(('r','i64', '0'),), src_operands=(('r','i64', '1'),), parallel=1, serial=5) # multiple instructions modules['4xadd i64 r64 LAT'] = InstructionBenchmark( instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0', dst_operands=(), dstsrc_operands=(('r','i64', '0'),), src_operands=(('i','i64', '1'),), parallel=1, serial=5) # immediate source modules['add i64 r64 TP'] = InstructionBenchmark( instruction='addq $1, $0', dst_operands=(), dstsrc_operands=(('r','i64', '0'),), src_operands=(('i','i64', '1'),), parallel=10, serial=5) # register source modules['add r64 r64 TP'] = InstructionBenchmark( instruction='addq $1, $0', dst_operands=(), dstsrc_operands=(('r','i64', '0'),), src_operands=(('r','i64', '1'),), parallel=10, serial=5) # multiple instructions modules['4xadd i64 r64 TP'] = InstructionBenchmark( instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0', dst_operands=(), dstsrc_operands=(('r','i64', '0'),), src_operands=(('i','i64', '1'),), parallel=10, serial=1) modules['lea base LAT'] = AddressGenerationBenchmark( offset=None, base=('r', 'i64', '666'), index=None, width=None, destination='base', parallel=1, serial=5) modules['lea base+offset LAT'] = AddressGenerationBenchmark( offset=('i', None, '23'), base=('r', 'i64', '666'), index=None, width=None, destination='base', parallel=1, serial=5) modules['lea index*width LAT'] = AddressGenerationBenchmark( offset=None, base=None, index=('r', 'i64', '1'), width=('i', None, '4'), destination='index', parallel=1, serial=5) modules['lea offset+index*width LAT'] = AddressGenerationBenchmark( offset=('i', 'i64', '-0x8'), base=None, index=('r', 'i64', '51'), width=('i', None, '4'), destination='index', parallel=1, serial=5) modules['lea base+index*width LAT'] = AddressGenerationBenchmark( offset=None, base=('r', 'i64', '23'), index=('r', 'i64', '12'), width=('i', None, '4'), destination='base', parallel=1, serial=5) modules['lea base+offset+index*width LAT'] = AddressGenerationBenchmark( offset=('i', None, '42'), base=('r', 'i64', '23'), index=('r', 'i64', '12'), width=('i', None, '4'), destination='base', parallel=1, serial=5) modules['lea base TP'] = AddressGenerationBenchmark( offset=None, base=('r', 'i64', '666'), index=None, width=None, destination='base', parallel=10, serial=1) modules['lea base+offset TP'] = AddressGenerationBenchmark( offset=('i', None, '23'), base=('r', 'i64', '666'), index=None, width=None, destination='base', parallel=10, serial=1) modules['lea index*width TP'] = AddressGenerationBenchmark( offset=None, base=None, index=('r', 'i64', '1'), width=('i', None, '4'), destination='index', parallel=10, serial=1) modules['lea offset+index*width TP'] = AddressGenerationBenchmark( offset=('i', 'i64', '-0x8'), base=None, index=('r', 'i64', '51'), width=('i', None, '4'), destination='index', parallel=10, serial=1) modules['lea base+index*width TP'] = AddressGenerationBenchmark( offset=None, base=('r', 'i64', '23'), index=('r', 'i64', '12'), width=('i', None, '4'), destination='base', parallel=10, serial=1) modules['lea base+offset+index*width TP'] = AddressGenerationBenchmark( offset=('i', None, '42'), base=('r', 'i64', '23'), index=('r', 'i64', '12'), width=('i', None, '4'), destination='base', parallel=10, serial=1) modules['LD linear LAT'] = LoadBenchmark( chain_length=2048, # 2048 * 8B = 16kB structure='linear', parallel=1, serial=2) modules['LD random LAT'] = LoadBenchmark( chain_length=2048, # 2048 * 8B = 16kB structure='random', parallel=1, serial=2) modules['LD linear TP'] = LoadBenchmark( chain_length=2048, # 2048 * 8B = 16kB structure='linear', parallel=4, serial=2) modules['LD random TP'] = LoadBenchmark( chain_length=2048, # 2048 * 8B = 16kB structure='random', parallel=4, serial=2) modules['vaddpd x<4 x double> x<4 x double> x<4 x double> LAT'] = InstructionBenchmark( instruction='vaddpd $1, $0, $0', dst_operands=(), dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),), src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),), parallel=1, serial=5) modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) LAT'] = InstructionBenchmark( instruction='vmulpd $1, $0, $0', dst_operands=(), dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),), src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),), parallel=1, serial=5) # This is actually a TP benchmark with parallel=1, because there are no inter-loop depencies: modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) TP'] = InstructionBenchmark( instruction='vmulpd $1, $2, $0', dst_operands=(), dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),), src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),), parallel=10, serial=1) modules = collections.OrderedDict([(k, v) for k,v in modules.items() if k.startswith('lea base LAT')]) verbose = 2 if '-v' in sys.argv else 0 for key, module in modules.items(): if verbose > 0: print("=== Benchmark") print(repr(module)) print("=== LLVM") print(module.get_ir()) print("=== Assembly") print(module.get_assembly()) r = module.build_and_execute(repeat=3) if verbose > 0: print("=== Result") pprint.pprint(r) cy_per_it = min(r['runtimes'])*r['frequency']/(r['iterations']*module.parallel*module.serial) print('{key:<32} {cy_per_it:.3f} cy/It with {runtime_sum:.4f}s'.format( key=key, module=module, cy_per_it=cy_per_it, runtime_sum=sum(r['runtimes']))) #InstructionBenchmark.get_latency( # instruction='vmulpd $1, $0, $0', # dst_operands=(), # dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),), # src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))), # ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),), # print_table=True) #InstructionBenchmark.get_throughput( # instruction='vmulpd $1, $0, $0', # dst_operands=(), # dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),), # src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))), # ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),), # print_table=True) # #InstructionBenchmark.get_latency( # instruction='nop', # dst_operands=(), # dstsrc_operands=(('r','i8', '0'),), # src_operands=(), # print_table=True) #InstructionBenchmark.get_throughput( # instruction='nop', # dst_operands=(), # dstsrc_operands=(('r','i8', '0'),), # src_operands=(), # print_table=True)