Files
asmbench/jit.py
Julian Hammer 437eaadad4 code cleanups
2018-06-22 10:25:47 +02:00

861 lines
32 KiB
Python
Executable File

#!/usr/bin/env python3
import ctypes
import sys
import time
import textwrap
import itertools
import random
import collections
import pprint
import math
import llvmlite.binding as llvm
import psutil
# TODOs
# * API to create test scenarios
# * DSL?
# * Test cases:
# * Instructions:
# * [x] arithmetics \w reg and/or imm.
# * scalar
# * packed
# * [x] lea
# * [x] LOAD / mov \w mem
# * [TODO] STORE / mov to mem
# * [x] Single Latency
# * [x] Single Throughput
# * [TODO] Combined Throughput
# * [TODO] Random Throughput
# * [TODO] Automated TP, Lat, #pipeline analysis
# * [TODO] IACA marked binary output generation
# * [TODO] Fuzzing algorithm
# * [TODO] CLI
# * C based timing routine? As an extension?
# * make sanity checks during runtime, check for fixed frequency and pinning
def floor_harmonic_fraction(n, error=0.1):
    """
    Approximate *n* from below by an integer or by the inverse of an integer.

    For ``n >= 1`` the result is ``floor(n) / 1``; for ``0 < n < 1`` it is ``1 / i`` with the
    smallest integer ``i >= 2`` such that ``1 / i <= n``.

    :param n: positive number to approximate
    :param error: currently unused; kept so that existing callers keep working

    :return: ``(numerator, denominator, relative error)`` where either numerator or denominator
             is exactly one and the relative error is ``1 - approximation / n``
    :raises ValueError: if *n* is not a positive number (the search below would never end)
    """
    if not n > 0:
        raise ValueError("n must be a positive number, got {!r}.".format(n))

    floor_n = math.floor(n)
    if floor_n > 0:
        return floor_n, 1, 1 - floor_n / n

    # 0 < n < 1: walk the harmonic series until 1/i no longer exceeds n
    i = 2
    while (1 / i) > n:
        i += 1
    return 1, i, 1 - (1 / i) / n
class Benchmark:
    """
    Base class for JIT-compiled micro benchmarks.

    An instance holds an LLVM IR loop body (``_loop_body``) which :meth:`get_ir` wraps into a
    counted loop inside a function named ``test``. :meth:`build_and_execute` compiles that
    function with llvmlite's MCJIT, calls it through ctypes and measures the wall-clock time.
    """

    def __init__(self, parallel=1, serial=5):
        """
        :param parallel: number of independent instruction chains (used by subclasses and for
                         normalizing results)
        :param serial: number of dependent instructions per chain (same usage)
        """
        self._function_ctype = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
        self.parallel = parallel
        self.serial = serial

        # Do interesting work
        self._loop_body = textwrap.dedent('''\
            %"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]
            %"checksum.1" = call i64 asm sideeffect "
                add $1, $0",
                "=r,i,r" (i64 1, i64 %"checksum")''')

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` built from all public instance attributes."""
        return '{}({})'.format(
            self.__class__.__name__,
            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
                       if not k.startswith('_')]))

    def get_ir(self):
        """Return the LLVM IR of function ``test``: a counted loop around ``_loop_body``."""
        # FP add loop - may have issues
        # return textwrap.dedent('''\
        #     define i64 @"test"(i64 %"N")
        #     {{
        #     entry:
        #       %"N.fp" = sitofp i64 %"N" to double
        #       %"loop_cond" = fcmp olt double 0.0, %"N.fp"
        #       br i1 %"loop_cond", label %"loop", label %"end"
        #
        #     loop:
        #       %"loop_counter" = phi double [0.0, %"entry"], [%"loop_counter.1", %"loop"]
        #     {loop_body}
        #       %"loop_counter.1" = fadd double %"loop_counter", 1.0
        #       %"loop_cond.1" = fcmp olt double %"loop_counter.1", %"N.fp"
        #       br i1 %"loop_cond.1", label %"loop", label %"end"
        #
        #     end:
        #       %"ret.fp" = phi double [0.0, %"entry"], [%"loop_counter", %"loop"]
        #       %"ret" = fptosi double %"ret.fp" to i64
        #       ret i64 %"ret"
        #     }}
        #     ''').format(
        #         loop_body=textwrap.indent(self._loop_body, ' '))
        return textwrap.dedent('''\
            define i64 @"test"(i64 %"N")
            {{
            entry:
              %"loop_cond" = icmp slt i64 0, %"N"
              br i1 %"loop_cond", label %"loop", label %"end"
            loop:
              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {loop_body}
              %"loop_counter.1" = add i64 %"loop_counter", 1
              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
              br i1 %"loop_cond.1", label %"loop", label %"end"
            end:
              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
              ret i64 %"ret"
            }}
            ''').format(
                loop_body=textwrap.indent(self._loop_body, ' '))

    def prepare_arguments(self, previous_args=None, time_factor=1.0):
        """
        Build argument tuple, to be passed to low level function.

        Without *previous_args* the initial guess of 100 loop iterations is returned, otherwise
        the previous iteration count scaled by *time_factor*.
        """
        if previous_args is None:
            return 100,
        # Never scale down to zero iterations: 0 * factor stays 0 forever, which would leave
        # the calibration loop in build_and_execute without any way to make progress.
        return max(1, int(previous_args[0] * time_factor)),

    def get_iterations(self, args):
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def get_llvm_module(self):
        """Build and return LLVM module from LLVM IR code (parsed and verified only once)."""
        if not hasattr(self, '_llvm_module'):
            self._llvm_module = llvm.parse_assembly(self.get_ir())
            self._llvm_module.verify()
        return self._llvm_module

    def get_target_machine(self):
        """Instantiate and return target machine for the host CPU (created only once)."""
        # The cache check must look at _tm itself; checking another attribute either rebuilds
        # the target machine on every call or fails to create it at all.
        if not hasattr(self, '_tm'):
            features = llvm.get_host_cpu_features().flatten()
            cpu = llvm.get_host_cpu_name()
            self._tm = llvm.Target.from_default_triple().create_target_machine(
                cpu=cpu, features=features, opt=1)
        return self._tm

    def get_assembly(self):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
        return tm.emit_assembly(self.get_llvm_module())

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        """
        JIT-compile function ``test`` and time *repeat* calls of it.

        Until one call takes between *min_elapsed* and *max_elapsed* seconds, the arguments are
        rescaled via :meth:`prepare_arguments`; afterwards they are kept for all measurements.

        :return: dict with keys ``'iterations'``, ``'arguments'``, ``'runtimes'`` (list of
                 seconds, one per repetition) and ``'frequency'`` (``psutil.cpu_freq().current``
                 multiplied by 1e6)
        """
        # Compile the module to machine code using MCJIT
        tm = self.get_target_machine()
        runtimes = []
        args = self.prepare_arguments()
        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
            ee.finalize_object()

            # Obtain a pointer to the compiled 'test' - it's the address of its JITed
            # code in memory.
            cfptr = ee.get_function_address('test')

            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self._function_ctype(cfptr)

            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            fixed_args = False
            for _ in range(repeat):
                while True:
                    start = time.perf_counter()
                    cfunc(*args)
                    end = time.perf_counter()
                    elapsed = end - start
                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
                        # Aim for the lower third of the accepted runtime window
                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
                        factor = target_elapsed / elapsed
                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
                        continue
                    # After we have the right argument choice, we keep it.
                    fixed_args = True
                    break
                runtimes.append(elapsed)

        return {'iterations': self.get_iterations(args),
                'arguments': args,
                'runtimes': runtimes,
                'frequency': psutil.cpu_freq().current * 1e6}

    @classmethod
    def get_latency(cls, max_serial=6, print_table=False, **kwargs):
        """
        Run the benchmark with ``parallel=1`` and ``serial`` in ``range(1, max_serial)``.

        :param print_table: if true, print the measured cycles per instruction and the result
        :param kwargs: passed on to the class constructor
        :return: :func:`floor_harmonic_fraction` of the lowest measured cycles per instruction
        """
        if print_table:
            print(' s |' + ''.join([' {:^5}'.format(i) for i in range(1, max_serial)]))
            print(' | ', end='')
        serial_runs = []
        for s in range(1, max_serial):
            m = cls(serial=s, parallel=1, **kwargs)
            r = m.build_and_execute(repeat=1)
            cy_per_it = min(r['runtimes']) * r['frequency'] / (
                r['iterations'] * m.parallel * m.serial)
            if print_table:
                print('{:.3f} '.format(cy_per_it), end='')
                sys.stdout.flush()
            serial_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))

        # Select by cycle count only: comparing whole tuples would fall through to comparing
        # benchmark objects whenever two runs measure exactly the same value.
        best = min(serial_runs, key=lambda run: run[0])[1]
        if print_table:
            print()
            print('LAT: {lat[0]}/{lat[1]}cy (min. error {lat[2]:.1%})'.format(lat=best))
        return best

    @classmethod
    def get_throughput(cls, max_serial=6, max_parallel=17, print_table=False, **kwargs):
        """
        Run the benchmark for ``serial`` in ``range(1, max_serial)`` and ``parallel`` in
        ``range(2, max_parallel)``.

        :param print_table: if true, print the measured cycles per instruction and the result
        :param kwargs: passed on to the class constructor
        :return: :func:`floor_harmonic_fraction` of the lowest measured cycles per instruction
        """
        if print_table:
            print('s\\p |' + ''.join([' {:^5}'.format(i) for i in range(2, max_parallel)]))
        parallel_runs = []
        for s in range(1, max_serial):
            if print_table:
                print('{:>3} | '.format(s), end='')
            for p in range(2, max_parallel):
                m = cls(serial=s, parallel=p, **kwargs)
                r = m.build_and_execute(repeat=1)
                cy_per_it = min(r['runtimes']) * r['frequency'] / (
                    r['iterations'] * m.parallel * m.serial)
                if print_table:
                    print('{:.3f} '.format(cy_per_it), end='')
                    sys.stdout.flush()
                parallel_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
            if print_table:
                print()

        # See get_latency: only the cycle count takes part in the comparison.
        best = min(parallel_runs, key=lambda run: run[0])[1]
        if print_table:
            print('TP: {tp[0]}/{tp[1]}cy (min. error {tp[2]:.1%});'.format(tp=best))
        return best
class InstructionBenchmark(Benchmark):
    """Benchmark of a single inline-assembly instruction on register/immediate operands."""

    def __init__(self, instruction='addq $1, $0',
                 dst_operands=(),
                 dstsrc_operands=(('r', 'i64', '0'),),
                 src_operands=(('i', 'i64', '1'),),
                 parallel=10,
                 serial=4):
        """
        Build LLVM IR for arithmetic instruction benchmark without memory references.

        Currently only one destination (dst) or combined destination and source (dstsrc) operand
        is allowed. Only instruction's operands ($N) refer to the order of operands found in
        dst + dstsrc + src.

        Each operand is a tuple ``(constraint code, llvm type string, initial value)``, with
        constraint code ``'i'``, ``'r'`` or ``'x'``.

        :param instruction: assembly text; it is repeated *serial* times per inline asm call
                            when a dstsrc operand is used
        :param parallel: number of independent inline asm calls per loop iteration
        :param serial: number of repetitions of *instruction* within each call

        :raises NotImplementedError: for unsupported operand combinations or operand types
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial)
        self.instruction = instruction
        self.dst_operands = dst_operands
        self.dstsrc_operands = dstsrc_operands
        self.src_operands = src_operands
        self._loop_body = ''

        # NotImplemented is a comparison sentinel and not callable; the exception to raise
        # for unsupported input is NotImplementedError.
        if len(dst_operands) + len(dstsrc_operands) != 1:
            raise NotImplementedError("Must have exactly one dst or dstsrc operand.")
        if not all(op[0] in ('i', 'r', 'x')
                   for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)):
            raise NotImplementedError(
                "This class only supports register and immediate operands.")

        # Part 1: PHI functions and initializations
        for i, dstsrc_op in enumerate(dstsrc_operands):
            # constraint code, llvm type string, initial value
            if dstsrc_op[0] in ('r', 'x'):
                # register operand
                for p in range(self.parallel):
                    self._loop_body += (
                        '%"dstsrc{index}_{p}" = phi {type} '
                        '[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n').format(
                            index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p)
            else:
                raise NotImplementedError(
                    "Operand type in {!r} is not yet supported.".format(dstsrc_op))

        # Part 2: Inline ASM call
        # Build constraint string from operands: outputs first, then the sources, then one
        # entry per dstsrc operand holding the index of the output it is matched to
        constraints = ','.join(
            ['=' + dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] +
            [sop[0] for sop in itertools.chain(src_operands)] +
            ['{}'.format(i + len(dst_operands)) for i in range(len(dstsrc_operands))])

        for i, dstsrc_op in enumerate(dstsrc_operands):
            # Build instruction from instruction and operands
            # TODO support multiple dstsrc operands
            # TODO support dst and dstsrc operands at the same time
            for p in range(self.parallel):
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2])
                            for sop in src_operands]
                for j, dop in enumerate(dstsrc_operands):
                    operands.append(
                        '{type} %dstsrc{index}_{p}'.format(type=dop[1], index=j, p=p))
                args = ', '.join(operands)

                self._loop_body += (
                    '%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n').format(
                        index=i,
                        dst_type=dstsrc_op[1],
                        instruction='\n'.join([instruction] * self.serial),
                        constraints=constraints,
                        args=args,
                        p=p)

        for i, dst_op in enumerate(dst_operands):
            # Build instruction from instruction and operands
            # TODO support multiple dst operands
            # TODO support dst and dstsrc operands at the same time
            if self.serial != 1:
                raise NotImplementedError("Serial > 1 and dst operand is not supported.")
            for p in range(self.parallel):
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2])
                            for sop in src_operands]
                args = ', '.join(operands)

                self._loop_body += (
                    '%"dst{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n').format(
                        index=i,
                        dst_type=dst_op[1],
                        instruction=instruction,
                        constraints=constraints,
                        args=args,
                        p=p)
class AddressGenerationBenchmark(Benchmark):
    """Benchmark of the ``lea`` instruction for different addressing modes."""

    def __init__(self,
                 offset=('i', 'i64', '0x42'),
                 base=('r', 'i64', '0'),
                 index=('r', 'i64', '0'),
                 width=('i', None, '4'),
                 destination='base',
                 parallel=10,
                 serial=4):
        """
        Benchmark for address generation modes.

        Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r' (register)
        or 'i' (immediate) and initial_value a string.
        E.g., ('r', 'i64', '0') or ('i', None, '4')

        +--------------------------------+-----------------------------+
        | Mode                           | AT&T                        |
        +--------------------------------+-----------------------------+
        | Offset                         | leal 0x0100, %eax           | <- no latency support
        | Base                           | leal (%esi), %eax           |
        | Offset + Base                  | leal -8(%ebp), %eax         |
        | Offset + Index*Width           | leal 0x100(,%ebx,4), %eax   |
        | Offset + Base + Index*Width    | leal 0x8(%edx,%ebx,4), %eax |
        +--------------------------------+-----------------------------+
        OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width
        offset: immediate integer (+/-)
        base: register
        index: register
        width: immediate 1,2,4 or 8

        :param destination: ``'base'`` or ``'index'``; names the register operand that receives
                            the result and is fed back into the next ``lea``
        :param parallel: number of independent ``lea`` chains per loop iteration
        :param serial: number of dependent ``lea`` calls per chain

        :raises ValueError: if the operand combination is inconsistent or unsupported
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial)
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width
        self.destination = destination
        self.parallel = parallel

        # Sanity checks:
        if bool(index) ^ bool(width):
            raise ValueError("Index and width both need to be set, or be None.")
        elif index and width:
            if width[0] != 'i' or int(width[2]) not in [1, 2, 4, 8]:
                raise ValueError("Width may only be immediate 1,2,4 or 8.")
            if index[0] != 'r':
                raise ValueError("Index must be a register.")

        if offset and offset[0] != 'i':
            raise ValueError("Offset must be an immediate.")
        if base and base[0] != 'r':
            raise ValueError("Base must be a register.")

        if not index and not width and not offset and not base:
            raise ValueError("Must provide at least an offset or base.")

        if destination == 'base' and not base:
            raise ValueError("Destination may only be set to 'base' if base is set.")
        elif destination == 'index' and not index:
            raise ValueError("Destination may only be set to 'index' if index is set.")
        elif destination not in ['base', 'index']:
            raise ValueError("Destination must be set to 'base' or 'index'.")

        if not base and not index:
            raise ValueError("Either base or index must be set for latency test to work.")

        # NOTE: cannot trigger at the moment, the previous check already rejects this case
        if serial != 1 and not (base or index):
            raise ValueError("Serial > 1 only works with index and/or base in use.")

        self._loop_body = ''

        # Assemble the operand string of the lea instruction
        ops = ''
        if offset:
            ops += offset[2]
        if base:
            ops += '($0'
            if width and index:
                ops += ',$1,{}'.format(width[2])
            ops += ')'
            if destination == 'base':
                ops += ', $0'
            else:  # destination == 'index'
                ops += ', $1'
        else:
            if width and index:
                ops += '(,$0,{}), $0'.format(width[2])
        ops += ' '

        if destination == 'base':
            destination_reg = base
        else:  # destination == 'index'
            destination_reg = index

        # One output register, one input register, and a second input if both base and index
        # are in use
        constraints = '=r,r'
        if base and index:
            constraints += ',r'

        # Part 1: PHI function for destination
        for p in range(parallel):
            self._loop_body += (
                '%"{name}_{p}.0" = '
                'phi {type} [{initial}, %"entry"], [%"{name}_{p}.{s}", %"loop"]\n').format(
                    name=destination, type=destination_reg[1], initial=destination_reg[2],
                    p=p, s=self.serial)

        # Part 2: chain of `serial` lea calls per parallel instance; call s consumes value
        # "<name>_<p>.<s>" and produces "<name>_<p>.<s+1>"
        for p in range(parallel):
            for s in range(self.serial):
                if base and index:
                    if destination == 'base':
                        args = ('{base_type} %"{base_name}_{p}.{s_in}", '
                                '{index_type} {index_value}').format(
                                    base_type=base[1], base_name=destination,
                                    index_type=index[1], index_value=index[2], p=p, s_in=s)
                    else:  # destination == 'index':
                        args = ('{base_type} {base_value}, '
                                '{index_type} %"{index_name}_{p}.{s_in}"').format(
                                    base_type=base[1], base_value=base[2],
                                    index_type=index[1], index_name=destination, p=p, s_in=s)
                else:
                    args = '{type} %"{name}_{p}.{s_in}"'.format(
                        type=destination_reg[1], name=destination, p=p, s_in=s)

                self._loop_body += (
                    '%"{name}_{p}.{s_out}" = call {type} asm sideeffect'
                    ' "lea {ops}", "{constraints}" ({args})\n').format(
                        name=destination,
                        type=destination_reg[1],
                        ops=ops,
                        constraints=constraints,
                        args=args,
                        p=p,
                        s_out=s + 1)
class LoadBenchmark(Benchmark):
    """Load benchmark that follows a ring of pointers stored in a ctypes array."""

    def __init__(self, chain_length=2048, structure='linear', parallel=6, serial=4):
        """
        Benchmark for L1 load using pointer chasing.

        *chain_length* is the number of pointers to place in memory.
        *structure* may be 'linear' (1-offsets) or 'random'.

        :raises ValueError: if *chain_length* is not divisible by *serial* or *structure* is
                            not supported
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial)
        self._loop_body = ''
        element_type = ctypes.POINTER(ctypes.c_int)
        self._function_ctype = ctypes.CFUNCTYPE(
            ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int)
        self.chain_length = chain_length
        self.parallel = parallel
        self.structure = structure

        # Validate before allocating the pointer field
        if chain_length % serial != 0:
            raise ValueError(
                "chain_length ({}) needs to be divisible by serial factor ({}).".format(
                    chain_length, serial))

        self._pointer_field = (element_type * chain_length)()

        # Initialize pointer field
        # Field must represent a ring of pointers
        if structure == 'linear':
            for i in range(chain_length):
                self._pointer_field[i] = ctypes.cast(
                    ctypes.pointer(self._pointer_field[(i + 1) % chain_length]), element_type)
        elif structure == 'random':
            shuffled_indices = list(range(chain_length))
            random.shuffle(shuffled_indices)
            for i in range(chain_length):
                self._pointer_field[shuffled_indices[i]] = ctypes.cast(
                    ctypes.pointer(
                        self._pointer_field[shuffled_indices[(i + 1) % chain_length]]),
                    element_type)
        else:
            raise ValueError("Given structure is not supported. Supported are: "
                             "linear and random.")

    def prepare_arguments(self, previous_args=None, time_factor=1.0):
        """
        Build argument tuple, to be passed to low level function.

        The tuple is ``(pointer field, repeats)``. Without *previous_args* the initial guess of
        100 repeats is used, otherwise the previous count scaled by *time_factor*.
        """
        if previous_args is None:
            return self._pointer_field, 100
        # The repeat count is passed as a C int (i32 in the IR): keep it positive, so that
        # further scaling can still change it, and within the signed 32 bit range.
        repeats = int(previous_args[1] * time_factor)
        repeats = min(max(repeats, 1), 2**31 - 1)
        return previous_args[0], repeats

    def get_iterations(self, args):
        """Return number of iterations performed, based on lower level function arguments."""
        return self.chain_length * args[1]

    def get_ir(self):
        """
        Return LLVM IR equivalent of (in case of parallel == 1 and serial == 1):

        int test(int** ptrf, int repeat) {
            int** p0 = (int**)ptrf[0];
            int i = 0;
            while(i < repeat) {
                int** p = (int**)*p0;
                while(p != p0) {
                    p = (int**)*p;
                }
                i++;
            }
            return i;
        }
        """
        ret = textwrap.dedent('''
            define i32 @test(i32** %"ptrf_0", i32 %"repeats") {
            entry:
            ''')

        # Load pointer to ptrf[p] and p0
        for p in range(self.parallel):
            if p > 0:
                ret += ' %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(
                    p=p)
            ret += (
                ' %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n'
                ' %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p)

        ret += textwrap.dedent('''
            %"cmp.entry" = icmp sgt i32 %"repeats", 0
            br i1 %"cmp.entry", label %"loop0", label %"end"
            loop0:
            br label %"loop1"
            loop1:
            %"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ]
            br label %"loop2"
            loop2:\n''')

        for p in range(self.parallel):
            ret += (' %"p_{p}.0" = phi i32** '
                    '[ %"p0_{p}", %"loop1" ], [ %"p_{p}.{s_max}", %"loop2" ]\n').format(
                        p=p, s_max=self.serial)

        # load p, compare to p0 and or-combine results
        for p in range(self.parallel):
            for s in range(self.serial):
                ret += (' %"pp_{p}.{s}" = bitcast i32** %"p_{p}.{s_prev}" to i32***\n'
                        ' %"p_{p}.{s}" = load i32**, i32*** %"pp_{p}.{s}", align 8\n').format(
                            p=p, s=s + 1, s_prev=s)

            # Compare is needed for all registers, for llvm not to remove unused
            # instructions:
            ret += ' %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.{s_max}", %"p0_{p}"\n'.format(
                p=p, s_max=self.serial)

        # TODO tree reduce cmp to make use of all cmp_* values

        # It is sufficient to use only one compare, all others will be eliminated
        ret += ' br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n'

        ret += textwrap.dedent('''
            loop3:
            %"i.1" = add i32 %"i", 1
            %"cmp.loop3" = icmp eq i32 %"i.1", %"repeats"
            br i1 %"cmp.loop3", label %"end", label %"loop1"
            end:
            %"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ]
            ret i32 %"ret"
            }''')

        return ret
def _packed_double(value, lanes=4):
    """Return an LLVM ``<N x double>`` constant literal with all lanes set to *value*."""
    return '<{}>'.format(', '.join(['double {}'.format(value)] * lanes))


def _build_default_modules():
    """Return an ordered mapping of description -> benchmark instance of the default suite."""
    modules = collections.OrderedDict()

    # Scalar integer add with immediate or register source, as latency (LAT) and
    # throughput (TP) variant: (key, instruction, source constraint, parallel, serial)
    add = 'addq $1, $0'
    four_adds = '\n'.join([add] * 4)
    for key, instruction, src_kind, parallel, serial in (
            ('add i64 r64 LAT', add, 'i', 1, 5),
            ('add r64 r64 LAT', add, 'r', 1, 5),
            ('4xadd i64 r64 LAT', four_adds, 'i', 1, 5),
            ('add i64 r64 TP', add, 'i', 10, 5),
            ('add r64 r64 TP', add, 'r', 10, 5),
            ('4xadd i64 r64 TP', four_adds, 'i', 10, 1)):
        modules[key] = InstructionBenchmark(
            instruction=instruction,
            dst_operands=(),
            dstsrc_operands=(('r', 'i64', '0'),),
            src_operands=((src_kind, 'i64', '1'),),
            parallel=parallel,
            serial=serial)

    # Address generation: (mode name, offset, base, index); width accompanies every index and
    # the destination is the base register whenever there is one
    width = ('i', None, '4')
    lea_modes = (
        ('base', None, ('r', 'i64', '666'), None),
        ('base+offset', ('i', None, '23'), ('r', 'i64', '666'), None),
        ('index*width', None, None, ('r', 'i64', '1')),
        ('offset+index*width', ('i', 'i64', '-0x8'), None, ('r', 'i64', '51')),
        ('base+index*width', None, ('r', 'i64', '23'), ('r', 'i64', '12')),
        ('base+offset+index*width',
         ('i', None, '42'), ('r', 'i64', '23'), ('r', 'i64', '12')),
    )
    for suffix, parallel, serial in (('LAT', 1, 5), ('TP', 10, 1)):
        for name, offset, base, index in lea_modes:
            modules['lea {} {}'.format(name, suffix)] = AddressGenerationBenchmark(
                offset=offset,
                base=base,
                index=index,
                width=width if index else None,
                destination='base' if base else 'index',
                parallel=parallel,
                serial=serial)

    # Pointer chasing loads
    for suffix, parallel in (('LAT', 1), ('TP', 4)):
        for structure in ('linear', 'random'):
            modules['LD {} {}'.format(structure, suffix)] = LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=structure,
                parallel=parallel,
                serial=2)

    # Packed double precision arithmetic
    vec_type = '<4 x double>'
    dstsrc = ((('x', vec_type, _packed_double('1.23e-10')),))
    src = ((('x', vec_type, _packed_double('3.21e-10')),))
    modules['vaddpd x<4 x double> x<4 x double> x<4 x double> LAT'] = InstructionBenchmark(
        instruction='vaddpd $1, $0, $0',
        dst_operands=(),
        dstsrc_operands=dstsrc,
        src_operands=src,
        parallel=1,
        serial=5)
    modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) LAT'] = \
        InstructionBenchmark(
            instruction='vmulpd $1, $0, $0',
            dst_operands=(),
            dstsrc_operands=dstsrc,
            src_operands=src,
            parallel=1,
            serial=5)
    # This is actually a TP benchmark with parallel=1, because there are no inter-loop
    # dependencies:
    modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) TP'] = \
        InstructionBenchmark(
            instruction='vmulpd $1, $2, $0',
            dst_operands=(),
            dstsrc_operands=dstsrc,
            src_operands=src,
            parallel=10,
            serial=1)

    return modules


def main():
    """Run the default benchmark suite and print cycles per instruction for each entry."""
    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()
    llvm.initialize_native_asmparser()

    modules = _build_default_modules()
    # To run a subset only, filter here, e.g.:
    # modules = collections.OrderedDict(
    #     [(k, v) for k, v in modules.items() if k.startswith('lea base LAT')])

    verbose = 2 if '-v' in sys.argv else 0
    for key, module in modules.items():
        if verbose > 0:
            print("=== Benchmark")
            print(repr(module))
            print("=== LLVM")
            print(module.get_ir())
            print("=== Assembly")
            print(module.get_assembly())
        r = module.build_and_execute(repeat=3)
        if verbose > 0:
            print("=== Result")
            pprint.pprint(r)

        # Best runtime in cycles, divided by the number of instructions executed
        cy_per_it = min(r['runtimes']) * r['frequency'] / (
            r['iterations'] * module.parallel * module.serial)
        print('{key:<32} {cy_per_it:.3f} cy/It with {runtime_sum:.4f}s'.format(
            key=key,
            cy_per_it=cy_per_it,
            runtime_sum=sum(r['runtimes'])))

    # Automated analysis examples (disabled):
    # InstructionBenchmark.get_latency(
    #     instruction='vmulpd $1, $0, $0',
    #     dst_operands=(),
    #     dstsrc_operands=(('x', '<4 x double>', _packed_double('1.23e-10')),),
    #     src_operands=(('x', '<4 x double>', _packed_double('3.21e-10')),
    #                   ('x', '<4 x double>', _packed_double('2.13e-10')),),
    #     print_table=True)
    # InstructionBenchmark.get_throughput(
    #     instruction='vmulpd $1, $0, $0',
    #     dst_operands=(),
    #     dstsrc_operands=(('x', '<4 x double>', _packed_double('1.23e-10')),),
    #     src_operands=(('x', '<4 x double>', _packed_double('3.21e-10')),
    #                   ('x', '<4 x double>', _packed_double('2.13e-10')),),
    #     print_table=True)
    #
    # InstructionBenchmark.get_latency(
    #     instruction='nop',
    #     dst_operands=(),
    #     dstsrc_operands=(('r', 'i8', '0'),),
    #     src_operands=(),
    #     print_table=True)
    # InstructionBenchmark.get_throughput(
    #     instruction='nop',
    #     dst_operands=(),
    #     dstsrc_operands=(('r', 'i8', '0'),),
    #     src_operands=(),
    #     print_table=True)


if __name__ == '__main__':
    main()