mirror of
https://github.com/RRZE-HPC/asmbench.git
synced 2025-09-08 01:50:06 +02:00
656 lines
24 KiB
Python
Executable File
656 lines
24 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import ctypes
|
|
import sys
|
|
import time
|
|
import textwrap
|
|
import itertools
|
|
import random
|
|
import collections
|
|
|
|
import llvmlite.binding as llvm
|
|
import psutil
|
|
|
|
# TODOs
|
|
# * API to create test scenarios
|
|
# * DSL?
|
|
# * Test cases:
|
|
# * Instructions:
|
|
# * arithmetic w/ reg and/or imm.
|
|
# * scalar
|
|
# * packed
|
|
# * lea
|
|
# * LOAD / mov w/ mem
|
|
# * Single Latency
|
|
# * Single Throughput
|
|
# * Combined Throughput
|
|
# * Random Throughput
|
|
# * IACA marked binary output generation
|
|
# * Fuzzing algorithm
|
|
# * CLI
|
|
# * C based timing routine? As an extension?
|
|
# * make sanity checks during runtime, check for fixed frequency and pinning
|
|
|
|
|
|
class Benchmark:
    '''
    Base class for micro-benchmarks that are JIT-compiled from LLVM IR.

    The generated function ``@"test"`` takes the iteration count ``N`` (i64), runs
    ``_loop_body`` N times and returns ``%"ret"``, which ``_loop_tail`` has to define.
    Subclasses customize the benchmark by replacing ``_loop_init``, ``_loop_body``,
    ``_loop_tail`` and ``_ret_llvmtype``, or by overriding :meth:`get_ir` and
    :meth:`prepare_arguments`.
    '''

    # Maps type strings used by this module to the ctypes type of matching width.
    LLVM2CTYPE = {
        'i8': ctypes.c_int8,
        'i16': ctypes.c_int16,
        'i32': ctypes.c_int32,
        'i64': ctypes.c_int64,
        'f32': ctypes.c_float,
        'f64': ctypes.c_double,
        'i8*': ctypes.POINTER(ctypes.c_int8),
        'i16*': ctypes.POINTER(ctypes.c_int16),
        'i32*': ctypes.POINTER(ctypes.c_int32),
        'i64*': ctypes.POINTER(ctypes.c_int64),
        'f32*': ctypes.POINTER(ctypes.c_float),
        'f64*': ctypes.POINTER(ctypes.c_double),
    }

    def __init__(self):
        '''Set up a default benchmark: one dependent ``add`` per loop iteration.'''
        self._loop_init = ''
        self._ret_llvmtype = 'i64'
        self._ret_ctype = self.LLVM2CTYPE[self._ret_llvmtype]
        # get_ir() declares `<ret_type> @"test"(i64 %"N")`, so the foreign function
        # prototype has to use 64 bit integers as well (it used to be c_int/c_int).
        self._function_ctype = ctypes.CFUNCTYPE(self._ret_ctype, ctypes.c_int64)
        self._iterations = 100000000

        # Do interesting work
        self._loop_body = textwrap.dedent('''\
            %"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]
            %"checksum.1" = call i64 asm sideeffect "
                add $1, $0",
                "=r,i,r" (i64 1, i64 %"checksum")\
        ''')

        # Set %"ret" to something, needs to be a constant or phi function
        self._loop_tail = textwrap.dedent('''\
            %"ret" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]\
        ''')

    def __repr__(self):
        '''Return ``ClassName(attr=value, ...)`` listing all public instance attributes.'''
        public_attributes = ', '.join(
            '{}={!r}'.format(k, v) for k, v in self.__dict__.items() if not k.startswith('_'))
        return '{}({})'.format(self.__class__.__name__, public_attributes)

    def get_ir(self):
        '''Return the LLVM IR of the benchmark function ``@"test"`` as a string.'''
        # The loop counter is always i64 (it is compared against the i64 argument N);
        # only the function's return value follows _ret_llvmtype.
        return textwrap.dedent('''\
            define {ret_type} @"test"(i64 %"N")
            {{
            entry:
              %"loop_cond" = icmp slt i64 0, %"N"
            {loop_init}
              br i1 %"loop_cond", label %"loop", label %"end"

            loop:
              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {loop_body}
              %"loop_counter.1" = add i64 %"loop_counter", 1
              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
              br i1 %"loop_cond.1", label %"loop", label %"end"

            end:
            {loop_tail}
              ret {ret_type} %"ret"
            }}
            ''').format(
                ret_type=self._ret_llvmtype,
                loop_init=textwrap.indent(self._loop_init, '  '),
                loop_body=textwrap.indent(self._loop_body, '  '),
                loop_tail=textwrap.indent(self._loop_tail, '  '))

    def prepare_arguments(self):
        '''Build argument tuple, to be passed to low level function.'''
        return (self._iterations,)

    def build_and_execute(self, repeat=10, print_assembly=True):
        '''
        JIT-compile the IR from :meth:`get_ir` and time *repeat* calls of ``test``.

        If *print_assembly* is true, the generated assembly is printed to stdout.

        Returns a dict with the keys ``'iterations'`` (value of ``_iterations``),
        ``'runtimes'`` (list of wall-clock seconds, one entry per call) and
        ``'frequency'`` (``psutil.cpu_freq().current`` scaled by 1e6).
        '''
        llvm_module = llvm.parse_assembly(self.get_ir())
        llvm_module.verify()

        # Compile the module to machine code using MCJIT
        tm = llvm.Target.from_default_triple().create_target_machine()
        tm.set_asm_verbosity(0)
        runtimes = []
        with llvm.create_mcjit_compiler(llvm_module, tm) as ee:
            ee.finalize_object()
            if print_assembly:
                print('=== Assembly')
                print(tm.emit_assembly(llvm_module))

            # Obtain a pointer to the compiled 'test' - it's the address of its JITed
            # code in memory.
            cfptr = ee.get_function_address('test')

            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self._function_ctype(cfptr)

            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            args = self.prepare_arguments()
            for _ in range(repeat):
                start = time.perf_counter()
                cfunc(*args)
                end = time.perf_counter()
                runtimes.append(end - start)

        return {'iterations': self._iterations,
                'runtimes': runtimes,
                'frequency': psutil.cpu_freq().current*1e6}
|
|
|
|
|
|
class InstructionBenchmark(Benchmark):
    '''Benchmark of a register/immediate instruction, wrapped into LLVM inline assembly.'''

    def __init__(self, instruction='addq $1, $0',
                 dst_operands=(),
                 dstsrc_operands=(('r','i64', '0'),),
                 src_operands=(('i','i64', '1'),),
                 parallelism=10):
        '''
        Build LLVM IR for arithmetic instruction benchmark without memory references.

        Every operand is a tuple ``(constraint code, llvm type string, initial value)``,
        e.g. ``('r', 'i64', '0')``. Only the constraint codes 'r' (register) and
        'i' (immediate) are accepted.

        Currently only one destination (dst) or combined destination and source (dstsrc)
        operand is allowed, and dst operands are not implemented yet. The instruction's
        operands ($N) are numbered in the order of the generated constraint string:
        dst + dstsrc outputs first, then src, then the dstsrc inputs.

        *parallelism* is the number of independent copies of the instruction (each with
        its own loop-carried value) that are placed in the loop body.

        Raises NotImplementedError for unsupported operand combinations.
        '''
        Benchmark.__init__(self)
        self.instruction = instruction
        self.dst_operands = dst_operands
        self.dstsrc_operands = dstsrc_operands
        self.src_operands = src_operands
        self.parallelism = parallelism
        self._loop_init = ''
        self._loop_body = ''
        if len(dst_operands) + len(dstsrc_operands) != 1:
            raise NotImplementedError("Must have exactly one dst or dstsrc operand.")
        # Compare against a tuple: `op[0] in 'ir'` would be a substring test and
        # also let '' and 'ir' through.
        if not all(op[0] in ('i', 'r')
                   for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)):
            raise NotImplementedError(
                "This class only supports register and immediate operands.")

        self._ret_llvmtype = dst_operands[0][1] if dst_operands else dstsrc_operands[0][1]

        body = []

        # Part 1: PHI functions and initializations
        for i, dstsrc_op in enumerate(dstsrc_operands):
            # constraint code, llvm type string, initial value
            if dstsrc_op[0] != 'r':
                raise NotImplementedError(
                    "Operand type in {!r} is not yet supported.".format(dstsrc_op))
            # register operand: one loop-carried value per parallel instance
            for p in range(self.parallelism):
                body.append(
                    '%"dstsrc{index}_{p}" = phi {type} '
                    '[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n'.format(
                        index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p))

        if dst_operands:
            # No phi functions necessary
            # TODO build phi function to switch between source and destination from one
            #      iteration to next
            # TODO support dst and dstsrc operands at the same time
            raise NotImplementedError("Destination operand is not yet implemented")

        # Part 2: Inline ASM call
        # Build constraint string from operands (does not depend on the loop variables)
        constraints = ','.join(
            ['='+dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] +
            [sop[0] for sop in itertools.chain(src_operands, dstsrc_operands)])

        # TODO support multiple dstsrc operands
        for i, dstsrc_op in enumerate(dstsrc_operands):
            for p in range(self.parallelism):
                # Build argument list: sources first, then the loop-carried dstsrc values
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2])
                            for sop in src_operands]
                # Separate index name, so that `i` of the enclosing loop is not clobbered
                for j, dop in enumerate(dstsrc_operands):
                    operands.append(
                        '{type} %dstsrc{index}_{p}'.format(type=dop[1], index=j, p=p))
                args = ', '.join(operands)

                body.append(
                    '%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n'.format(
                        index=i,
                        dst_type=dstsrc_op[1],
                        instruction=instruction,
                        constraints=constraints,
                        args=args,
                        p=p))

        self._loop_body = ''.join(body)

        # Set %"ret" to something, needs to be a constant or phi function
        self._loop_tail = textwrap.dedent('''\
            %"ret" = phi {type} [{initial}, %"entry"], [%"dstsrc0_0.out", %"loop"]\
        '''.format(initial=dstsrc_operands[0][2], type=dstsrc_operands[0][1]))
|
|
|
|
|
|
class AddressGenerationBenchmark(Benchmark):
    '''Benchmark of the ``lea`` instruction with different addressing modes.'''

    def __init__(self,
                 offset=('i', 'i64', '0x42'),
                 base=('r', 'i64', '0'),
                 index=('r', 'i64', '0'),
                 width=('i', None, '4'),
                 destination='base',
                 parallelism=10):
        '''
        Benchmark for address generation modes.

        Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r' (register)
        or 'i' (immediate) and initial_value a string.
        E.g., ('r', 'i64', '0') or ('i', None, '4')

        +--------------------------------+-----------------------------+
        | Mode                           | AT&T                        |
        +--------------------------------+-----------------------------+
        | Offset                         | leal 0x0100, %eax           | <- no latency support
        | Base                           | leal (%esi), %eax           |
        | Offset + Base                  | leal -8(%ebp), %eax         |
        | Offset + Index*Width           | leal 0x100(,%ebx,4), %eax   |
        | Offset + Base + Index*Width    | leal 0x8(%edx,%ebx,4), %eax |
        +--------------------------------+-----------------------------+
        OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width
        offset: immediate integer (+/-)
        base: register
        index: register
        width: immediate 1,2,4 or 8

        *destination* is 'base' or 'index' and selects the register that receives the
        result and carries it into the next loop iteration.
        *parallelism* is the number of independent lea instructions per loop iteration.

        Raises ValueError if the arguments do not form a supported combination.
        '''
        Benchmark.__init__(self)
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width
        self.destination = destination
        self.parallelism = parallelism

        # Sanity checks:
        if bool(index) ^ bool(width):
            raise ValueError("Index and width both need to be set, or be None.")
        elif index and width:
            if width[0] != 'i' or int(width[2]) not in [1,2,4,8]:
                raise ValueError("Width may only be immediate 1,2,4 or 8.")
            if index[0] != 'r':
                raise ValueError("Index must be a register.")

        if offset and offset[0] != 'i':
            raise ValueError("Offset must be an immediate.")
        if base and base[0] != 'r':
            raise ValueError("Base must be a register.")

        if not index and not width and not offset and not base:
            raise ValueError("Must provide at least an offset or base.")

        # These checks also guarantee that base or index (whichever is the destination)
        # is set, which the loop-carried dependency below relies on.
        if destination == 'base' and not base:
            raise ValueError("Destination may only be set to 'base' if base is set.")
        elif destination == 'index' and not index:
            raise ValueError("Destination may only be set to 'index' if index is set.")
        elif destination not in ['base', 'index']:
            raise ValueError("Destination must be set to 'base' or 'index'.")

        self._loop_init = ''

        # Assemble the AT&T operand string of the lea instruction.
        # NOTE(review): the template only references $0 and $1, i.e. it also reads the
        # output operand ($0); this presumably relies on the register allocator reusing
        # the input register for the output - confirm.
        ops = ''
        if offset:
            ops += offset[2]
        if base:
            ops += '($0'
            if width and index:
                ops += ',$1,{}'.format(width[2])
            ops += ')'

            if destination == 'base':
                ops += ', $0'
            else:  # destination == 'index'
                ops += ', $1'
        else:
            if width and index:
                ops += '(,$0,{}), $0'.format(width[2])
        ops += ' '

        if destination == 'base':
            destination_reg = base
        else:  # destination == 'index'
            destination_reg = index
        self._ret_llvmtype = destination_reg[1]

        body = []

        # Part 1: PHI function for destination
        for p in range(parallelism):
            body.append(
                '%"{name}_{p}" = '
                'phi {type} [{initial}, %"entry"], [%"{name}_{p}.out", %"loop"]\n'.format(
                    name=destination, type=destination_reg[1], initial=destination_reg[2], p=p))

        # Part 2: Inline ASM call
        constraints = '=r,r'
        if base and index:
            constraints += ',r'
        for p in range(parallelism):
            if base and index:
                # The destination register is fed by its phi value, the other register
                # by its constant initial value. (The condition used to be inverted,
                # passing the phi value in the slot of the wrong register.)
                if destination == 'base':
                    args = '{base_type} %"{base_name}_{p}", {index_type} {index_value}'.format(
                        base_type=base[1], base_name=destination,
                        index_type=index[1], index_value=index[2], p=p)
                else:  # destination == 'index'
                    args = '{base_type} {base_value}, {index_type} %"{index_name}_{p}"'.format(
                        base_type=base[1], base_value=base[2],
                        index_type=index[1], index_name=destination, p=p)
            else:
                args = '{type} %"{name}_{p}"'.format(
                    type=destination_reg[1], name=destination, p=p)

            body.append(
                '%"{name}_{p}.out" = call {type} asm sideeffect'
                ' "lea {ops}", "{constraints}" ({args})\n'.format(
                    name=destination,
                    type=destination_reg[1],
                    ops=ops,
                    constraints=constraints,
                    args=args,
                    p=p))

        self._loop_body = ''.join(body)

        # Set %"ret" to something, needs to be a constant or phi function
        self._loop_tail = textwrap.dedent('''\
            %"ret" = phi {type} [{initial_value}, %"entry"], [%"{name}_0.out", %"loop"]\
        '''.format(name=destination, initial_value=destination_reg[2], type=destination_reg[1]))
|
|
|
|
|
|
class LoadBenchmark(Benchmark):
    '''Pointer-chasing load benchmark on a ring of pointers held in a ctypes array.'''

    def __init__(self, chain_length=2048, repeat=100000, structure='linear', parallelism=6):
        '''
        Benchmark for L1 load using pointer chasing.

        *chain_length* is the number of pointers to place in memory.
        *repeat* is the number of iterations the chain run through.
        *structure* may be 'linear' (1-offsets) or 'random'.
        *parallelism* is the number of independent chases; chase p starts at element p
        of the pointer field.

        Raises ValueError for an unknown structure, an empty chain, or a parallelism
        outside of 1..chain_length.
        '''
        Benchmark.__init__(self)
        # Chase p loads its start pointer from element p of the field, so more chases
        # than elements would make the generated code read past the end of the field.
        if chain_length < 1:
            raise ValueError("chain_length must be at least 1.")
        if not 1 <= parallelism <= chain_length:
            raise ValueError("parallelism must be between 1 and chain_length.")

        self._loop_init = ''
        self._loop_body = ''
        self._loop_tail = ''
        element_type = ctypes.POINTER(ctypes.c_int)
        self._function_ctype = ctypes.CFUNCTYPE(
            ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int)
        self.chain_length = chain_length
        self.repeat = repeat
        self._iterations = chain_length*repeat
        self.parallelism = parallelism
        self.structure = structure
        self._pointer_field = (element_type * chain_length)()

        # Initialize pointer field
        # Field must represent a ring of pointers
        if structure == 'linear':
            order = list(range(chain_length))
        elif structure == 'random':
            order = list(range(chain_length))
            random.shuffle(order)
        else:
            raise ValueError("Given structure is not supported. Supported are: "
                             "linear and random.")

        # Element order[i] points to element order[i+1]; the last one closes the ring.
        for i in range(chain_length):
            self._pointer_field[order[i]] = ctypes.cast(
                ctypes.pointer(self._pointer_field[order[(i+1)%chain_length]]),
                element_type)

    def prepare_arguments(self):
        '''Return the pointer field and the repeat count, the arguments of ``test``.'''
        return (self._pointer_field, self.repeat)

    def get_ir(self):
        '''
        Return LLVM IR equivalent of (in case of parallelism == 1):

        int test(int** ptrf, int repeat) {
            int** p0 = (int**)ptrf[0];
            if(repeat <= 0) return 0;
            int i = 0;
            do {
                int** p = p0;
                do {
                    p = (int**)*p;
                } while(p != p0);
                i++;
            } while(i != repeat);
            return repeat;
        }

        With parallelism > 1, chase k starts at ptrf[k] and the inner loop is left as
        soon as any of the chases is back at its start pointer.
        '''
        parts = [textwrap.dedent('''
            define i32 @test(i32** %"ptrf_0", i32 %"repeats") {
            entry:
            ''')]

        # Load pointer to ptrf[p] and p0
        for p in range(self.parallelism):
            if p > 0:
                parts.append(
                    '  %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(p=p))
            parts.append((
                '  %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n'
                '  %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p))

        parts.append(textwrap.dedent('''
              %"cmp.entry" = icmp sgt i32 %"repeats", 0
              br i1 %"cmp.entry", label %"loop0", label %"end"

            loop0:
              br label %"loop1"

            loop1:
              %"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ]
              br label %"loop2"

            loop2:\n'''))

        # All phi functions have to come first in the basic block
        for p in range(self.parallelism):
            parts.append(('  %"p_{p}" = phi i32** '
                          '[ %"p0_{p}", %"loop1" ], [ %"p_{p}.1", %"loop2" ]\n').format(p=p))

        # load p, compare to p0 and or-combine results
        for p in range(self.parallelism):
            parts.append((
                '  %"pp_{p}" = bitcast i32** %"p_{p}" to i32***\n'
                '  %"p_{p}.1" = load i32**, i32*** %"pp_{p}", align 8\n'
                '  %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.1", %"p0_{p}"\n').format(p=p))
            if p == 1:
                # first combination: both inputs are plain comparison results
                parts.append(('  %"cmp__{p}.loop2" = '
                              'or i1 %"cmp_{p_before}.loop2", %"cmp_{p}.loop2"\n').format(
                                  p=p, p_before=p-1))
            elif p > 1:
                # later combinations: chain onto the previous or-result
                parts.append(('  %"cmp__{p}.loop2" = '
                              'or i1 %"cmp__{p_before}.loop2", %"cmp_{p}.loop2"\n').format(
                                  p=p, p_before=p-1))

        if self.parallelism == 1:
            parts.append('  br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n')
        else:
            parts.append(
                '  br i1 %"cmp__{p_last}.loop2", label %"loop3", label %"loop2"\n'.format(
                    p_last=self.parallelism-1))

        parts.append(textwrap.dedent('''
            loop3:
              %"i.1" = add i32 %"i", 1
              %"cmp.loop3" = icmp eq i32 %"i.1", %"repeats"
              br i1 %"cmp.loop3", label %"end", label %"loop1"

            end:
              %"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ]
              ret i32 %"ret"
            }'''))
        return ''.join(parts)
|
|
|
|
|
|
if __name__ == '__main__':
    # Run a fixed set of latency (LAT) and throughput (TP) benchmarks and print the
    # measured cycles per instruction for each of them.
    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()
    llvm.initialize_native_asmparser()

    modules = collections.OrderedDict()

    # Latency runs use a single dependency chain, throughput runs ten independent ones.
    # Every benchmark is registered under its own key; the throughput lea runs used to
    # reuse the 'LAT' keys and thereby replaced the latency runs.
    modes = (('LAT', 1), ('TP', 10))

    for suffix, parallelism in modes:
        # immediate source
        modules['add i64 r64 ' + suffix] = InstructionBenchmark(
            instruction='addq $1, $0',
            dst_operands=(),
            dstsrc_operands=(('r','i64', '0'),),
            src_operands=(('i','i64', '1'),),
            parallelism=parallelism)

        # register source
        modules['add r64 r64 ' + suffix] = InstructionBenchmark(
            instruction='addq $1, $0',
            dst_operands=(),
            dstsrc_operands=(('r','i64', '0'),),
            src_operands=(('r','i64', '1'),),
            parallelism=parallelism)

        # multiple instructions
        modules['4xadd i64 r64 ' + suffix] = InstructionBenchmark(
            instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0',
            dst_operands=(),
            dstsrc_operands=(('r','i64', '0'),),
            src_operands=(('i','i64', '1'),),
            parallelism=parallelism)

    lea_configurations = (
        ('lea base', dict(
            offset=None,
            base=('r', 'i64', '666'),
            index=None,
            width=None,
            destination='base')),
        ('lea index*width', dict(
            offset=None,
            base=None,
            index=('r', 'i64', '1'),
            width=('i', None, '4'),
            destination='index')),
        ('lea offset+index*width', dict(
            offset=('i', 'i64', '-0x8'),
            base=None,
            index=('r', 'i64', '51'),
            width=('i', None, '4'),
            destination='index')),
        ('lea base+index*width', dict(
            offset=None,
            base=('r', 'i64', '23'),
            index=('r', 'i64', '12'),
            width=('i', None, '4'),
            destination='base')),
        ('lea base+offset+index*width', dict(
            offset=('i', None, '42'),
            base=('r', 'i64', '23'),
            index=('r', 'i64', '12'),
            width=('i', None, '4'),
            destination='base')),
    )
    for suffix, parallelism in modes:
        for name, arguments in lea_configurations:
            modules['{} {}'.format(name, suffix)] = AddressGenerationBenchmark(
                parallelism=parallelism, **arguments)

    for suffix, parallelism in modes:
        for structure in ('linear', 'random'):
            modules['LD {} {}'.format(structure, suffix)] = LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                repeat=100000,
                structure=structure,
                parallelism=parallelism)

    verbose = 0
    for key, module in modules.items():
        if verbose > 0:
            print("=== LLVM")
            print(module.get_ir())
        r = module.build_and_execute(print_assembly=verbose > 0)

        # Best runtime -> cycles per loop iteration; with several parallel instances
        # per iteration, additionally normalize to a single instance.
        cy_per_it = min(r['runtimes'])/r['iterations']*r['frequency']
        if module.parallelism > 1:
            cy_per_it /= module.parallelism
        print('{key:<32} {cy_per_it:.2f} cy/It with {runtime_sum:.4f}s'.format(
            key=key,
            cy_per_it=cy_per_it,
            runtime_sum=sum(r['runtimes'])))
|
|
|