Files
asmbench/jit.py
2018-04-25 16:25:45 +02:00

656 lines
24 KiB
Python
Executable File

#!/usr/bin/env python3
import ctypes
import sys
import time
import textwrap
import itertools
import random
import collections
import llvmlite.binding as llvm
import psutil
# TODOs
# * API to create test scenarios
# * DSL?
# * Test cases:
# * Instructions:
# * arithmetics \w reg and/or imm.
# * scalar
# * packed
# * lea
# * LOAD / mov \w mem
# * Single Latency
# * Single Throughput
# * Combined Throughput
# * Random Throughput
# * IACA marked binary output generation
# * Fuzzing algorithm
# * CLI
# * C based timing routine? As an extension?
# * make sanity checks during runtime, check for fixed frequency and pinning
class Benchmark:
LLVM2CTYPE = {
'i8': ctypes.c_int8,
'i16': ctypes.c_int16,
'i32': ctypes.c_int32,
'i64': ctypes.c_int64,
'f32': ctypes.c_float,
'f64': ctypes.c_double,
'i8*': ctypes.POINTER(ctypes.c_int8),
'i16*': ctypes.POINTER(ctypes.c_int16),
'i32*': ctypes.POINTER(ctypes.c_int32),
'i64*': ctypes.POINTER(ctypes.c_int64),
'f32*': ctypes.POINTER(ctypes.c_float),
'f64*': ctypes.POINTER(ctypes.c_double),
}
def __init__(self):
self._loop_init = ''
self._ret_llvmtype = 'i64'
self._ret_ctype = self.LLVM2CTYPE[self._ret_llvmtype]
self._function_ctype = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int)
self._iterations = 100000000
# Do interesting work
self._loop_body = textwrap.dedent('''\
%"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]
%"checksum.1" = call i64 asm sideeffect "
add $1, $0",
"=r,i,r" (i64 1, i64 %"checksum")\
''')
# Set %"ret" to something, needs to be a constant or phi function
self._loop_tail = textwrap.dedent('''\
%"ret" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]\
''')
def __repr__(self):
return '{}({})'.format(
self.__class__.__name__,
', '.join(['{}={!r}'.format(k,v) for k,v in self.__dict__.items()
if not k.startswith('_')]))
def get_ir(self):
return textwrap.dedent('''\
define {ret_type} @"test"(i64 %"N")
{{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
{loop_init}
br i1 %"loop_cond", label %"loop", label %"end"
loop:
%"loop_counter" = phi {ret_type} [0, %"entry"], [%"loop_counter.1", %"loop"]
{loop_body}
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
{loop_tail}
ret {ret_type} %"ret"
}}
''').format(
ret_type=self._ret_llvmtype,
loop_init=textwrap.indent(self._loop_init, ' '),
loop_body=textwrap.indent(self._loop_body, ' '),
loop_tail=textwrap.indent(self._loop_tail, ' '))
def prepare_arguments(self):
'''Build argument tuple, to be passed to low level function.'''
return (self._iterations,)
def build_and_execute(self, repeat=10, print_assembly=True):
llvm_module = llvm.parse_assembly(self.get_ir())
llvm_module.verify()
# Compile the module to machine code using MCJIT
tm = llvm.Target.from_default_triple().create_target_machine()
tm.set_asm_verbosity(0)
runtimes = []
with llvm.create_mcjit_compiler(llvm_module, tm) as ee:
ee.finalize_object()
if print_assembly:
print('=== Assembly')
print(tm.emit_assembly(llvm_module))
# Obtain a pointer to the compiled 'sum' - it's the address of its JITed
# code in memory.
cfptr = ee.get_function_address('test')
# To convert an address to an actual callable thing we have to use
# CFUNCTYPE, and specify the arguments & return type.
cfunc = self._function_ctype(cfptr)
# Now 'cfunc' is an actual callable we can invoke
# TODO replace time.clock with a C implemententation for less overhead
# TODO return result in machine readable format
args = self.prepare_arguments()
for i in range(repeat):
start = time.perf_counter()
res = cfunc(*args)
end = time.perf_counter()
runtimes.append(end-start)
return {'iterations': self._iterations,
'runtimes': runtimes,
'frequency': psutil.cpu_freq().current*1e6}
class InstructionBenchmark(Benchmark):
def __init__(self, instruction='addq $1, $0',
dst_operands=(),
dstsrc_operands=(('r','i64', '0'),),
src_operands=(('i','i64', '1'),),
parallelism=10):
'''
Build LLVM IR for arithmetic instruction benchmark without memory references.
Currently only one destination (dst) or combined destination and source (dstsrc) operand
is allowed. Only instruction's operands ($N) refer to the order of opernads found in
dst + dstsrc + src.
'''
Benchmark.__init__(self)
self.instruction = instruction
self.dst_operands = dst_operands
self.dstsrc_operands = dstsrc_operands
self.src_operands = src_operands
self.parallelism = parallelism
self._loop_init = ''
self._loop_body = ''
if len(dst_operands) + len(dstsrc_operands) != 1:
raise NotImplemented("Must have exactly one dst or dstsrc operand.")
if not all([op[0] in 'ir'
for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)]):
raise NotImplemented("This class only supports register and immediate operands.")
self._ret_llvmtype = dst_operands[0][1] if dst_operands else dstsrc_operands[0][1]
# Part 1: PHI functions and initializations
for i, dstsrc_op in enumerate(itertools.chain(dstsrc_operands)):
# constraint code, llvm type string, initial value
if dstsrc_op[0] == 'r':
# register operand
for p in range(self.parallelism):
self._loop_body += (
'%"dstsrc{index}_{p}" = phi {type} '
'[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n').format(
index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p)
else:
raise NotImplemented("Operand type in {!r} is not yet supported.".format(dstsrc_op))
for i, dst_op in enumerate(itertools.chain(dst_operands)):
# No phi functions necessary
# TODO build phi function to switch between source and destination from one iteration
# to next
raise NotImplemented("Destination operand is not yet implemented")
# Part 2: Inline ASM call
for i, dstsrc_op in enumerate(itertools.chain(dstsrc_operands)):
# Build instruction from instruction and operands
# TODO support multiple dstsrc operands
# TODO support dst and dstsrc operands at the same time
# Build constraint string from operands
constraints = ','.join(
['='+dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] +
[sop[0] for sop in itertools.chain(src_operands, dstsrc_operands)])
for p in range(self.parallelism):
operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands]
for i, dop in enumerate(dstsrc_operands):
operands.append('{type} %dstsrc{index}_{p}'.format(type=dop[1], index=i, p=p))
args = ', '.join(operands)
self._loop_body += (
'%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect'
' "{instruction}", "{constraints}" ({args})\n').format(
index=i,
dst_type=dstsrc_op[1],
instruction=instruction,
constraints=constraints,
args=args,
p=p)
for i, dst_op in enumerate(dst_operands):
# FIXME support dst operands
# TODO support dst and dstsrc operands at the same time
raise NotImplemented("Destination operand is not yet implemented")
# Set %"ret" to something, needs to be a constant or phi function
self._loop_tail = textwrap.dedent('''\
%"ret" = phi {type} [{}, %"entry"], [%"dstsrc0_0.out", %"loop"]\
'''.format(dstsrc_operands[0][2], type=dstsrc_operands[0][1]))
class AddressGenerationBenchmark(Benchmark):
def __init__(self,
offset=('i', 'i64', '0x42'),
base=('r', 'i64', '0'),
index=('r', 'i64', '0'),
width=('i', None, '4'),
destination='base',
parallelism=10):
'''
Benchmark for address generation modes.
Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r' (register)
or 'i' (immediate) and initial_value a string.
E.g., ('r', 'i64', '0') or ('i', None, '4')
+--------------------------------+-----------------------------+
| Mode | AT&T |
+--------------------------------+-----------------------------+
| Offset | leal 0x0100, %eax | <- no latency support
| Base | leal (%esi), %eax |
| Offset + Base | leal -8(%ebp), %eax |
| Offset + Index*Width | leal 0x100(,%ebx,4), %eax |
| Offset + Base + Index*Width | leal 0x8(%edx,%ebx,4), %eax |
+--------------------------------+-----------------------------+
OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width
offset: immediate integer (+/-)
base: register
index: register
width: immediate 1,2,4 or 8
'''
Benchmark.__init__(self)
self.offset = offset
self.base = base
self.index = index
self.width = width
self.destination = destination
self.parallelism = parallelism
# Sanity checks:
if bool(index) ^ bool(width):
raise ValueError("Index and width both need to be set, or be None.")
elif index and width:
if width[0] != 'i' or int(width[2]) not in [1,2,4,8]:
raise ValueError("Width may only be immediate 1,2,4 or 8.")
if index[0] != 'r':
raise ValueError("Index must be a register.")
if offset and offset[0] != 'i':
raise ValueError("Offset must be an immediate.")
if base and base[0] != 'r':
raise ValueError("Offset must be a register.")
if not index and not width and not offset and not base:
raise ValueError("Must provide at least an offset or base.")
if destination == 'base' and not base:
raise ValueError("Destination may only be set to 'base' if base is set.")
elif destination == 'index' and not index:
raise ValueError("Destination may only be set to 'index' if index is set.")
elif destination not in ['base', 'index']:
raise ValueError("Destination must be set to 'base' or 'index'.")
if not base and not index:
raise ValueError("Either base or index must be set for latency test to work.")
self._loop_init = ''
self._loop_body = ''
ops = ''
if offset:
ops += offset[2]
if base:
ops += '($0'
if width and index:
ops += ',$1,{}'.format(width[2])
ops += ')'
if destination == 'base':
ops += ', $0'
else: # destination == 'index'
ops += ', $1'
else:
if width and index:
ops += '(,$0,{}), $0'.format(width[2])
ops += ' '
if destination == 'base':
self._ret_llvmtype = base[1]
destination_reg = base
else: # destination == 'index'
self._ret_llvmtype = index[1]
destination_reg = index
# Part 1: PHI function for destination
for p in range(parallelism):
self._loop_body += (
'%"{name}_{p}" = '
'phi {type} [{initial}, %"entry"], [%"{name}_{p}.out", %"loop"]\n').format(
name=destination, type=destination_reg[1], initial=destination_reg[2], p=p)
for p in range(parallelism):
constraints = '=r,r'
if base and index:
constraints += ',r'
if destination == 'index':
args = '{base_type} %"{base_name}_{p}", {index_type} {index_value}'.format(
base_type=base[1], base_name=destination,
index_type=index[1], index_value=index[2], p=p)
else: # destination == 'index':
args ='{base_type} {base_value}, {index_type} %"{index_name}_{p}"'.format(
base_type=base[1], base_value=base[2],
index_type=index[1], index_name=destination, p=p)
else:
args = '{type} %"{name}_{p}"'.format(type=destination_reg[1], name=destination, p=p)
self._loop_body += (
'%"{name}_{p}.out" = call {type} asm sideeffect'
' "lea {ops}", "{constraints}" ({args})\n').format(
name=destination,
type=destination_reg[1],
ops=ops,
constraints=constraints,
args=args,
p=p)
# Set %"ret" to something, needs to be a constant or phi function
self._loop_tail = textwrap.dedent('''\
%"ret" = phi {type} [{initial_value}, %"entry"], [%"{name}_0.out", %"loop"]\
'''.format(name=destination, initial_value=destination_reg[2], type=destination_reg[1]))
class LoadBenchmark(Benchmark):
def __init__(self, chain_length=2048, repeat=100000, structure='linear', parallelism=6):
'''
Benchmark for L1 load using pointer chasing.
*chain_length* is the number of pointers to place in memory.
*repeat* is the number of iterations the chain run through.
*structure* may be 'linear' (1-offsets) or 'random'.
'''
Benchmark.__init__(self)
self._loop_init = ''
self._loop_body = ''
self._loop_tail = ''
element_type = ctypes.POINTER(ctypes.c_int)
self._function_ctype = ctypes.CFUNCTYPE(
ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int)
self.chain_length = chain_length
self.repeat = repeat
self._iterations = chain_length*repeat
self.parallelism = parallelism
self.structure = structure
self._pointer_field = (element_type * chain_length)()
# Initialize pointer field
# Field must represent a ring of pointers
if structure == 'linear':
for i in range(chain_length):
self._pointer_field[i] = ctypes.cast(
ctypes.pointer(self._pointer_field[(i+1)%chain_length]), element_type)
elif structure == 'random':
shuffled_indices = list(range(chain_length))
random.shuffle(shuffled_indices)
for i in range(chain_length):
self._pointer_field[shuffled_indices[i]] = ctypes.cast(
ctypes.pointer(self._pointer_field[shuffled_indices[(i+1)%chain_length]]),
element_type)
else:
raise ValueError("Given structure is not supported. Supported are: "
"linear and random.")
def prepare_arguments(self):
return (self._pointer_field, self.repeat)
def get_ir(self):
'''
Return LLVM IR equivalent of (in case of parallelism == 1):
int test(int** ptrf, int repeat) {
int** p0 = (int**)ptrf[0];
int i = 0;
while(i < N) {
int** p = (int**)*p0;
while(p != p0) {
p = (int**)*p;
}
i++;
}
return i;
}
'''
ret = textwrap.dedent('''
define i32 @test(i32** %"ptrf_0", i32 %"repeats") {
entry:
''')
# Load pointer to ptrf[p] and p0
for p in range(self.parallelism):
if p > 0:
ret += ' %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(p=p)
ret += (
' %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n'
' %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p)
ret += textwrap.dedent('''
%"cmp.entry" = icmp sgt i32 %"repeats", 0
br i1 %"cmp.entry", label %"loop0", label %"end"
loop0:
br label %"loop1"
loop1:
%"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ]
br label %"loop2"
loop2:\n''')
for p in range(self.parallelism):
ret += (' %"p_{p}" = phi i32** '
'[ %"p0_{p}", %"loop1" ], [ %"p_{p}.1", %"loop2" ]\n').format(p=p)
# load p, compare to p0 and or-combine results
for p in range(self.parallelism):
ret += (' %"pp_{p}" = bitcast i32** %"p_{p}" to i32***\n'
' %"p_{p}.1" = load i32**, i32*** %"pp_{p}", align 8\n'
' %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.1", %"p0_{p}"\n').format(p=p)
if p == 1:
ret += (' %"cmp__{p}.loop2" = '
'or i1 %"cmp_{p_before}.loop2", %"cmp_{p}.loop2"\n').format(
p=p, p_before=p-1)
elif p > 1:
ret += (' %"cmp__{p}.loop2" = '
'or i1 %"cmp__{p_before}.loop2", %"cmp_{p}.loop2"\n').format(
p=p, p_before=p-1)
if self.parallelism == 1:
ret += ' br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n'
else:
ret += ' br i1 %"cmp__{p_last}.loop2", label %"loop3", label %"loop2"\n'.format(
p_last=self.parallelism-1)
ret += textwrap.dedent('''
loop3:
%"i.1" = add i32 %"i", 1
%"cmp.loop3" = icmp eq i32 %"i.1", %"repeats"
br i1 %"cmp.loop3", label %"end", label %"loop1"
end:
%"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ]
ret i32 %"ret"
}''')
return ret
if __name__ == '__main__':
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
llvm.initialize_native_asmparser()
modules = collections.OrderedDict()
# immediate source
modules['add i64 r64 LAT'] = InstructionBenchmark(
instruction='addq $1, $0',
dst_operands=(),
dstsrc_operands=(('r','i64', '0'),),
src_operands=(('i','i64', '1'),),
parallelism=1)
# register source
modules['add r64 r64 LAT'] = InstructionBenchmark(
instruction='addq $1, $0',
dst_operands=(),
dstsrc_operands=(('r','i64', '0'),),
src_operands=(('r','i64', '1'),),
parallelism=1)
# multiple instructions
modules['4xadd i64 r64 LAT'] = InstructionBenchmark(
instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0',
dst_operands=(),
dstsrc_operands=(('r','i64', '0'),),
src_operands=(('i','i64', '1'),),
parallelism=1)
# immediate source
modules['add i64 r64 TP'] = InstructionBenchmark(
instruction='addq $1, $0',
dst_operands=(),
dstsrc_operands=(('r','i64', '0'),),
src_operands=(('i','i64', '1'),),
parallelism=10)
# register source
modules['add r64 r64 TP'] = InstructionBenchmark(
instruction='addq $1, $0',
dst_operands=(),
dstsrc_operands=(('r','i64', '0'),),
src_operands=(('r','i64', '1'),),
parallelism=10)
# multiple instructions
modules['4xadd i64 r64 TP'] = InstructionBenchmark(
instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0',
dst_operands=(),
dstsrc_operands=(('r','i64', '0'),),
src_operands=(('i','i64', '1'),),
parallelism=10)
modules['lea base LAT'] = AddressGenerationBenchmark(
offset=None,
base=('r', 'i64', '666'),
index=None,
width=None,
destination='base',
parallelism=1)
modules['lea index*width LAT'] = AddressGenerationBenchmark(
offset=None,
base=None,
index=('r', 'i64', '1'),
width=('i', None, '4'),
destination='index',
parallelism=1)
modules['lea offset+index*width LAT'] = AddressGenerationBenchmark(
offset=('i', 'i64', '-0x8'),
base=None,
index=('r', 'i64', '51'),
width=('i', None, '4'),
destination='index',
parallelism=1)
modules['lea base+index*width LAT'] = AddressGenerationBenchmark(
offset=None,
base=('r', 'i64', '23'),
index=('r', 'i64', '12'),
width=('i', None, '4'),
destination='base',
parallelism=1)
modules['lea base+offset+index*width LAT'] = AddressGenerationBenchmark(
offset=('i', None, '42'),
base=('r', 'i64', '23'),
index=('r', 'i64', '12'),
width=('i', None, '4'),
destination='base',
parallelism=1)
modules['lea base LAT'] = AddressGenerationBenchmark(
offset=None,
base=('r', 'i64', '666'),
index=None,
width=None,
destination='base',
parallelism=10)
modules['lea index*width LAT'] = AddressGenerationBenchmark(
offset=None,
base=None,
index=('r', 'i64', '1'),
width=('i', None, '4'),
destination='index',
parallelism=10)
modules['lea offset+index*width LAT'] = AddressGenerationBenchmark(
offset=('i', 'i64', '-0x8'),
base=None,
index=('r', 'i64', '51'),
width=('i', None, '4'),
destination='index',
parallelism=10)
modules['lea base+index*width LAT'] = AddressGenerationBenchmark(
offset=None,
base=('r', 'i64', '23'),
index=('r', 'i64', '12'),
width=('i', None, '4'),
destination='base',
parallelism=10)
modules['lea base+offset+index*width LAT'] = AddressGenerationBenchmark(
offset=('i', None, '42'),
base=('r', 'i64', '23'),
index=('r', 'i64', '12'),
width=('i', None, '4'),
destination='base',
parallelism=10)
modules['LD linear LAT'] = LoadBenchmark(
chain_length=2048, # 2048 * 8B = 16kB
repeat=100000,
structure='linear',
parallelism=1)
modules['LD random LAT'] = LoadBenchmark(
chain_length=2048, # 2048 * 8B = 16kB
repeat=100000,
structure='random',
parallelism=1)
modules['LD linear TP'] = LoadBenchmark(
chain_length=2048, # 2048 * 8B = 16kB
repeat=100000,
structure='linear',
parallelism=10)
modules['LD random TP'] = LoadBenchmark(
chain_length=2048, # 2048 * 8B = 16kB
repeat=100000,
structure='random',
parallelism=10)
verbose = 0
for key, module in modules.items():
if verbose > 0:
print("=== LLVM")
print(module.get_ir())
r = module.build_and_execute(print_assembly=verbose > 0)
if module.parallelism > 1:
cy_per_it = min(r['runtimes'])/r['iterations']*r['frequency']/module.parallelism
else:
cy_per_it = min(r['runtimes'])/r['iterations']*r['frequency']
print('{key:<32} {cy_per_it:.2f} cy/It with {runtime_sum:.4f}s'.format(
key=key,
module=module,
cy_per_it=cy_per_it,
runtime_sum=sum(r['runtimes'])))