renamed to asmbench

This commit is contained in:
Julian Hammer
2018-09-25 10:23:40 +02:00
parent 3033246d4e
commit dbbd37585a
12 changed files with 29 additions and 2056 deletions

6
.idea/other.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PySciProjectComponent">
<option name="PY_SCI_VIEW_SUGGESTED" value="true" />
</component>
</project>

View File

@@ -1,6 +1,17 @@
# pyasmjit
# asmbench
An instruction latency and throughput benchmarking framework for out-of-order architectures.
A benchmark toolkit for assembly instructions using the LLVM JIT.
## Usage
To benchmark latency and throughput of a 64-bit integer add, use the following command:
```
python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}'
```
To benchmark two instructions interleaved use this:
```
python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}' 'sub {src:i64:r}, {srcdst:i64:r}'
```
To find out more add `-h` for help and `-v` for verbose mode.

View File

View File

@@ -1,42 +0,0 @@
#!/usr/bin/env python3
import argparse
import llvmlite.binding as llvm
from . import op, bench
def main(argv=None):
    """
    Command line entry point: parse arguments, run the benchmarks and print the results.

    :param argv: optional list of argument strings. ``None`` (the default) lets argparse
                 read the arguments from ``sys.argv``.

    Prints the measured latency and throughput (in cycles) to stdout.
    """
    parser = argparse.ArgumentParser(description='Assembly Instruction Benchmark Toolkit')
    # parser.add_argument('mode', metavar='MODE', type=str, choices=['latency', 'throughput'])
    # Each positional argument is converted to an instruction object while parsing.
    parser.add_argument('instructions', metavar='INSTR', type=op.Instruction.from_string, nargs='+',
                        help='instruction declaration, e.g., "add {src:i32:r}, {srcdst:i32:r}"')
    parser.add_argument('--serialize', action='store_true',
                        help='Serialize instructions.')
    parser.add_argument('--latency-serial', '-l', type=int, default=8,
                        help='length of serial chain for each instruction in latency benchmark')
    parser.add_argument('--parallel', '-p', type=int, default=10,
                        help='number of parallel instances of serial chains in throughput '
                             'benchmark')
    parser.add_argument('--throughput-serial', '-t', type=int, default=8,
                        help='length of serial instances of serial chains in throughput benchmark')
    parser.add_argument('--iaca', type=str, default=None,
                        help='Compare throughput measurement with IACA analysis, pass '
                             'micro-architecture abbreviation (e.g., SNB, IVB, HSW, SKL, SKX)')
    parser.add_argument("--verbose", "-v", action="count", default=0,
                        help="increase output verbosity")
    args = parser.parse_args(argv)

    # LLVM must be initialized before any benchmark is compiled.
    bench.setup_llvm()
    lat, tp = bench.bench_instructions(args.instructions,
                                       serial_factor=args.latency_serial,
                                       parallel_factor=args.parallel,
                                       throughput_serial_factor=args.throughput_serial,
                                       serialize=args.serialize,
                                       verbosity=args.verbose,
                                       iaca_comparison=args.iaca)
    print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))
# Run the command line interface only when this module is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -1,398 +0,0 @@
#!/usr/bin/env python3
import ctypes
import time
import textwrap
import itertools
import re
from pprint import pprint
import tempfile
import subprocess
import sys
import llvmlite.binding as llvm
import psutil
try:
from kerncraft import iaca
except ImportError:
iaca = None
from . import op
def setup_llvm():
    """Run llvmlite's initialization routines: core, native target, asm printer and parser."""
    initializers = (
        llvm.initialize,
        llvm.initialize_native_target,
        llvm.initialize_native_asmprinter,
        llvm.initialize_native_asmparser,
    )
    # Called strictly in the order listed above.
    for initialize in initializers:
        initialize()
def uniquify(l):
    """Return a list with the items of *l*, duplicates dropped, first occurrences kept in order."""
    already_seen = set()
    unique_items = []
    for item in l:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items
class Benchmark:
    """
    Base class for JIT-compiled micro benchmarks.

    Subclasses supply LLVM IR through :meth:`build_ir`. The IR has to define a function named
    ``test`` matching :meth:`get_function_ctype`; :meth:`build_and_execute` compiles it with
    MCJIT and times it.
    """

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all attributes not starting with ``_``."""
        public_attrs = ', '.join(
            '{}={!r}'.format(k, v) for k, v in self.__dict__.items() if not k.startswith('_'))
        return '{}({})'.format(self.__class__.__name__, public_attrs)

    @staticmethod
    def prepare_arguments(previous_args=None, time_factor=1.0):
        """
        Build argument tuple, to be passed to low level function.

        :param previous_args: argument tuple of the previous run, or None for the first run
        :param time_factor: factor by which the iteration count should be scaled

        :return: 1-tuple with the iteration count; never smaller than one iteration
        """
        if previous_args is None:
            return 10000000,
        try:
            scaled = int(previous_args[0] * time_factor)
        except OverflowError:
            # time_factor was infinite; grow by a fixed factor instead.
            return previous_args[0] * 10,
        # Zero iterations can not be timed and would break the return value check in
        # build_and_execute, so always request at least one iteration.
        return max(1, scaled),

    @staticmethod
    def get_iterations(args) -> int:
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def build_ir(self, iaca_marker=False):
        """
        Return LLVM IR code of the benchmark as a string; must be implemented by subclasses.

        :param iaca_marker: passed by :meth:`get_llvm_module`; the base class ignores it
        """
        raise NotImplementedError()

    def get_llvm_module(self, iaca_marker=False):
        """Build and return LLVM module from LLVM IR code."""
        ir = self.build_ir(iaca_marker=iaca_marker)
        return llvm.parse_assembly(ir)

    def get_target_machine(self):
        """Instantiate and return target machine."""
        features = llvm.get_host_cpu_features().flatten()
        cpu = ''  # llvm.get_host_cpu_name()  # Work around until ryzen problems are fixed
        return llvm.Target.from_default_triple().create_target_machine(
            cpu=cpu, features=features, opt=3)

    def get_assembly(self, iaca_marker=False):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
        asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
        # Remove double comments
        asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
        return asm

    def get_function_ctype(self):
        """Return the ctypes prototype of the compiled function: ``int64 f(int64)``."""
        return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)

    def get_iaca_analysis(self, arch):
        """
        Compile and return IACA analysis.

        :param arch: micro-architecture abbreviation handed to kerncraft's IACA wrapper

        :raises ValueError: if kerncraft (and with it IACA support) is not installed
        """
        if iaca is None:
            raise ValueError("kerncraft not installed. IACA analysis is not supported.")
        tm = self.get_target_machine()
        # The context manager closes (and thereby removes) the temporary object file once the
        # analysis is done, instead of leaving that to garbage collection.
        with tempfile.NamedTemporaryFile("wb") as tmpf:
            tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
            tmpf.flush()
            return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        """
        JIT-compile the benchmark, calibrate the iteration count and time *repeat* runs.

        During the first measurement the iteration count is rescaled until the runtime lies
        within [*min_elapsed*, *max_elapsed*] seconds; afterwards it is kept fixed.

        :return: dict with keys 'iterations', 'arguments', 'runtimes' (seconds), 'frequency'
                 (psutil's current CPU frequency times 1e6) and 'returned'

        :raises RuntimeError: if the compiled function returns an unexpected value or more than
                              ten attempts were needed for a single measurement
        """
        # Compile the module to machine code using MCJIT
        tm = self.get_target_machine()
        runtimes = []
        return_values = []
        args = self.prepare_arguments()
        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
            ee.finalize_object()
            # Obtain a pointer to the compiled 'test' function - it's the address of its JITed
            # code in memory.
            cfptr = ee.get_function_address('test')
            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self.get_function_ctype()(cfptr)
            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            fixed_args = False
            for i in range(repeat):
                tries = 0
                while True:
                    if tries > 10:
                        raise RuntimeError("Unable to measure non-zero runtime.")
                    tries += 1
                    start = time.perf_counter()
                    ret = cfunc(*args)
                    end = time.perf_counter()
                    elapsed = end - start
                    # The compiled function is expected to return N-1.
                    if ret != args[0] - 1:
                        raise RuntimeError(
                            "Return value {} is invalid, should have been {}.".format(
                                ret, args[0] - 1))
                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
                        if elapsed > 0:
                            factor = target_elapsed / elapsed
                        else:
                            # Run was too short for the timer; grow the iteration count
                            # instead of dividing by zero.
                            factor = 10.0
                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
                        continue
                    else:
                        # After we have the right argument choice, we keep it.
                        fixed_args = True
                        break
                return_values.append(ret)
                runtimes.append(elapsed)
        return {'iterations': self.get_iterations(args),
                'arguments': args,
                'runtimes': runtimes,
                'frequency': psutil.cpu_freq().current * 1e6,
                'returned': return_values}
class LoopBenchmark(Benchmark):
    """
    Benchmark whose instructions are placed inside a loop.

    Generates the phi nodes which initialize the source registers of *root_synth* and, if
    requested, feed its destination registers back into them on the next iteration.
    """

    def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True):
        """
        :param root_synth: object providing get_source_registers(), get_destination_registers(),
                           get_default_init_values() and build_ir()
        :param init_values: one initial value per source register; a falsy value selects
                            root_synth's defaults
        :param loop_carried_dependencies: if False, :meth:`get_phi_code` returns no code

        :raises ValueError: if the number of init values and source registers differ
        """
        super().__init__()
        self.root_synth = root_synth
        self.init_values = init_values or root_synth.get_default_init_values()
        self.loop_carried_dependencies = loop_carried_dependencies
        if len(root_synth.get_source_registers()) != len(self.init_values):
            raise ValueError("Number of init values and source registers do not match.")

    def get_source_names(self):
        """Return the IR names ``%in.<i>``, one per source register."""
        return ['%in.{}'.format(i) for i in range(len(self.root_synth.get_source_registers()))]

    def get_destination_names(self):
        """Return the IR names ``%out.<i>``, one per destination register."""
        return ['%out.{}'.format(i) for i in
                range(len(self.root_synth.get_destination_registers()))]

    def get_phi_code(self):
        """
        Return the phi instructions for the loop header as a string.

        :raises ValueError: if not a single source could be matched to a destination
        """
        if not self.loop_carried_dependencies:
            return ''
        source_names = self.get_source_names()
        destination_names = self.get_destination_names()
        # Compile loop carried dependencies
        lcd = []
        # Change in naming (src <-> dst) is on purpose!
        srcs = self.root_synth.get_destination_registers()
        dsts = self.root_synth.get_source_registers()
        # cycle iterator is used to not only reuse a single destination, but go through all of them
        srcs_it = itertools.cycle(enumerate(srcs))
        matched = False
        last_match_idx = len(srcs) - 1
        for dst_idx, dst in enumerate(dsts):
            for src_idx, src in srcs_it:
                if src.llvm_type == dst.llvm_type:
                    lcd.append([dst,
                                source_names[dst_idx],
                                self.init_values[dst_idx],
                                src,
                                destination_names[src_idx]])
                    matched = True
                    last_match_idx = src_idx
                    break
                # since srcs_it is an infinite iterator, we need to abort after a complete cycle
                if src_idx == last_match_idx:
                    break
        # NOTE(review): this check is placed after the matching loop, i.e. it only fires if
        # no register was matched at all - confirm against the original indentation.
        if not matched:
            raise ValueError("Unable to match source to any destination.")
        code = ''
        for dst_reg, dst_name, init_value, src_reg, src_name in lcd:
            assert dst_reg.llvm_type == src_reg.llvm_type, \
                "Source and destination types do not match"
            code += ('{dst_name} = phi {llvm_type} [{init_value}, %"entry"], '
                     '[{src_name}, %"loop"]\n').format(
                llvm_type=dst_reg.llvm_type,
                dst_name=dst_name,
                init_value=init_value,
                src_name=src_name)
        # Add extra phi for constant values. Assuming LLVM will optimize them "away"
        matched_dsts = [entry[0] for entry in lcd]
        for dst_idx, dst in enumerate(dsts):
            if dst not in matched_dsts:
                code += ('{dst_reg} = phi {llvm_type} [{init_value}, %"entry"], '
                         '[{init_value}, %"loop"]\n').format(
                    llvm_type=dst.llvm_type,
                    dst_reg=source_names[dst_idx],
                    init_value=self.init_values[dst_idx])
        return code

    def build_ir(self, iaca_marker=False):
        """
        Return LLVM IR code of the benchmark; must be implemented by subclasses.

        :param iaca_marker: accepted to match the call in get_llvm_module and the signature of
                            IntegerLoopBenchmark.build_ir
        """
        raise NotImplementedError()
class IntegerLoopBenchmark(LoopBenchmark):
    """Loop benchmark driven by a 64 bit integer loop counter."""

    def build_ir(self, iaca_marker=False):
        """
        Return the LLVM IR of function ``test(i64 N)``, which runs the loop body N times.

        :param iaca_marker: if True, inline assembly markers are emitted in front of the loop
                            body and in front of the return statement
        """
        start_marker = ''
        stop_marker = ''
        if iaca_marker:
            start_marker = textwrap.dedent('''\
call void asm "movl $$111,%ebx", ""()
call void asm ".byte 100,103,144", ""()''')
            stop_marker = textwrap.dedent('''\
call void asm "movl $$222,%ebx", ""()
call void asm ".byte 100,103,144", ""()''')

        template = textwrap.dedent('''\
define i64 @"test"(i64 %"N")
{{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"
loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
{phi}
{iaca_start_marker}
{loop_body}
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
%"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
{iaca_stop_marker}
ret i64 %"ret"
}}
''')
        # The loop body is generated first, then the phi nodes.
        body_code = self.root_synth.build_ir(self.get_destination_names(),
                                             self.get_source_names())
        indented_body = textwrap.indent(body_code, ' ')
        indented_phi = textwrap.indent(self.get_phi_code(), ' ')
        return template.format(
            loop_body=indented_body,
            phi=indented_phi,
            iaca_start_marker=start_marker,
            iaca_stop_marker=stop_marker)
def _build_loop_benchmark(instructions, serial_factor, parallel_factor, serialize):
    """Build an IntegerLoopBenchmark from serial chains replicated *parallel_factor* times."""
    if serialize:
        # One chain alternating through all instructions.
        chains = [op.Serialized(instructions * serial_factor)]
    else:
        # One chain per instruction.
        chains = [op.Serialized([i] * serial_factor) for i in instructions]
    return IntegerLoopBenchmark(op.Parallelized(chains * parallel_factor))


def _print_benchmark_details(benchmark, verbosity, iaca_arch):
    """Print IR (verbosity >= 3), assembly (>= 2) and IACA analysis (>= 3) of *benchmark*."""
    if verbosity >= 3:
        print('### LLVM IR')
        print(benchmark.build_ir())
    if verbosity >= 2:
        print('### Assembly')
        print(benchmark.get_assembly())
    if verbosity >= 3:
        print('### IACA Analysis')
        print(benchmark.get_iaca_analysis(iaca_arch)['output'])


def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
                       serialize=False, verbosity=0, iaca_comparison=None,
                       repeat=4, min_elapsed=0.1, max_elapsed=0.2):
    """
    Measure latency and throughput of *instructions*.

    :param instructions: list of instruction objects to benchmark
    :param serial_factor: length of each serial chain in the latency benchmark
    :param parallel_factor: number of parallel copies of the chains in the throughput benchmark
    :param throughput_serial_factor: length of each serial chain in the throughput benchmark;
                                     forced to 1 if the instructions are not serializable
    :param serialize: if True, all instructions share one chain instead of one chain each
    :param verbosity: 0 is quiet, higher values print more intermediate results
    :param iaca_comparison: micro-architecture abbreviation for an IACA comparison, or None.
                            If given, it is also used for the IACA dump at verbosity >= 3
                            (which otherwise defaults to 'SKL').
    :param repeat, min_elapsed, max_elapsed: passed on to build_and_execute

    :return: tuple (latency, throughput) in cycles; latency is NaN if it could not be measured
             because the instructions are not serializable
    """
    iaca_arch = iaca_comparison or 'SKL'
    # Defined up front, so the return statement works if the latency benchmark is skipped.
    lat = float('nan')
    not_serializable = False
    try:
        # Latency Benchmark
        if verbosity > 0:
            print('## Latency Benchmark')
        b = _build_loop_benchmark(instructions, serial_factor, 1, serialize)
        _print_benchmark_details(b, verbosity, iaca_arch)
        result = b.build_and_execute(
            repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
        # min() over the iterable itself (not unpacked), so a single runtime works as well.
        lat = min((t / serial_factor) * result['frequency'] / result['iterations']
                  for t in result['runtimes'])
        result['latency'] = lat
        if verbosity > 0:
            print('### Detailed Results')
            pprint(result)
            print()
    except op.NotSerializableError as e:
        print("Latency measurement not possible:", e)
        not_serializable = True

    if not_serializable:
        throughput_serial_factor = 1
        print("WARNING: throughput_serial_factor has been set to 1.")

    # Throughput Benchmark
    if verbosity > 0:
        print('## Throughput Benchmark')
    b = _build_loop_benchmark(instructions, throughput_serial_factor, parallel_factor, serialize)
    _print_benchmark_details(b, verbosity, iaca_arch)
    result = b.build_and_execute(
        repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
    tp = min(
        (t / throughput_serial_factor / parallel_factor) * result['frequency'] /
        result['iterations']
        for t in result['runtimes'])
    result['throughput'] = tp
    if iaca_comparison is not None:
        iaca_analysis = b.get_iaca_analysis(iaca_comparison)
        # Normalize IACA's block throughput to a single instruction instance.
        result['iaca throughput'] = iaca_analysis['throughput'] / (
            parallel_factor * throughput_serial_factor)
    if verbosity > 0:
        print('### Detailed Results')
        pprint(result)
        print()
    if verbosity > 1 and iaca_comparison is not None:
        print('### IACA Results')
        print(iaca_analysis['output'])
        print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
            throughput_serial_factor, parallel_factor))

    # Result compilation
    return lat, tp
# Ad-hoc demo, only executed when this module is run directly: builds a nested
# serialized/parallelized instruction tree, prints its IR and assembly, runs it and
# finally benchmarks a single add instruction.
if __name__ == '__main__':
    setup_llvm()
    # add and sub, each with an immediate second source operand
    i1 = op.Instruction(
        instruction='add $2, $0',
        destination_operand=op.Register('i64', 'r'),
        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])
    i2 = op.Instruction(
        instruction='sub $2, $0',
        destination_operand=op.Register('i64', 'r'),
        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])
    # NOTE(review): 's' is not used below; 's1' is built from the same instructions.
    s = op.Serialized([i1, i2])
    # add with a register as second source operand
    i3 = op.Instruction(
        instruction='add $2, $0',
        destination_operand=op.Register('i64', 'r'),
        source_operands=[op.Register('i64', '0'), op.Register('i64', 'r')])
    i4 = op.Instruction(
        instruction='sub $2, $0',
        destination_operand=op.Register('i64', 'r'),
        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '23')])
    i5 = op.Instruction(
        instruction='add $2, $0',
        destination_operand=op.Register('i64', 'r'),
        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '23')])
    i6 = op.Instruction(
        instruction='add $2, $0',
        destination_operand=op.Register('i64', 'r'),
        source_operands=[op.Register('i64', '0'), op.Register('i64', 'r')])
    # Nest serialized chains and run them in parallel to a single instruction.
    s1 = op.Serialized([i1, i2])
    s2 = op.Serialized([s1, i3])
    s3 = op.Serialized([i4, i5])
    p1 = op.Parallelized([i6, s2, s3])
    # Initialize every source register with '1'.
    init_values = ['1' for r in p1.get_source_registers()]
    b = IntegerLoopBenchmark(p1, init_values)
    print(b.build_ir())
    print(b.get_assembly())
    print(b.build_and_execute())
    # Latency and throughput of a single add with immediate operand.
    print(bench_instructions([op.Instruction(
        instruction='add $2, $0',
        destination_operand=op.Register('i64', 'r'),
        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])]))
    # Disabled leftover code, kept for reference:
    # if len(s.get_source_operand_types())
    # b = IntegerLoopBenchmark(loop_body,
    # [(type_, dst_reg, '1', src_reg)
    # # for type_, dst_reg, src_reg in zip(s.get_last_destination_type(), )])
    # print(b.get_ir())

View File

@@ -1,860 +0,0 @@
#!/usr/bin/env python3
import ctypes
import sys
import time
import textwrap
import itertools
import random
import collections
import pprint
import math
import llvmlite.binding as llvm
import psutil
# TODOs
# * API to create test scenarios
# * DSL?
# * Test cases:
# * Instructions:
# * [x] arithmetics \w reg and/or imm.
# * scalar
# * packed
# * [x] lea
# * [x] LOAD / mov \w mem
# * [TODO] STORE / mov to mem
# * [x] Single Latency
# * [x] Single Throughput
# * [TODO] Combined Throughput
# * [TODO] Random Throughput
# * [TODO] Automated TP, Lat, #pipeline analysis
# * [TODO] IACA marked binary output generation
# * [TODO] Fuzzing algorithm
# * [TODO] CLI
# * C based timing routine? As an extension?
# * make sanity checks during runtime, check for fixed frequency and pinning
def floor_harmonic_fraction(n, error=0.1):
    """
    Find closest floored integer or inverse integer and return it together with the error.

    For n >= 1 the result is floor(n)/1. For 0 < n < 1 it is 1/i, with i >= 2 being the
    smallest integer for which 1/i <= n.

    :param n: positive number to approximate
    :param error: currently unused; kept for interface compatibility

    :return: (numerator, denominator, relative error) where either numerator or denominator
             is exactly one

    :raises ValueError: if n is not positive (the search for 1/i would never terminate)
    """
    if not n > 0:
        raise ValueError("n must be a positive number, got {!r}.".format(n))
    floor_n = math.floor(n)
    if floor_n > 0:
        return floor_n, 1, 1 - floor_n / n
    # 0 < n < 1: increase the denominator until 1/i no longer exceeds n.
    i = 2
    while (1 / i) > n:
        i += 1
    return 1, i, 1 - (1 / i) / n
class Benchmark:
    """
    Base class of the benchmarks: wraps a loop body (LLVM IR) into a counted loop, compiles
    it with MCJIT and times its execution.

    The default loop body is a single add on a checksum register.
    """

    def __init__(self, parallel=1, serial=5):
        """
        :param parallel: parallel factor; used here to normalize measurements
        :param serial: serial factor; used here to normalize measurements
        """
        self._function_ctype = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
        self.parallel = parallel
        self.serial = serial
        # Do interesting work
        self._loop_body = textwrap.dedent('''\
%"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]
%"checksum.1" = call i64 asm sideeffect "
add $1, $0",
"=r,i,r" (i64 1, i64 %"checksum")\
''')

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all attributes not starting with ``_``."""
        public_attrs = ', '.join(
            '{}={!r}'.format(k, v) for k, v in self.__dict__.items() if not k.startswith('_'))
        return '{}({})'.format(self.__class__.__name__, public_attrs)

    def get_ir(self):
        """Return LLVM IR of function ``test(i64 N)``, executing the loop body N times."""
        # NOTE: a variant with a floating-point loop counter was tried here, but "may have
        # issues"; the integer counted loop is used.
        return textwrap.dedent('''\
define i64 @"test"(i64 %"N")
{{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"
loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
{loop_body}
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
%"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
ret i64 %"ret"
}}
''').format(
            loop_body=textwrap.indent(self._loop_body, ' '))

    def prepare_arguments(self, previous_args=None, time_factor=1.0):
        """
        Build argument tuple, to be passed to low level function.

        :param previous_args: argument tuple of the previous run, or None for the first run
        :param time_factor: factor by which the iteration count should be scaled

        :return: 1-tuple with the iteration count; never smaller than one iteration
        """
        if previous_args is None:
            return 100,
        # An iteration count of zero could never be scaled up again (0 * factor == 0) and
        # would make the calibration in build_and_execute loop forever.
        return max(1, int(previous_args[0] * time_factor)),

    def get_iterations(self, args):
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def get_llvm_module(self):
        """Build and return LLVM module from LLVM IR code (cached after the first call)."""
        if not hasattr(self, '_llvm_module'):
            self._llvm_module = llvm.parse_assembly(self.get_ir())
            self._llvm_module.verify()
        return self._llvm_module

    def get_target_machine(self):
        """Instantiate and return target machine (cached after the first call)."""
        # The cache check has to look at '_tm' itself; checking '_llvm_module' fails with an
        # AttributeError if get_llvm_module() was called before this method.
        if not hasattr(self, '_tm'):
            features = llvm.get_host_cpu_features().flatten()
            cpu = llvm.get_host_cpu_name()
            self._tm = llvm.Target.from_default_triple().create_target_machine(
                cpu=cpu, features=features, opt=1)
        return self._tm

    def get_assembly(self):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
        return tm.emit_assembly(self.get_llvm_module())

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        """
        JIT-compile the benchmark, calibrate the iteration count and time *repeat* runs.

        During the first measurement the arguments are rescaled until the runtime lies within
        [*min_elapsed*, *max_elapsed*] seconds; afterwards they are kept fixed.

        :return: dict with keys 'iterations', 'arguments', 'runtimes' (seconds) and
                 'frequency' (psutil's current CPU frequency times 1e6)
        """
        # Compile the module to machine code using MCJIT
        tm = self.get_target_machine()
        runtimes = []
        args = self.prepare_arguments()
        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
            ee.finalize_object()
            # Obtain a pointer to the compiled 'test' function - it's the address of its JITed
            # code in memory.
            cfptr = ee.get_function_address('test')
            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self._function_ctype(cfptr)
            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            fixed_args = False
            for i in range(repeat):
                while True:
                    start = time.perf_counter()
                    cfunc(*args)
                    end = time.perf_counter()
                    elapsed = end - start
                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
                        if elapsed > 0:
                            factor = target_elapsed / elapsed
                        else:
                            # Run was too short for the timer; grow the iteration count
                            # instead of dividing by zero.
                            factor = 10.0
                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
                        continue
                    else:
                        # After we have the right argument choice, we keep it.
                        fixed_args = True
                        break
                runtimes.append(elapsed)
        return {'iterations': self.get_iterations(args),
                'arguments': args,
                'runtimes': runtimes,
                'frequency': psutil.cpu_freq().current * 1e6}

    @classmethod
    def get_latency(cls, max_serial=6, print_table=False, **kwargs):
        """
        Measure with serial factors 1..max_serial-1 (parallel=1) and return the best result.

        :param print_table: if True, print the individual measurements and the result
        :param kwargs: passed on to the class constructor

        :return: floor_harmonic_fraction() of the smallest measured cycles per instruction
        """
        if print_table:
            print(' s |' + ''.join([' {:^5}'.format(i) for i in range(1, max_serial)]))
            print(' | ', end='')
        serial_runs = []
        for s in range(1, max_serial):
            m = cls(serial=s, parallel=1, **kwargs)
            r = m.build_and_execute(repeat=1)
            cy_per_it = min(r['runtimes']) * r['frequency'] / (
                r['iterations'] * m.parallel * m.serial)
            if print_table:
                print('{:.3f} '.format(cy_per_it), end='')
                sys.stdout.flush()
            serial_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
        # Compare by measured cycles only; comparing whole tuples would fall through to the
        # (unorderable) benchmark objects when two measurements tie.
        best = min(serial_runs, key=lambda run: run[0])[1]
        if print_table:
            print()
            print('LAT: {lat[0]}/{lat[1]}cy (min. error {lat[2]:.1%})'.format(lat=best))
        return best

    @classmethod
    def get_throughput(cls, max_serial=6, max_parallel=17, print_table=False, **kwargs):
        """
        Measure all combinations of serial factors 1..max_serial-1 and parallel factors
        2..max_parallel-1 and return the best result.

        :param print_table: if True, print the individual measurements and the result
        :param kwargs: passed on to the class constructor

        :return: floor_harmonic_fraction() of the smallest measured cycles per instruction
        """
        if print_table:
            print('s\\p |' + ''.join([' {:^5}'.format(i) for i in range(2, max_parallel)]))
        parallel_runs = []
        for s in range(1, max_serial):
            if print_table:
                print('{:>3} | '.format(s), end='')
            for p in range(2, max_parallel):
                m = cls(serial=s, parallel=p, **kwargs)
                r = m.build_and_execute(repeat=1)
                cy_per_it = min(r['runtimes']) * r['frequency'] / (
                    r['iterations'] * m.parallel * m.serial)
                if print_table:
                    print('{:.3f} '.format(cy_per_it), end='')
                    sys.stdout.flush()
                parallel_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
            if print_table:
                print()
        # See get_latency: compare by measured cycles only.
        best = min(parallel_runs, key=lambda run: run[0])[1]
        if print_table:
            print('TP: {tp[0]}/{tp[1]}cy (min. error {tp[2]:.1%});'.format(tp=best))
        return best
class InstructionBenchmark(Benchmark):
    """Benchmark of a single instruction using register and immediate operands."""

    def __init__(self, instruction='addq $1, $0',
                 dst_operands=(),
                 dstsrc_operands=(('r', 'i64', '0'),),
                 src_operands=(('i', 'i64', '1'),),
                 parallel=10,
                 serial=4):
        """
        Build LLVM IR for arithmetic instruction benchmark without memory references.

        Currently only one destination (dst) or combined destination and source (dstsrc) operand
        is allowed. The instruction's operands ($N) refer to the order of operands found in
        dst + dstsrc + src.

        Each operand is a tuple (constraint code, llvm type string, initial value).

        :raises NotImplementedError: for unsupported operand combinations
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial)
        self.instruction = instruction
        self.dst_operands = dst_operands
        self.dstsrc_operands = dstsrc_operands
        self.src_operands = src_operands
        self._loop_body = ''
        # NotImplementedError is the exception; the NotImplemented constant is not callable.
        if len(dst_operands) + len(dstsrc_operands) != 1:
            raise NotImplementedError("Must have exactly one dst or dstsrc operand.")
        if not all([op[0] in 'irx'
                    for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)]):
            raise NotImplementedError("This class only supports register and immediate operands.")

        # Part 1: PHI functions and initializations
        for i, dstsrc_op in enumerate(dstsrc_operands):
            # constraint code, llvm type string, initial value
            if dstsrc_op[0] in 'rx':
                # register operand
                for p in range(self.parallel):
                    self._loop_body += (
                        '%"dstsrc{index}_{p}" = phi {type} '
                        '[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n').format(
                        index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p)
            else:
                raise NotImplementedError(
                    "Operand type in {!r} is not yet supported.".format(dstsrc_op))

        # Part 2: Inline ASM call
        # Build constraint string from operands: outputs, then inputs, then the indices tying
        # each dstsrc input to its output operand.
        constraints = ','.join(
            ['=' + dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] +
            [sop[0] for sop in itertools.chain(src_operands)] +
            ['{}'.format(i + len(dst_operands)) for i in range(len(dstsrc_operands))])
        for i, dstsrc_op in enumerate(dstsrc_operands):
            # Build instruction from instruction and operands
            # TODO support multiple dstsrc operands
            # TODO support dst and dstsrc operands at the same time
            for p in range(self.parallel):
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands]
                for j, dop in enumerate(dstsrc_operands):
                    operands.append('{type} %dstsrc{index}_{p}'.format(type=dop[1], index=j, p=p))
                args = ', '.join(operands)
                self._loop_body += (
                    '%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n').format(
                    index=i,
                    dst_type=dstsrc_op[1],
                    # The serial chain is formed by repeating the instruction in one asm block.
                    instruction='\n'.join([instruction] * self.serial),
                    constraints=constraints,
                    args=args,
                    p=p)
        for i, dst_op in enumerate(dst_operands):
            # Build instruction from instruction and operands
            # TODO support multiple dst operands
            # TODO support dst and dstsrc operands at the same time
            if self.serial != 1:
                raise NotImplementedError("Serial > 1 and dst operand is not supported.")
            for p in range(self.parallel):
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands]
                args = ', '.join(operands)
                self._loop_body += (
                    '%"dst{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n').format(
                    index=i,
                    dst_type=dst_op[1],
                    instruction=instruction,
                    constraints=constraints,
                    args=args,
                    p=p)
class AddressGenerationBenchmark(Benchmark):
    """Benchmark of the ``lea`` instruction with different addressing modes."""

    def __init__(self,
                 offset=('i', 'i64', '0x42'),
                 base=('r', 'i64', '0'),
                 index=('r', 'i64', '0'),
                 width=('i', None, '4'),
                 destination='base',
                 parallel=10,
                 serial=4):
        """
        Benchmark for address generation modes.

        Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r' (register)
        or 'i' (immediate) and initial_value a string.
        E.g., ('r', 'i64', '0') or ('i', None, '4')

        +--------------------------------+-----------------------------+
        | Mode                           | AT&T                        |
        +--------------------------------+-----------------------------+
        | Offset                         | leal 0x0100, %eax           | <- no latency support
        | Base                           | leal (%esi), %eax           |
        | Offset + Base                  | leal -8(%ebp), %eax         |
        | Offset + Index*Width           | leal 0x100(,%ebx,4), %eax   |
        | Offset + Base + Index*Width    | leal 0x8(%edx,%ebx,4), %eax |
        +--------------------------------+-----------------------------+

        OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width
        offset: immediate integer (+/-)
        base: register
        index: register
        width: immediate 1,2,4 or 8

        :param destination: 'base' or 'index'; the register which receives the result and
                            thereby carries the dependency to the next instruction

        :raises ValueError: on invalid or unsupported operand combinations
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial)
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width
        self.destination = destination
        self.parallel = parallel

        # Sanity checks:
        if bool(index) ^ bool(width):
            raise ValueError("Index and width both need to be set, or be None.")
        elif index and width:
            if width[0] != 'i' or int(width[2]) not in [1, 2, 4, 8]:
                raise ValueError("Width may only be immediate 1,2,4 or 8.")
            if index[0] != 'r':
                raise ValueError("Index must be a register.")
        if offset and offset[0] != 'i':
            raise ValueError("Offset must be an immediate.")
        if base and base[0] != 'r':
            # This check is about the base operand (message previously named the offset).
            raise ValueError("Base must be a register.")
        if not index and not width and not offset and not base:
            raise ValueError("Must provide at least an offset or base.")
        if destination == 'base' and not base:
            raise ValueError("Destination may only be set to 'base' if base is set.")
        elif destination == 'index' and not index:
            raise ValueError("Destination may only be set to 'index' if index is set.")
        elif destination not in ['base', 'index']:
            raise ValueError("Destination must be set to 'base' or 'index'.")
        if not base and not index:
            raise ValueError("Either base or index must be set for latency test to work.")
        if serial != 1 and not (base or index):
            raise ValueError("Serial > 1 only works with index and/or base in use.")

        self._loop_body = ''
        # Assemble the operand string of the lea instruction.
        ops = ''
        if offset:
            ops += offset[2]
        if base:
            ops += '($0'
            if width and index:
                ops += ',$1,{}'.format(width[2])
            ops += ')'
            if destination == 'base':
                ops += ', $0'
            else:  # destination == 'index'
                ops += ', $1'
        else:
            if width and index:
                ops += '(,$0,{}), $0'.format(width[2])
        # NOTE(review): assumed to be appended for all operand forms - confirm against the
        # original indentation.
        ops += ' '

        if destination == 'base':
            destination_reg = base
        else:  # destination == 'index'
            destination_reg = index

        # Part 1: PHI function for destination
        for p in range(parallel):
            self._loop_body += (
                '%"{name}_{p}.0" = '
                'phi {type} [{initial}, %"entry"], [%"{name}_{p}.{s}", %"loop"]\n').format(
                name=destination, type=destination_reg[1], initial=destination_reg[2], p=p,
                s=self.serial)

        # Part 2: one inline asm call per parallel instance and serial step; step s reads
        # value <name>_<p>.<s> and defines <name>_<p>.<s+1>.
        for p in range(parallel):
            for s in range(self.serial):
                constraints = '=r,r'
                if base and index:
                    constraints += ',r'
                    if destination == 'base':
                        args = '{base_type} %"{base_name}_{p}.{s_in}", {index_type} {index_value}'.format(
                            base_type=base[1], base_name=destination,
                            index_type=index[1], index_value=index[2], p=p, s_in=s)
                    else:  # destination == 'index':
                        args = '{base_type} {base_value}, {index_type} %"{index_name}_{p}.{s_in}"'.format(
                            base_type=base[1], base_value=base[2],
                            index_type=index[1], index_name=destination, p=p, s_in=s)
                else:
                    args = '{type} %"{name}_{p}.{s_in}"'.format(
                        type=destination_reg[1], name=destination, p=p, s_in=s)
                self._loop_body += (
                    '%"{name}_{p}.{s_out}" = call {type} asm sideeffect'
                    ' "lea {ops}", "{constraints}" ({args})\n').format(
                    name=destination,
                    type=destination_reg[1],
                    ops=ops,
                    constraints=constraints,
                    args=args,
                    p=p,
                    s_out=s + 1)
class LoadBenchmark(Benchmark):
    """Load benchmark which chases pointers through a ring of pointers held in memory."""

    def __init__(self, chain_length=2048, structure='linear', parallel=6, serial=4):
        """
        Benchmark for L1 load using pointer chasing.

        *chain_length* is the number of pointers to place in memory.
        *structure* may be 'linear' (1-offsets) or 'random'.

        :raises ValueError: if chain_length is not divisible by serial or structure is unknown
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial)
        self._loop_body = ''
        element_type = ctypes.POINTER(ctypes.c_int)
        # Prototype of the compiled function: int test(int** ptrf, int repeats)
        self._function_ctype = ctypes.CFUNCTYPE(
            ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int)
        self.chain_length = chain_length
        self.parallel = parallel
        self.structure = structure
        # Memory holding the pointer ring; referenced by the instance to keep it alive.
        self._pointer_field = (element_type * chain_length)()
        if chain_length % serial != 0:
            raise ValueError(
                "chain_length ({}) needs to be divisible by serial factor ({}).".format(
                    chain_length, serial))
        # Initialize pointer field
        # Field must represent a ring of pointers
        if structure == 'linear':
            # Every element points to its successor, the last one back to the first.
            for i in range(chain_length):
                self._pointer_field[i] = ctypes.cast(
                    ctypes.pointer(self._pointer_field[(i + 1) % chain_length]), element_type)
        elif structure == 'random':
            # Same ring, but visiting the elements in shuffled order.
            shuffled_indices = list(range(chain_length))
            random.shuffle(shuffled_indices)
            for i in range(chain_length):
                self._pointer_field[shuffled_indices[i]] = ctypes.cast(
                    ctypes.pointer(self._pointer_field[shuffled_indices[(i + 1) % chain_length]]),
                    element_type)
        else:
            raise ValueError("Given structure is not supported. Supported are: "
                             "linear and random.")

    def prepare_arguments(self, previous_args=None, time_factor=1.0):
        """
        Build argument tuple, to be passed to low level function.

        Returns (pointer field, repeats); repeats starts at 100 and is scaled by *time_factor*
        on subsequent calls.
        """
        if previous_args is None:
            return self._pointer_field, 100
        else:
            return previous_args[0], int(previous_args[1] * time_factor)

    def get_iterations(self, args):
        """Return number of iterations performed, based on lower level function arguments."""
        # chain_length per repeat, times the number of repeats
        return self.chain_length * args[1]

    def get_ir(self):
        """
        Return LLVM IR equivalent of (in case of parallel == 1 and serial == 1):

        int test(int** ptrf, int repeat) {
            int** p0 = (int**)ptrf[0];
            int i = 0;
            while(i < repeat) {
                int** p = (int**)*p0;
                while(p != p0) {
                    p = (int**)*p;
                }
                i++;
            }
            return i;
        }
        """
        ret = textwrap.dedent('''
define i32 @test(i32** %"ptrf_0", i32 %"repeats") {
entry:
''')
        # Load pointer to ptrf[p] and p0
        for p in range(self.parallel):
            if p > 0:
                ret += ' %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(p=p)
            ret += (
                ' %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n'
                ' %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p)
        ret += textwrap.dedent('''
%"cmp.entry" = icmp sgt i32 %"repeats", 0
br i1 %"cmp.entry", label %"loop0", label %"end"
loop0:
br label %"loop1"
loop1:
%"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ]
br label %"loop2"
loop2:\n''')
        # One phi per parallel chain: start at p0 or continue from the last pointer loaded in
        # the previous loop2 iteration.
        for p in range(self.parallel):
            ret += (' %"p_{p}.0" = phi i32** '
                    '[ %"p0_{p}", %"loop1" ], [ %"p_{p}.{s_max}", %"loop2" ]\n').format(
                p=p, s_max=self.serial)
        # load p, compare to p0 and or-combine results
        for p in range(self.parallel):
            # 'serial' dependent loads per chain and loop2 iteration
            for s in range(self.serial):
                ret += (' %"pp_{p}.{s}" = bitcast i32** %"p_{p}.{s_prev}" to i32***\n'
                        ' %"p_{p}.{s}" = load i32**, i32*** %"pp_{p}.{s}", align 8\n').format(
                    p=p, s=s + 1, s_prev=s)
            # Compare is needed for all registers, for llvm not to remove unused
            # instructions:
            ret += ' %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.{s_max}", %"p0_{p}"\n'.format(
                p=p, s_max=self.serial)
        # TODO tree reduce cmp to make use of all cmp_* values
        # It is sufficient to use only one compare, all others will be eliminated
        ret += ' br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n'
        ret += textwrap.dedent('''
loop3:
%"i.1" = add i32 %"i", 1
%"cmp.loop3" = icmp eq i32 %"i.1", %"repeats"
br i1 %"cmp.loop3", label %"end", label %"loop1"
end:
%"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ]
ret i32 %"ret"
}''')
        return ret
if __name__ == '__main__':
    # Bring up the native LLVM backend before any benchmark gets compiled.
    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()
    llvm.initialize_native_asmparser()

    modules = collections.OrderedDict()

    # --- scalar integer add -------------------------------------------------
    accumulator = ('r', 'i64', '0')
    immediate_source = ('i', 'i64', '1')
    register_source = ('r', 'i64', '1')
    single_add = 'addq $1, $0'
    # multiple instructions
    four_adds = 'addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0'
    add_table = [
        # (label, assembly, source operand, parallel, serial)
        ('add i64 r64 LAT', single_add, immediate_source, 1, 5),
        ('add r64 r64 LAT', single_add, register_source, 1, 5),
        ('4xadd i64 r64 LAT', four_adds, immediate_source, 1, 5),
        ('add i64 r64 TP', single_add, immediate_source, 10, 5),
        ('add r64 r64 TP', single_add, register_source, 10, 5),
        ('4xadd i64 r64 TP', four_adds, immediate_source, 10, 1),
    ]
    for label, assembly, source, n_parallel, n_serial in add_table:
        modules[label] = InstructionBenchmark(
            instruction=assembly,
            dst_operands=(),
            dstsrc_operands=(accumulator,),
            src_operands=(source,),
            parallel=n_parallel,
            serial=n_serial)

    # --- address generation (lea) -------------------------------------------
    lea_table = [
        ('lea base', dict(
            offset=None, base=('r', 'i64', '666'), index=None, width=None,
            destination='base')),
        ('lea base+offset', dict(
            offset=('i', None, '23'), base=('r', 'i64', '666'), index=None, width=None,
            destination='base')),
        ('lea index*width', dict(
            offset=None, base=None, index=('r', 'i64', '1'), width=('i', None, '4'),
            destination='index')),
        ('lea offset+index*width', dict(
            offset=('i', 'i64', '-0x8'), base=None, index=('r', 'i64', '51'),
            width=('i', None, '4'), destination='index')),
        ('lea base+index*width', dict(
            offset=None, base=('r', 'i64', '23'), index=('r', 'i64', '12'),
            width=('i', None, '4'), destination='base')),
        ('lea base+offset+index*width', dict(
            offset=('i', None, '42'), base=('r', 'i64', '23'), index=('r', 'i64', '12'),
            width=('i', None, '4'), destination='base')),
    ]
    # All latency variants are registered first, then all throughput variants.
    for suffix, n_parallel, n_serial in (('LAT', 1, 5), ('TP', 10, 1)):
        for label, operands in lea_table:
            modules['{} {}'.format(label, suffix)] = AddressGenerationBenchmark(
                parallel=n_parallel, serial=n_serial, **operands)

    # --- loads ----------------------------------------------------------------
    for suffix, n_parallel in (('LAT', 1), ('TP', 4)):
        for layout in ('linear', 'random'):
            modules['LD {} {}'.format(layout, suffix)] = LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=layout,
                parallel=n_parallel,
                serial=2)

    # --- packed double arithmetic ---------------------------------------------
    vector_accumulator = (
        'x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4)))
    vector_source = (
        'x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4)))
    vector_table = [
        ('vaddpd x<4 x double> x<4 x double> x<4 x double> LAT',
         'vaddpd $1, $0, $0', 1, 5),
        ('vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) LAT',
         'vmulpd $1, $0, $0', 1, 5),
        # This is actually a TP benchmark with parallel=1, because there are no
        # inter-loop dependencies:
        ('vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) TP',
         'vmulpd $1, $2, $0', 10, 1),
    ]
    for label, assembly, n_parallel, n_serial in vector_table:
        modules[label] = InstructionBenchmark(
            instruction=assembly,
            dst_operands=(),
            dstsrc_operands=(vector_accumulator,),
            src_operands=(vector_source,),
            parallel=n_parallel,
            serial=n_serial)

    # To run a subset only, filter here, e.g.:
    # modules = collections.OrderedDict([(k, v) for k,v in modules.items() if k.startswith('lea base LAT')])

    verbose = 2 if '-v' in sys.argv else 0
    for key, module in modules.items():
        if verbose > 0:
            print("=== Benchmark")
            print(repr(module))
            print("=== LLVM")
            print(module.get_ir())
            print("=== Assembly")
            print(module.get_assembly())

        r = module.build_and_execute(repeat=3)

        if verbose > 0:
            print("=== Result")
            pprint.pprint(r)

        # Fastest run, converted to cycles and normalised per executed instruction.
        executed = r['iterations'] * module.parallel * module.serial
        cy_per_it = min(r['runtimes']) * r['frequency'] / executed
        print('{key:<32} {cy_per_it:.3f} cy/It with {runtime_sum:.4f}s'.format(
            key=key,
            cy_per_it=cy_per_it,
            runtime_sum=sum(r['runtimes'])))
# InstructionBenchmark.get_latency(
# instruction='vmulpd $1, $0, $0',
# dst_operands=(),
# dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
# src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
# ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
# print_table=True)
# InstructionBenchmark.get_throughput(
# instruction='vmulpd $1, $0, $0',
# dst_operands=(),
# dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
# src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
# ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
# print_table=True)
#
# InstructionBenchmark.get_latency(
# instruction='nop',
# dst_operands=(),
# dstsrc_operands=(('r','i8', '0'),),
# src_operands=(),
# print_table=True)
# InstructionBenchmark.get_throughput(
# instruction='nop',
# dst_operands=(),
# dstsrc_operands=(('r','i8', '0'),),
# src_operands=(),
# print_table=True)

View File

@@ -1,501 +0,0 @@
#!/usr/bin/env python3
import re
# TODO use abc to force implementation of interface requirements
# Default initial value (as LLVM IR constant text) for every supported LLVM type.
# All integer widths start from the same small constant.
init_value_by_llvm_type = dict.fromkeys(
    ['i' + bits for bits in ['1', '8', '16', '32', '64']], '3')

# LLVM requires floating point constants to have a non-repeating binary representation
# See http://llvm.org/docs/LangRef.html#simple-constants for details
init_value_by_llvm_type.update(
    dict.fromkeys(['float', 'double', 'fp128'], str(1 + 1 / 2 ** 10)))

# Vector types repeat the scalar value once per lane. The comprehension is
# evaluated completely before update() runs, so only scalar entries are visited.
init_value_by_llvm_type.update({
    '<{} x {}>'.format(lanes, scalar_type):
        '<{}>'.format(', '.join(['{} {}'.format(scalar_type, scalar_value)] * lanes))
    for scalar_type, scalar_value in init_value_by_llvm_type.items()
    for lanes in [2, 4, 8, 16, 32, 64]})
class NotSerializableError(Exception):
    """Raised when consecutive synthables cannot be chained into a serial dependency."""
class Operand:
    """Common base of all operand kinds; stores the operand's LLVM type."""

    def __init__(self, llvm_type):
        self.llvm_type = llvm_type

    def get_constraint_char(self):
        """Return the inline assembly constraint character (subclasses implement this)."""
        raise NotImplementedError()

    def __repr__(self):
        # Show every public attribute as name=value, in definition order.
        public_attributes = [
            '{}={!r}'.format(name, value)
            for name, value in self.__dict__.items()
            if not name.startswith('_')]
        return '{}({})'.format(self.__class__.__name__, ', '.join(public_attributes))

    @staticmethod
    def from_string(s):
        """
        Create an operand from its string description.

        The register, immediate and memory reference parsers are tried in that
        order and the first one that accepts *s* wins.

        Raises ValueError if none of the parsers accepts *s*.
        """
        parsers = (Register.from_string, Immediate.from_string, MemoryReference.from_string)
        for parse in parsers:
            try:
                operand = parse(s)
            except ValueError:
                # Not this operand kind, try the next parser.
                pass
            else:
                return operand
        raise ValueError("No matching operand type found for '{}'.".format(s))
class Immediate(Operand):
    """Constant operand that is encoded directly into the instruction."""

    def __init__(self, llvm_type, value):
        Operand.__init__(self, llvm_type)
        self.value = value

    def get_constraint_char(self):
        """Return 'i', the constraint character used for immediates."""
        return 'i'

    @classmethod
    def from_string(cls, s):
        """
        Create Immediate object from string.

        :param s: must have the form: "llvm_type:value", where value is a
                  hexadecimal integer (0x...), a decimal integer or a decimal
                  number with optional fraction and exponent.

        Raises ValueError if *s* has no ':' separator or the value is malformed.
        """
        llvm_type, value = s.split(':', 1)
        value_regex = r'(0x[0-9a-fA-F]+|[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?)'
        # The whole value has to match. A prefix match would let strings such
        # as "3x" or "0x" through and produce invalid IR later on.
        if not re.fullmatch(value_regex, value):
            raise ValueError("Invalid immediate value, must match {!r}".format(value_regex))
        return cls(llvm_type, value)
class MemoryReference(Operand):
    """
    offset + base + index*width

    OFFSET(BASE, INDEX, WIDTH) in AT&T assembly

    Possible operand values:
        offset: immediate integer (+/-)
        base: register
        index: register
        width: immediate 1,2,4 or 8
    """

    def __init__(self, llvm_type, offset=None, base=None, index=None, width=None):
        """
        Store the address components and validate their combination.

        Raises ValueError if index and width are not given together, if a
        component has the wrong operand kind or if no component is given at all.
        """
        super().__init__(llvm_type)
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width

        # Sanity checks:
        if bool(index) ^ bool(width):
            raise ValueError("Index and width both need to be set, or None.")
        elif index and width:
            if not (isinstance(width, Immediate) and int(width.value) in [1, 2, 4, 8]):
                raise ValueError("Width may only be immediate 1,2,4 or 8.")
            if not isinstance(index, Register):
                raise ValueError("Index must be a register.")

        if offset and not isinstance(offset, Immediate):
            raise ValueError("Offset must be an immediate.")
        if base and not isinstance(base, Register):
            # This check concerns the base, so the message has to name it.
            raise ValueError("Base must be a register.")

        if not index and not width and not offset and not base:
            raise ValueError("Must provide at least an offset or base.")

    def get_constraint_char(self):
        """Return 'm', the constraint character used for memory operands."""
        return 'm'

    def get_registers(self):
        """Yield the registers used for addressing: base first, then index."""
        if self.base:
            yield self.base

        if self.index:
            yield self.index

    @classmethod
    def from_string(cls, s):
        """
        Create MemoryReference from string.

        :param s: must fulfill the regex: "\\*llvm_type:[obiw]+", where each
                  letter enables one address component: o(ffset), b(ase),
                  i(ndex) and w(idth).

        Raises ValueError if *s* does not have this form or the selected
        components do not form a valid combination.
        """
        # The complete string has to match, otherwise unknown feature letters
        # after a valid prefix would be ignored silently.
        m = re.fullmatch(r"\*([^:]+):([obiw]+)", s)
        if not m:
            raise ValueError("Invalid format, must match 'mem:[obiw]+'.")

        llvm_type, features = m.groups()
        offset = None
        if 'o' in features:
            offset = Immediate('i32', 8)
        base = None
        if 'b' in features:
            base = Register('i64', 'r')
        index = None
        if 'i' in features:
            index = Register('i64', 'r')
        width = None
        if 'w' in features:
            width = Immediate('i32', 8)
        return cls(llvm_type, offset=offset, base=base, index=index, width=width)
class Register(Operand):
    """Register operand, described by its LLVM type and a constraint character."""

    def __init__(self, llvm_type, constraint_char='r'):
        super().__init__(llvm_type)
        self.constraint_char = constraint_char

    def get_constraint_char(self):
        """Return the constraint character this register was created with."""
        return self.constraint_char

    @classmethod
    def from_string(cls, s):
        """
        Create Register object from string.

        :param s: must have the form: "llvm_type:constraint_char"

        Raises ValueError if *s* has no ':' separator or the constraint
        character is not one of the supported ones.
        """
        llvm_type, constraint_char = s.split(':', 1)
        valid_cc = 'rx'
        # `in` on a string is a substring test, which would also accept '' and
        # 'rx'. Require exactly one character in addition.
        if len(constraint_char) != 1 or constraint_char not in valid_cc:
            raise ValueError("Invalid constraint character, must be one of {!r}".format(valid_cc))
        return cls(llvm_type, constraint_char)
class Synthable:
    """Interface of everything that can be turned into LLVM IR with named registers."""

    def __init__(self):
        pass

    def build_ir(self, dst_reg_names, src_reg_names, used_registers):
        """Return the IR code for the given register names (subclasses implement this)."""
        raise NotImplementedError()

    def get_source_registers(self):
        """Return the source registers (subclasses implement this)."""
        raise NotImplementedError()

    def get_destination_registers(self):
        """Return the destination registers (subclasses implement this)."""
        raise NotImplementedError()

    @staticmethod
    def _get_unused_reg_name(used_registers):
        """Return the first '%"reg.N"' name not in *used_registers* and record it there."""
        counter = 0
        candidate = '%"reg.{}"'.format(counter)
        while candidate in used_registers:
            counter += 1
            candidate = '%"reg.{}"'.format(counter)
        used_registers.add(candidate)
        return candidate

    def get_default_init_values(self):
        """Return the default initial value of every source register, looked up by type."""
        defaults = []
        for register in self.get_source_registers():
            defaults.append(init_value_by_llvm_type[register.llvm_type])
        return defaults

    def __repr__(self):
        # Show every public attribute as name=value, in definition order.
        public_attributes = [
            '{}={!r}'.format(name, value)
            for name, value in self.__dict__.items()
            if not name.startswith('_')]
        return '{}({})'.format(self.__class__.__name__, ', '.join(public_attributes))
class Operation(Synthable):
    """
    Base class for operations.

    Adds nothing to Synthable itself; it is the common parent of the
    Instruction, Load and AddressGeneration classes in this module.
    """
class Instruction(Operation):
    """Single assembly instruction with one destination register and source operands."""

    def __init__(self, instruction, destination_operand, source_operands):
        """
        *instruction* is the assembly template using $N operand placeholders.
        *destination_operand* must be a Register.
        *source_operands* is the list of source Operand objects.

        Raises ValueError if the destination is not a register.
        """
        super().__init__()
        # Raise instead of assert: the check survives `python -O` and argparse
        # reports a ValueError from a type= callable as a regular usage error.
        if not isinstance(destination_operand, Register):
            raise ValueError("Destination needs to be a register.")
        self.instruction = instruction
        self.destination_operand = destination_operand
        self.source_operands = source_operands

    def get_source_registers(self):
        """
        Return the registers that need a source register name.

        Register operands contribute one entry per distinct LLVM type, memory
        references contribute their addressing registers.
        """
        sop_types = set()
        sr = []
        for sop in self.source_operands:
            if isinstance(sop, Register):
                if sop.llvm_type not in sop_types:
                    sop_types.add(sop.llvm_type)
                    sr.append(sop)
            elif isinstance(sop, MemoryReference):
                sr += list(sop.get_registers())
        return sr

    def get_destination_registers(self):
        """Return a list holding the destination register (empty if it is no register)."""
        if isinstance(self.destination_operand, Register):
            return [self.destination_operand]
        else:
            return []

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """
        Build IR string based on in and out operand names and types.

        *dst_reg_names* and *src_reg_names* are lists of IR value names; only
        the first destination name is used.

        Raises NotImplementedError for unknown source operand kinds.
        """
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)

        # Build constraint string from operands
        constraints = ','.join(
            ['=' + self.destination_operand.get_constraint_char()] +
            [sop.get_constraint_char() for sop in self.source_operands])

        # Build argument string from operands and register names
        operands = []
        sop_types = {}  # llvm type -> index of the source name used for it
        i = 0
        for sop in self.source_operands:
            if isinstance(sop, Immediate):
                operands.append('{type} {repr}'.format(
                    type=sop.llvm_type,
                    repr=sop.value))
            elif isinstance(sop, Register):
                if sop.llvm_type in sop_types:
                    # Registers of an already seen type share that type's name.
                    operands.append('{type} {repr}'.format(
                        type=sop.llvm_type,
                        repr=src_reg_names[sop_types[sop.llvm_type]]))
                else:
                    sop_types[sop.llvm_type] = i
                    operands.append('{type} {repr}'.format(
                        type=sop.llvm_type,
                        repr=src_reg_names[i]))
                    i += 1
            elif isinstance(sop, MemoryReference):
                operands.append('{type} {repr}'.format(
                    type=sop.llvm_type,
                    repr=src_reg_names[i]))
                i += 1
            else:
                raise NotImplementedError(
                    "Only register, immediate and memory reference operands are supported.")
        args = ', '.join(operands)

        # Build instruction from instruction and operands
        return ('{dst_reg} = call {dst_type} asm '
                ' "{instruction}", "{constraints}" ({args})').format(
            dst_reg=dst_reg_names[0],
            dst_type=self.destination_operand.llvm_type,
            instruction=self.instruction,
            constraints=constraints,
            args=args)

    @classmethod
    def from_string(cls, s):
        """
        Create Instruction object from string.

        :param s: must have the form:
                  "asm_instruction_name ({(src|dst|srcdst):llvm_type:constraint_char})+"

        Raises ValueError if an operand cannot be parsed, if there is not
        exactly one destination or if the destination is not a register.
        """
        instruction = s
        # It is important that the match objects are in reverse order, to allow string replacements
        # based on original match group locations
        operands = list(reversed(list(re.finditer(r"\{((?:src|dst)+):([^\}]+)\}", s))))
        # Destination indices start at 0
        dst_index = 0
        # Source indices at "number of destination operands"
        src_index = ['dst' in o.group(1) for o in operands].count(True)
        dst_ops = []
        src_ops = []
        for m in operands:
            direction, operand_string = m.group(1, 2)
            operand = Operand.from_string(operand_string)
            if 'src' in direction and 'dst' not in direction:
                src_ops.append(operand)
                # replace with index string
                instruction = (instruction[:m.start()] + "${}".format(src_index)
                               + instruction[m.end():])
                src_index += 1
            if 'dst' in direction:
                dst_ops.append(operand)
                # replace with index string
                instruction = (instruction[:m.start()] + "${}".format(dst_index)
                               + instruction[m.end():])
                if 'src' in direction:
                    # A combined operand also becomes an input whose constraint
                    # is the destination index; it takes up one source index.
                    src_ops.append(Register(operand_string.split(':', 1)[0], str(dst_index)))
                    src_index += 1
                dst_index += 1

        if len(dst_ops) != 1:
            raise ValueError("Instruction supports only single destinations.")

        return cls(instruction, dst_ops[0], src_ops)
class Load(Operation):
    """Description of a pointer chasing load; only stores its parameters so far."""

    def __init__(self, chain_length, structure='linear'):
        """
        Remember the parameters of the pointer chain.

        *chain_length* is the number of pointers to place in memory.
        *structure* may be 'linear' (1-offsets) or 'random'.
        """
        super().__init__()
        self.chain_length, self.structure = chain_length, structure
        # TODO
class AddressGeneration(Operation):
    """Placeholder for an address generation operation; construction always fails."""

    def __init__(self, offset, base, index, width, destination='base'):
        """Store all address components, then raise NotImplementedError."""
        super().__init__()
        self.offset, self.base = offset, base
        self.index, self.width = index, width
        self.destination = destination
        raise NotImplementedError()
class Serialized(Synthable):
    """Chain of synthables in which each element consumes its predecessor's results."""

    def __init__(self, synths):
        super().__init__()
        self.synths = synths
        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"

    def get_source_registers(self):
        """Return the source registers of the first element (empty list for no elements)."""
        if self.synths:
            return self.synths[0].get_source_registers()
        else:
            return []

    def get_destination_registers(self):
        """Return the destination registers of the last element (empty list for no elements)."""
        if self.synths:
            return self.synths[-1].get_destination_registers()
        else:
            return []

    @staticmethod
    def match(source_registers, destination_registers):
        """
        Find all type matches from source (previous destinations) to
        destination (current source) registers.

        Return a tuple (matched_pairs, unmatched_dests): matched_pairs is a
        list of (src_idx, dst_idx) two-tuples, unmatched_dests is the set of
        destination registers for which no source of equal type exists.
        """
        matched_pairs = []
        unmatched_dests = set(destination_registers)
        for dst_idx, dst in enumerate(destination_registers):
            for src_idx, src in enumerate(source_registers):
                if src.llvm_type == dst.llvm_type:
                    matched_pairs.append((src_idx, dst_idx))
                    unmatched_dests.discard(dst)

        return matched_pairs, unmatched_dests

    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
        """
        Return ([(dst_names, src_names), ...], used_registers), one pair per element.

        The first element reads *src_reg_names*, the last one writes
        *dst_reg_names*; values in between get fresh names, which are added to
        *used_registers*.

        Raises NotSerializableError if an element cannot consume any result of
        its predecessor.
        """
        reg_naming_out = []
        dst_naming = []
        last_s = None
        for i, s in enumerate(self.synths):
            if i == 0:
                # first source is passed in from outside
                src_naming = src_reg_names
            else:
                # match with previous destinations
                src_naming = []
                match = False
                for src in s.get_source_registers():
                    # Find matching destination from previous synths
                    src_match = False
                    for dst_idx, dst in enumerate(last_s.get_destination_registers()):
                        if dst.llvm_type == src.llvm_type:
                            match = src_match = True
                            src_naming.append(dst_naming[dst_idx])
                            # Exactly one name per source register: further
                            # matches would shift the names of later sources.
                            break

                    # If source could not be matched, use constant value instead
                    if not src_match:
                        src_naming.append(init_value_by_llvm_type[src.llvm_type])

                if not match:
                    raise NotSerializableError("Unable to find match.")

            if i == len(self.synths) - 1:
                # last destination is passed in from outside
                dst_naming = dst_reg_names
            else:
                dst_naming = [self._get_unused_reg_name(used_registers)
                              for _ in s.get_destination_registers()]

            reg_naming_out.append((dst_naming, src_naming))
            last_s = s
        return reg_naming_out, used_registers

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """Return the IR of all elements, one per line, wired up as a dependency chain."""
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)
        reg_names, used_registers = self.generate_register_naming(
            dst_reg_names, src_reg_names, used_registers)
        code = []
        for s, r in zip(self.synths, reg_names):
            code.append(s.build_ir(*r, used_registers))
        return '\n'.join(code)
class Parallelized(Synthable):
    """Group of synthables that work on separate registers, side by side."""

    def __init__(self, synths):
        super().__init__()
        self.synths = synths
        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"

    def get_source_registers(self):
        """Return the source registers of all elements, concatenated in element order."""
        return [register
                for synth in self.synths
                for register in synth.get_source_registers()]

    def get_destination_registers(self):
        """Return the destination registers of all elements, concatenated in element order."""
        return [register
                for synth in self.synths
                for register in synth.get_destination_registers()]

    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
        """
        Return ([(dst_names, src_names), ...], used_registers), one pair per element.

        Each element receives the next consecutive slice of both name lists,
        sized by its number of destination and source registers.
        """
        naming = []
        dst_start = src_start = 0
        for synth in self.synths:
            dst_stop = dst_start + len(synth.get_destination_registers())
            src_stop = src_start + len(synth.get_source_registers())
            naming.append((dst_reg_names[dst_start:dst_stop],
                           src_reg_names[src_start:src_stop]))
            dst_start, src_start = dst_stop, src_stop
        return naming, used_registers

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """Return the IR of all elements, one per line."""
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)
        naming, used_registers = self.generate_register_naming(
            dst_reg_names, src_reg_names, used_registers)
        return '\n'.join(
            synth.build_ir(dst_names, src_names, used_registers)
            for synth, (dst_names, src_names) in zip(self.synths, naming))
if __name__ == '__main__':
    # Small demonstration: build a few instructions, combine them and print the IR.
    def _i64_instruction(assembly, *further_sources):
        """Instruction writing an i64 register, reading an i64 register plus extras."""
        return Instruction(
            instruction=assembly,
            destination_operand=Register('i64', 'r'),
            source_operands=[Register('i64', 'r')] + list(further_sources))

    i1 = _i64_instruction('add $2, $0', Immediate('i64', '1'))
    i2 = _i64_instruction('sub $2, $0', Immediate('i64', '1'))
    i3 = _i64_instruction('mul $1, $0', Register('i64', 'r'))
    i4 = _i64_instruction('div $2, $0', Immediate('i64', '23'))
    i5 = _i64_instruction('mul $2, $0', Immediate('i64', '23'))
    i6 = _i64_instruction('inc $0')

    # Two nested serial chains.
    s1 = Serialized([i1, i2])
    s2 = Serialized([s1, i3])
    for chain in (s1, s2):
        print(chain.build_ir(['%out'], ['%in']), '\n')

    # Three independent streams next to each other.
    s3 = Serialized([i4, i5])
    p1 = Parallelized([i6, s2, s3])
    print(p1.build_ir(['%out.0', '%out.1', '%out.2'], ['%in.0', '%in.1', '%in.2']), '\n')

    # One long chain across all instructions.
    s4 = Serialized([i1, i2, i3, i4, i5, i6])
    print(s4.build_ir(['%out'], ['%in']), '\n')

    # Parsing an instruction from its string description.
    print(Instruction.from_string("add {src:i64:r} {srcdst:i64:r}"))

View File

@@ -1,243 +0,0 @@
#!/usr/bin/env python3
import collections
import itertools
import socket
import numpy
import matplotlib.pyplot as plt
import matplotlib as mpl
from asmjit import op, bench
from asmjit import oldjit
def jit_based_benchs():
    """Run the lea and load benchmarks of the old JIT and print latency and throughput."""
    # Operand sets of the address generation benchmarks, in reporting order.
    lea_variants = [
        ('lea_b', dict(
            offset=None, base=('r', 'i64', '666'), index=None, width=None,
            destination='base')),
        ('lea_b+off', dict(
            offset=('i', None, '23'), base=('r', 'i64', '666'), index=None, width=None,
            destination='base')),
        ('lea_idx*w', dict(
            offset=None, base=None, index=('r', 'i64', '1'), width=('i', None, '4'),
            destination='index')),
        ('lea_off+idx*w', dict(
            offset=('i', 'i64', '-0x8'), base=None, index=('r', 'i64', '51'),
            width=('i', None, '4'), destination='index')),
        ('lea_b+idx*w', dict(
            offset=None, base=('r', 'i64', '23'), index=('r', 'i64', '12'),
            width=('i', None, '4'), destination='base')),
        ('lea_b+off+idx*w', dict(
            offset=('i', None, '42'), base=('r', 'i64', '23'), index=('r', 'i64', '12'),
            width=('i', None, '4'), destination='base')),
    ]

    # Every entry is a (latency benchmark, throughput benchmark) pair.
    modules = collections.OrderedDict()
    for name, operands in lea_variants:
        modules[name] = (
            oldjit.AddressGenerationBenchmark(parallel=1, serial=5, **operands),
            oldjit.AddressGenerationBenchmark(parallel=10, serial=1, **operands))
    for layout in ('linear', 'random'):
        modules['LD_' + layout] = (
            oldjit.LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=layout,
                parallel=1,
                serial=2),
            oldjit.LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=layout,
                parallel=4,
                serial=2))

    def measure_cycles(module):
        """Build and run *module*; return cycles per instruction of the fastest run."""
        result = module.build_and_execute(repeat=3)
        return min(result['runtimes']) * result['frequency'] / (
            result['iterations'] * module.parallel * module.serial)

    for name, (lat_module, tp_module) in modules.items():
        cy_per_it_lat = measure_cycles(lat_module)
        cy_per_it_tp = measure_cycles(tp_module)
        print('{key:<16} LAT {cy_per_it_lat:.3f} cy TP {cy_per_it_tp:.3f} cy'.format(
            key=name,
            cy_per_it_lat=cy_per_it_lat,
            cy_per_it_tp=cy_per_it_tp))
def plot_combined(single_measured, combined_measured):
    """
    Show a heat map of the third measurement value of every instruction pair.

    *single_measured* maps instruction names to their measurements; its key
    order defines both axes.
    *combined_measured* maps pairs of (name, ...) tuples to value tuples; the
    element at index 2 of each value is plotted. Pairs without an entry stay
    blank.
    """
    instructions = list(single_measured.keys())
    size = len(instructions)

    # Start with NaN everywhere, so unmeasured pairs get the "bad" colour.
    grid = numpy.full((size, size), float('nan'))
    for pair, measurement in combined_measured.items():
        row, column = (instructions.index(entry[0]) for entry in pair)
        grid[row, column] = measurement[2]

    colormap = mpl.cm.get_cmap('plasma', 5)
    colormap.set_bad('w')  # default value is 'k'

    figure = plt.figure(figsize=(10, 10))
    axes = figure.add_subplot(111)
    image = axes.imshow(
        grid,
        interpolation="nearest",
        cmap=colormap,
        norm=mpl.colors.Normalize(vmin=-.5, vmax=1.5))

    ticks = range(size)
    axes.set_xticks(ticks)
    axes.set_xticklabels(instructions, rotation=90)
    axes.set_yticks(ticks)
    axes.set_yticklabels(instructions)
    axes.set_title(socket.gethostname())
    axes.grid()

    colorbar = figure.colorbar(image, shrink=0.65)
    colorbar.set_ticks([-.5, 0, 1, 1.5])
    colorbar.set_ticklabels(['< -0.5', '0.0 (complete overlap)', '1.0 (no overlap)', '> 1.5'])
    colorbar.set_label('inverse parallel overlap')

    figure.tight_layout()
    plt.show()
if __name__ == '__main__':
    # Script entry point: measure every instruction alone, run the old JIT
    # benchmarks, measure every pair of instructions and plot the pair results.
    bench.setup_llvm()
    # Each entry becomes (name, instruction string, parsed op.Instruction).
    instructions = [
        (i[0], i[1], op.Instruction.from_string(i[1]))
        for i in [
            ('ADD32ri', 'add {src:i32:1}, {srcdst:i32:r}'),
            ('ADD64ri32', 'add {src:i32:1}, {srcdst:i64:r}'),
            ('INC64r', 'inc {srcdst:i64:r}'),
            # NOTE(review): named SUB32 but uses an i64 register -- confirm this is intended.
            ('SUB32ri', 'sub {src:i32:1}, {srcdst:i64:r}'),
            ('MOV64ri32', 'mov {src:i32:1}, {srcdst:i64:r}'),
            ('VINSERTF128rr', 'vinsertf128 {src:i8:0}, {src:<2 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
            ('VCVTSI642SSrr', 'vcvtsi2ss {src:i64:r}, {src:float:x}, {dst:float:x}'),
            ('VADDPDYrr', 'vaddpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
            ('VADDSDrr', 'vaddsd {src:double:x}, {src:double:x}, {dst:double:x}'),
            ('VADDSSrr', 'vaddss {src:float:x}, {src:float:x}, {dst:float:x}'),
            ('VFMADD213PDYr', 'vfmadd213pd {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
            ('VFMADD213PDr', 'vfmadd213pd {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
            # NOTE(review): the two "ps" entries below declare double vectors, like
            # their "pd" counterparts -- confirm whether float vectors were meant.
            ('VFMADD213PSYr', 'vfmadd213ps {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
            ('VFMADD213PSr', 'vfmadd213ps {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
            ('VFMADD213SDr', 'vfmadd213sd {src:double:x}, {src:double:x}, {srcdst:double:x}'),
            ('VFMADD213SSr', 'vfmadd213ss {src:float:x}, {src:float:x}, {srcdst:float:x}'),
            ('VMULPDYrr', 'vmulpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
            ('VMULSDrr', 'vmulsd {src:double:x}, {src:double:x}, {dst:double:x}'),
            ('VMULSSrr', 'vmulss {src:float:x}, {src:float:x}, {dst:float:x}'),
            ('VSUBSDrr', 'vsubsd {src:double:x}, {src:double:x}, {dst:double:x}'),
            ('VSUBSSrr', 'vsubss {src:float:x}, {src:float:x}, {dst:float:x}'),
            ('VDIVPDYrr', 'vdivpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
            ('VDIVSDrr', 'vdivsd {src:double:x}, {src:double:x}, {dst:double:x}'),
            ('VDIVSSrr', 'vdivss {src:float:x}, {src:float:x}, {dst:float:x}'),
        ]
    ]

    # Pass 1: every instruction on its own; stores name -> (lat, tp).
    instructions_measured = collections.OrderedDict()
    for llvm_name, i_str, i in instructions:
        lat, tp = bench.bench_instructions(
            [i],
            serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
            verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)
        print('{:<16} LAT {:.3f} cy TP {:.3f} cy'.format(llvm_name, lat, tp))
        instructions_measured[llvm_name] = (lat, tp)

    jit_based_benchs()

    # Pass 2: every unordered pair of instructions, including an instruction with itself.
    two_combinations_measured = collections.OrderedDict()
    for a, b in itertools.combinations_with_replacement(instructions, 2):
        lat, tp = bench.bench_instructions(
            [a[2], b[2]],
            serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
            verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)
        # (combined TP - larger single TP) / smaller single TP. The colour bar in
        # plot_combined labels 0.0 as "complete overlap" and 1.0 as "no overlap".
        same_port_metric = ((
            tp-max(instructions_measured[a[0]][1], instructions_measured[b[0]][1])) /
            min(instructions_measured[a[0]][1], instructions_measured[b[0]][1]))
        print('{:<16} {:<16} LAT {:.3f} cy TP {:.3f} cy SPM {:>5.2f}'.format(
            a[0], b[0], lat, tp, same_port_metric))
        # Keyed by ((name, instruction string), (name, instruction string)).
        two_combinations_measured[(a[0], a[1]), (b[0], b[1])] = (lat, tp, same_port_metric)

    plot_combined(instructions_measured, two_combinations_measured)

View File

@@ -8,7 +8,7 @@
In order to construct an accurate instruction execution model for modern out-of-order micro architectures, an accurate description of instruction latency and throughput, as well as resource conflicts, is indispensable. Existing resources and vendor-provided information are neither complete nor detailed enough, and are sometimes faulty. We therefore propose to deduce this information through runtime instruction benchmarking of single and composite instructions, and present a framework to support such investigations based on LLVM's just-in-time and cross-platform compilation capabilities.
`pyasmjit` abstracts instructions, registers, immediates, memory operands and dependency chains, to easily construct benchmarks. The synthesized code is interactively compiled and executed using the `llvmlite` library, which in turn is based on the stable LLVM C-API. `pyasmjit` offers a command line as well as a programming interface.
`asmbench` abstracts instructions, registers, immediates, memory operands and dependency chains, to easily construct benchmarks. The synthesized code is interactively compiled and executed using the `llvmlite` library, which in turn is based on the stable LLVM C-API. `asmbench` offers a command line as well as a programming interface.
Unlike other approaches, we do not rely on model-specific performance counters and focus on interoperability and automation to support quick modeling of many microarchitectures.
@@ -23,7 +23,7 @@ Unlike other approaches, we do not rely on model specific performance counters a
### A.2.2 How software can be obtained (if available)
Check out https://github.com/RRZE-HPC/pyasmjit
Check out https://github.com/RRZE-HPC/asmbench
### A.2.3 Hardware dependencies
@@ -47,15 +47,15 @@ None required, all included.
## A.3 Installation
To install `asmjit` in the correct version and all its dependencies into the users home directory, execute: `pip3 install --user asmjit[sc18src]==0.1.2`.
To install `asmbench` in the correct version and all its dependencies into the user's home directory, execute: `pip3 install --user asmbench[sc18src]==0.1.2`.
Alternatively clone https://github.com/RRZE-HPC/pyasmjit with commit hash 515b28cb4e44426239e6161dc3a79d888a9e0e21 and install using included `setup.py`.
Alternatively, clone https://github.com/RRZE-HPC/asmbench at commit 515b28cb4e44426239e6161dc3a79d888a9e0e21 and install it using the included `setup.py`.
## A.4 Experiment workflow
1. Fix frequency, e.g., using likwid: `likwid-setFrequencies -f <FREQ>`. `<FREQ>` should be the base clock for the specific model used.
2. Disable turbo mode, e.g., using likwid: `likwid-setFrequencies -t 0`.
3. Run `asmjit.sc18src` module: `python3 -m asmjit.sc18src`.
3. Run `asmbench.sc18src` module: `python3 -m asmbench.sc18src`.
## A.5 Evaluation and expected result

View File

@@ -2,7 +2,7 @@
import collections
import itertools
from asmjit import op, bench, oldjit
from asmbench import op, bench, oldjit
def jit_based_benchs():

View File

@@ -5,13 +5,13 @@ with open('README.md') as f:
long_description = f.read()
setup(
name='asmjit',
version='0.1.2',
name='asmbench',
version='0.1.0',
packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
url='',
license='AGPLv3',
author='Julian Hammer',
author_email='julian.hammer@u-sys.org',
author_email='julian.hammer@fau.de',
description='A Benchmark Toolkit for Assembly Instructions Using the LLVM JIT',
long_description_content_type='text/markdown',
long_description=long_description,

View File

@@ -10,7 +10,7 @@ import argparse
import random
from pprint import pprint
from asmjit import op, bench
from asmbench import op, bench
def split_list(raw):