renamed to asmbench

2025-07-21 04:31:05 +02:00 · 2018-09-25 10:23:40 +02:00
parent 3033246d4e
commit dbbd37585a
12 changed files with 29 additions and 2056 deletions
--- a/.idea/other.xml
+++ b/.idea/other.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="PySciProjectComponent">
+    <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
+  </component>
+</project>
--- a/README.md
+++ b/README.md
@@ -1,6 +1,17 @@
-# pyasmjit
+# asmbench

-A instruction latency and throughput benchmarking framework for out-of-order architectures.
+A benchmark toolkit for assembly instructions using the LLVM JIT.

 ## Usage

+To benchmark the latency and throughput of a 64-bit integer add, use the following command:
+```
+python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}'
+```
+
+To benchmark two interleaved instructions, use this:
+```
+python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}' 'sub {src:i64:r}, {srcdst:i64:r}'
+```
+
+To find out more, add `-h` for help and `-v` for verbose mode.
--- a/asmjit/init.py
+++ b/asmjit/init.py
--- a/asmjit/main.py
+++ b/asmjit/main.py
@@ -1,42 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-
-import llvmlite.binding as llvm
-
-from . import op, bench
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Assembly Instruction Benchmark Toolkit')
-    # parser.add_argument('mode', metavar='MODE', type=str, choices=['latency', 'throughput'])
-    parser.add_argument('instructions', metavar='INSTR', type=op.Instruction.from_string, nargs='+',
-                        help='instruction declaration, e.g., "add {src:i32:r} {srcdst:i32:r}"')
-    parser.add_argument('--serialize', action='store_true',
-                        help='Serialize instructions.')
-    parser.add_argument('--latency-serial', '-l', type=int, default=8,
-                         help='length of serial chain for each instruction in latency benchmark')
-    parser.add_argument('--parallel', '-p',type=int, default=10,
-                        help='number of parallel instances of serial chains in throughput '
-                             'benchmark')
-    parser.add_argument('--throughput-serial', '-t', type=int, default=8,
-                        help='length of serial instances of serial chains in throughput benchmark')
-    parser.add_argument('--iaca', type=str, default=None,
-                        help='Compare throughput measurement with IACA analysis, pass '
-                             'micro-architecuture abbreviation. (i.e. SNB, IVB, HSW, SKL, SKX)')
-    parser.add_argument("--verbose", "-v", action="count", default=0,
-                        help="increase output verbosity")
-    args = parser.parse_args()
-
-    bench.setup_llvm()
-    lat, tp = bench.bench_instructions(args.instructions,
-                                       serial_factor=args.latency_serial,
-                                       parallel_factor=args.parallel,
-                                       throughput_serial_factor=args.throughput_serial,
-                                       serialize=args.serialize,
-                                       verbosity=args.verbose,
-                                       iaca_comparison=args.iaca)
-    print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))
-
-
-if __name__ == "__main__":
-    main()
--- a/asmjit/bench.py
+++ b/asmjit/bench.py
@@ -1,398 +0,0 @@
-#!/usr/bin/env python3
-import ctypes
-import time
-import textwrap
-import itertools
-import re
-from pprint import pprint
-import tempfile
-import subprocess
-import sys
-
-import llvmlite.binding as llvm
-import psutil
-try:
-    from kerncraft import iaca
-except ImportError:
-    iaca = None
-
-from . import op
-
-
-def setup_llvm():
-    llvm.initialize()
-    llvm.initialize_native_target()
-    llvm.initialize_native_asmprinter()
-    llvm.initialize_native_asmparser()
-
-
-def uniquify(l):
-    # Uniquify list while preserving order
-    seen = set()
-    return [x for x in l if x not in seen and not seen.add(x)]
-
-
-class Benchmark:
-    def __repr__(self):
-        return '{}({})'.format(
-            self.__class__.__name__,
-            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
-                       if not k.startswith('_')]))
-
-    @staticmethod
-    def prepare_arguments(previous_args=None, time_factor=1.0):
-        """Build argument tuple, to be passed to low level function."""
-        if previous_args is None:
-            return 10000000,
-        else:
-            try:
-                return int(previous_args[0] * time_factor),
-            except OverflowError:
-                return previous_args[0]*10,
-
-    @staticmethod
-    def get_iterations(args) -> int:
-        """Return number of iterations performed, based on lower level function arguments."""
-        return args[0]
-
-    def build_ir(self):
-        raise NotImplementedError()
-
-    def get_llvm_module(self, iaca_marker=False):
-        """Build and return LLVM module from LLVM IR code."""
-        ir = self.build_ir(iaca_marker=iaca_marker)
-        return llvm.parse_assembly(ir)
-
-    def get_target_machine(self):
-        """Instantiate and return target machine."""
-        features = llvm.get_host_cpu_features().flatten()
-        cpu = '' # llvm.get_host_cpu_name()  # Work around until ryzen problems are fixed
-        return llvm.Target.from_default_triple().create_target_machine(
-             cpu=cpu, features=features, opt=3)
-
-    def get_assembly(self, iaca_marker=False):
-        """Compile and return assembly from LLVM module."""
-        tm = self.get_target_machine()
-        tm.set_asm_verbosity(0)
-        asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
-        # Remove double comments
-        asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
-        return asm
-
-    def get_function_ctype(self):
-        return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
-
-    def get_iaca_analysis(self, arch):
-        """Compile and return IACA analysis."""
-        if iaca is None:
-            raise ValueError("kerncraft not installed. IACA analysis is not supported.")
-        tm = self.get_target_machine()
-        tmpf = tempfile.NamedTemporaryFile("wb")
-        tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
-        tmpf.flush()
-        return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)
-
-    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
-        # Compile the module to machine code using MCJIT
-        tm = self.get_target_machine()
-        runtimes = []
-        return_values = []
-        args = self.prepare_arguments()
-        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
-            ee.finalize_object()
-
-            # Obtain a pointer to the compiled 'sum' - it's the address of its JITed
-            # code in memory.
-            cfptr = ee.get_function_address('test')
-
-            # To convert an address to an actual callable thing we have to use
-            # CFUNCTYPE, and specify the arguments & return type.
-            cfunc = self.get_function_ctype()(cfptr)
-
-            # Now 'cfunc' is an actual callable we can invoke
-            # TODO replace time.clock with a C implemententation for less overhead
-            # TODO return result in machine readable format
-            fixed_args = False
-            for i in range(repeat):
-                tries = 0
-                while True:
-                    if tries > 10:
-                        raise RuntimeError("Unable to measure non-zero runtime.")
-                    tries += 1
-                    start = time.perf_counter()
-                    ret = cfunc(*args)
-                    end = time.perf_counter()
-                    elapsed = end - start
-                    if ret != args[0]-1:
-                        raise RuntimeError(
-                            "Return value {} is invalid, should have been {}.".format(ret, args[0]-1))
-                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
-                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
-                        factor = target_elapsed / elapsed
-                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
-                        continue
-                    else:
-                        # After we have the right argument choice, we keep it.
-                        fixed_args = True
-                        break
-                return_values.append(ret)
-                runtimes.append(elapsed)
-
-        return {'iterations': self.get_iterations(args),
-                'arguments': args,
-                'runtimes': runtimes,
-                'frequency': psutil.cpu_freq().current * 1e6,
-                'returned': return_values}
-
-
-class LoopBenchmark(Benchmark):
-    def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True):
-        super().__init__()
-        self.root_synth = root_synth
-        self.init_values = init_values or root_synth.get_default_init_values()
-        self.loop_carried_dependencies = loop_carried_dependencies
-
-        if len(root_synth.get_source_registers()) != len(self.init_values):
-            raise ValueError("Number of init values and source registers do not match.")
-
-    def get_source_names(self):
-        return ['%in.{}'.format(i) for i in range(len(self.root_synth.get_source_registers()))]
-
-    def get_destination_names(self):
-        return ['%out.{}'.format(i) for i in
-                range(len(self.root_synth.get_destination_registers()))]
-
-    def get_phi_code(self):
-        if not self.loop_carried_dependencies:
-            return ''
-        # Compile loop carried dependencies
-        lcd = []
-        # Change in naming (src <-> dst) is on purpose!
-        srcs = self.root_synth.get_destination_registers()
-        dsts = self.root_synth.get_source_registers()
-        # cycle iterator is used to not only reuse a single destination, but go through all of them
-        srcs_it = itertools.cycle(enumerate(srcs))
-        matched = False
-        last_match_idx = len(srcs) - 1
-        for dst_idx, dst in enumerate(dsts):
-            for src_idx, src in srcs_it:
-                if src.llvm_type == dst.llvm_type:
-                    lcd.append([dst,
-                                self.get_source_names()[dst_idx],
-                                self.init_values[dst_idx],
-                                src,
-                                self.get_destination_names()[src_idx]])
-                    matched = True
-                    last_match_idx = src_idx
-                    break
-                # since srcs_it is an infinity iterator, we need to abort after a complete cycle
-                if src_idx == last_match_idx:
-                    break
-        if not matched:
-            raise ValueError("Unable to match source to any destination.")
-
-        code = ''
-        for dst_reg, dst_name, init_value, src_reg, src_name in lcd:
-            assert dst_reg.llvm_type == src_reg.llvm_type, \
-                "Source and destination types do not match"
-            code += ('{dst_name} = phi {llvm_type} [{init_value}, %"entry"], '
-                     '[{src_name}, %"loop"]\n').format(
-                llvm_type=dst_reg.llvm_type,
-                dst_name=dst_name,
-                init_value=init_value,
-                src_name=src_name)
-
-        # Add extra phi for constant values. Assuming LLVM will optimize them "away"
-        for dst_idx, dst in enumerate(dsts):
-            if dst not in [d for d, dn, i, s, sn in lcd]:
-                code += ('{dst_reg} = phi {llvm_type} [{init_value}, %"entry"], '
-                         '[{init_value}, %"loop"]\n').format(
-                    llvm_type=dst.llvm_type,
-                    dst_reg=self.get_source_names()[dst_idx],
-                    init_value=self.init_values[dst_idx])
-
-        return code
-
-    def build_ir(self):
-        raise NotImplementedError()
-
-
-class IntegerLoopBenchmark(LoopBenchmark):
-    def build_ir(self, iaca_marker=False):
-        if iaca_marker:
-            iaca_start_marker = textwrap.dedent('''\
-                call void asm "movl    $$111,%ebx", ""()
-                call void asm ".byte   100,103,144", ""()''')
-            iaca_stop_marker = textwrap.dedent('''\
-                call void asm "movl    $$222,%ebx", ""()
-                call void asm ".byte   100,103,144", ""()''')
-        else:
-            iaca_start_marker = ''
-            iaca_stop_marker = ''
-
-        ir = textwrap.dedent('''\
-            define i64 @"test"(i64 %"N")
-            {{
-            entry:
-              %"loop_cond" = icmp slt i64 0, %"N"
-              br i1 %"loop_cond", label %"loop", label %"end"
-
-            loop:
-              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-            {phi}
-            {iaca_start_marker}
-            {loop_body}
-              %"loop_counter.1" = add i64 %"loop_counter", 1
-              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-              br i1 %"loop_cond.1", label %"loop", label %"end"
-            
-            end:
-              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
-            {iaca_stop_marker}
-              ret i64 %"ret"
-            }}
-            ''').format(
-            loop_body=textwrap.indent(
-                self.root_synth.build_ir(self.get_destination_names(),
-                                         self.get_source_names()), '  '),
-            phi=textwrap.indent(self.get_phi_code(), '  '),
-            iaca_start_marker=iaca_start_marker,
-            iaca_stop_marker=iaca_stop_marker)
-
-        return ir
-
-
-def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
-                       serialize=False, verbosity=0, iaca_comparison=None,
-                       repeat=4, min_elapsed=0.1, max_elapsed=0.2):
-    not_serializable = False
-    try:
-        # Latency Benchmark
-        if verbosity > 0:
-            print('## Latency Benchmark')
-        p_instrs = []
-        if not serialize:
-            for i in instructions:
-                p_instrs.append(op.Serialized([i] * serial_factor))
-        else:
-            p_instrs = [op.Serialized(instructions * serial_factor)]
-        p = op.Parallelized(p_instrs)
-        b = IntegerLoopBenchmark(p)
-        if verbosity >= 3:
-            print('### LLVM IR')
-            print(b.build_ir())
-        if verbosity >= 2:
-            print('### Assembly')
-            print(b.get_assembly())
-        if verbosity >= 3:
-            print('### IACA Analysis')
-            print(b.get_iaca_analysis('SKL')['output'])
-        result = b.build_and_execute(
-            repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
-        lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations']
-                    for t in result['runtimes']])
-        result['latency'] = lat
-        if verbosity > 0:
-            print('### Detailed Results')
-            pprint(result)
-            print()
-    except op.NotSerializableError as e:
-        print("Latency measurement not possible:", e)
-        not_serializable = True
-
-    if not_serializable:
-        throughput_serial_factor = 1
-        print("WARNING: throughput_serial_factor has be set to 1.")
-
-    # Throughput Benchmark
-    if verbosity > 0:
-        print('## Throughput Benchmark')
-    p_instrs = []
-    if not serialize:
-        for i in instructions:
-            p_instrs.append(op.Serialized([i] * throughput_serial_factor))
-    else:
-        p_instrs = [op.Serialized(instructions * throughput_serial_factor)]
-    p = op.Parallelized(p_instrs * parallel_factor)
-    b = IntegerLoopBenchmark(p)
-    if verbosity >= 3:
-        print('### LLVM IR')
-        print(b.build_ir())
-    if verbosity >= 2:
-        print('### Assembly')
-        print(b.get_assembly())
-    if verbosity >= 3:
-        print('### IACA Analysis')
-        print(b.get_iaca_analysis('SKL')['output'])
-    result = b.build_and_execute(
-        repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
-    tp = min(
-        [(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
-         for t in result['runtimes']])
-    result['throughput'] = tp
-    if iaca_comparison is not None:
-        iaca_analysis = b.get_iaca_analysis(iaca_comparison)
-        result['iaca throughput'] = iaca_analysis['throughput']/(
-                parallel_factor * throughput_serial_factor)
-    if verbosity > 0:
-        print('### Detailed Results')
-        pprint(result)
-        print()
-    if verbosity > 1 and iaca_comparison is not None:
-        print('### IACA Results')
-        print(iaca_analysis['output'])
-        print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
-            throughput_serial_factor, parallel_factor))
-
-    # Result compilation
-    return lat, tp
-
-
-if __name__ == '__main__':
-    setup_llvm()
-
-    i1 = op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])
-    i2 = op.Instruction(
-        instruction='sub $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])
-    s = op.Serialized([i1, i2])
-    i3 = op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Register('i64', 'r')])
-    i4 = op.Instruction(
-        instruction='sub $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '23')])
-    i5 = op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '23')])
-    i6 = op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Register('i64', 'r')])
-    s1 = op.Serialized([i1, i2])
-    s2 = op.Serialized([s1, i3])
-    s3 = op.Serialized([i4, i5])
-    p1 = op.Parallelized([i6, s2, s3])
-    init_values = ['1' for r in p1.get_source_registers()]
-    b = IntegerLoopBenchmark(p1, init_values)
-    print(b.build_ir())
-    print(b.get_assembly())
-    print(b.build_and_execute())
-
-    print(bench_instructions([op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])]))
-
-    # if len(s.get_source_operand_types())
-    # b = IntegerLoopBenchmark(loop_body,
-    #                          [(type_, dst_reg, '1', src_reg)
-    #                           # for type_, dst_reg, src_reg in zip(s.get_last_destination_type(), )])
-    # print(b.get_ir())
--- a/asmjit/oldjit.py
+++ b/asmjit/oldjit.py
@@ -1,860 +0,0 @@
-#!/usr/bin/env python3
-import ctypes
-import sys
-import time
-import textwrap
-import itertools
-import random
-import collections
-import pprint
-import math
-
-import llvmlite.binding as llvm
-import psutil
-
-
-# TODOs
-# * API to create test scenarios
-#   * DSL?
-# * Test cases:
-#   * Instructions:
-#     * [x] arithmetics \w reg and/or imm.
-#       * scalar
-#       * packed
-#     * [x] lea
-#     * [x] LOAD / mov \w mem
-#     * [TODO] STORE / mov to mem
-#   * [x] Single Latency
-#   * [x] Single Throughput
-#   * [TODO] Combined Throughput
-#   * [TODO] Random Throughput
-# * [TODO] Automated TP, Lat, #pipeline analysis
-# * [TODO] IACA marked binary output generation
-# * [TODO] Fuzzing algorithm
-# * [TODO] CLI
-# * C based timing routine? As an extension?
-# * make sanity checks during runtime, check for fixed frequency and pinning
-
-def floor_harmonic_fraction(n, error=0.1):
-    """
-    Finds closest floored integer or inverse integer and returns error.
-
-    (numerator, denominator, relative error) where either numerator or denominator is exactly one.
-    """
-    floor_n = math.floor(n)
-    if floor_n > 0:
-        return floor_n, 1, 1 - floor_n / n
-    else:
-        i = 2
-        while (1 / i) > n:
-            i += 1
-
-        return 1, i, 1 - (1 / i) / n
-
-
-class Benchmark:
-    def __init__(self, parallel=1, serial=5):
-        self._function_ctype = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
-        self.parallel = parallel
-        self.serial = serial
-
-        # Do interesting work
-        self._loop_body = textwrap.dedent('''\
-            %"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]
-            %"checksum.1" = call i64 asm sideeffect "
-                add $1, $0",
-                "=r,i,r" (i64 1, i64 %"checksum")\
-            ''')
-
-    def __repr__(self):
-        return '{}({})'.format(
-            self.__class__.__name__,
-            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
-                       if not k.startswith('_')]))
-
-    def get_ir(self):
-        # FP add loop - may have issues
-        # return textwrap.dedent('''\
-        #    define i64 @"test"(i64 %"N")
-        #    {{
-        #    entry:
-        #      %"N.fp" = sitofp i64 %"N" to double
-        #      %"loop_cond" = fcmp olt double 0.0, %"N.fp"
-        #      br i1 %"loop_cond", label %"loop", label %"end"
-        #
-        #    loop:
-        #      %"loop_counter" = phi double [0.0, %"entry"], [%"loop_counter.1", %"loop"]
-        #    {loop_body}
-        #      %"loop_counter.1" = fadd double %"loop_counter", 1.0
-        #      %"loop_cond.1" = fcmp olt double %"loop_counter.1", %"N.fp"
-        #      br i1 %"loop_cond.1", label %"loop", label %"end"
-        #
-        #    end:
-        #      %"ret.fp" = phi double [0.0, %"entry"], [%"loop_counter", %"loop"]
-        #      %"ret" = fptosi double %"ret.fp" to i64
-        #      ret i64 %"ret"
-        #    }}
-        #    ''').format(
-        #        loop_body=textwrap.indent(self._loop_body, '  '))
-        return textwrap.dedent('''\
-            define i64 @"test"(i64 %"N")
-            {{
-            entry:
-              %"loop_cond" = icmp slt i64 0, %"N"
-              br i1 %"loop_cond", label %"loop", label %"end"
-
-            loop:
-              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-            {loop_body}
-              %"loop_counter.1" = add i64 %"loop_counter", 1
-              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-              br i1 %"loop_cond.1", label %"loop", label %"end"
-
-            end:
-              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
-              ret i64 %"ret"
-            }}
-            ''').format(
-            loop_body=textwrap.indent(self._loop_body, '  '))
-
-    def prepare_arguments(self, previous_args=None, time_factor=1.0):
-        """Build argument tuple, to be passed to low level function."""
-        if previous_args is None:
-            return 100,
-        else:
-            return int(previous_args[0] * time_factor),
-
-    def get_iterations(self, args):
-        """Return number of iterations performed, based on lower level function arguments."""
-        return args[0]
-
-    def get_llvm_module(self):
-        """Build and return LLVM module from LLVM IR code."""
-        if not hasattr(self, '_llvm_module'):
-            self._llvm_module = llvm.parse_assembly(self.get_ir())
-            self._llvm_module.verify()
-        return self._llvm_module
-
-    def get_target_machine(self):
-        """Instantiate and return target machine."""
-        if not hasattr(self, '_llvm_module'):
-            features = llvm.get_host_cpu_features().flatten()
-            cpu = llvm.get_host_cpu_name()
-            self._tm = llvm.Target.from_default_triple().create_target_machine(
-                cpu=cpu, features=features, opt=1)
-        return self._tm
-
-    def get_assembly(self):
-        """Compile and return assembly from LLVM module."""
-        tm = self.get_target_machine()
-        tm.set_asm_verbosity(0)
-        return tm.emit_assembly(self.get_llvm_module())
-
-    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
-        # Compile the module to machine code using MCJIT
-        tm = self.get_target_machine()
-        runtimes = []
-        args = self.prepare_arguments()
-        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
-            ee.finalize_object()
-
-            # Obtain a pointer to the compiled 'sum' - it's the address of its JITed
-            # code in memory.
-            cfptr = ee.get_function_address('test')
-
-            # To convert an address to an actual callable thing we have to use
-            # CFUNCTYPE, and specify the arguments & return type.
-            cfunc = self._function_ctype(cfptr)
-
-            # Now 'cfunc' is an actual callable we can invoke
-            # TODO replace time.clock with a C implemententation for less overhead
-            # TODO return result in machine readable format
-            fixed_args = False
-            for i in range(repeat):
-                while True:
-                    start = time.perf_counter()
-                    res = cfunc(*args)
-                    end = time.perf_counter()
-                    elapsed = end - start
-                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
-                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
-                        factor = target_elapsed / elapsed
-                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
-                        continue
-                    else:
-                        # After we have the right argument choice, we keep it.
-                        fixed_args = True
-                        break
-
-                runtimes.append(elapsed)
-
-        return {'iterations': self.get_iterations(args),
-                'arguments': args,
-                'runtimes': runtimes,
-                'frequency': psutil.cpu_freq().current * 1e6}
-
-    @classmethod
-    def get_latency(cls, max_serial=6, print_table=False, **kwargs):
-        if print_table:
-            print(' s |' + ''.join([' {:^5}'.format(i) for i in range(1, max_serial)]))
-            print('   | ', end='')
-        serial_runs = []
-        for s in range(1, max_serial):
-            m = cls(serial=s, parallel=1, **kwargs)
-            r = m.build_and_execute(repeat=1)
-            cy_per_it = min(r['runtimes']) * r['frequency'] / (
-                        r['iterations'] * m.parallel * m.serial)
-            if print_table:
-                print('{:.3f} '.format(cy_per_it), end='')
-            sys.stdout.flush()
-
-            serial_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
-
-        if print_table:
-            print()
-            print('LAT: {lat[0]}/{lat[1]}cy (min. error {lat[2]:.1%})'.format(
-                lat=min(serial_runs)[1]))
-
-        return min(serial_runs)[1]
-
-    @classmethod
-    def get_throughput(cls, max_serial=6, max_parallel=17, print_table=False, **kwargs):
-        if print_table:
-            print('s\p |' + ''.join([' {:^5}'.format(i) for i in range(2, max_parallel)]))
-        parallel_runs = []
-        for s in range(1, max_serial):
-            if print_table:
-                print('{:>3} | '.format(s), end='')
-            for p in range(2, max_parallel):
-                m = cls(serial=s, parallel=p, **kwargs)
-                r = m.build_and_execute(repeat=1)
-                cy_per_it = min(r['runtimes']) * r['frequency'] / (
-                            r['iterations'] * m.parallel * m.serial)
-                if print_table:
-                    print('{:.3f} '.format(cy_per_it), end='')
-                sys.stdout.flush()
-                parallel_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
-            if print_table:
-                print()
-
-        if print_table:
-            print('TP: {tp[0]}/{tp[1]}cy (min. error {tp[2]:.1%});'.format(
-                tp=min(parallel_runs)[1]))
-
-        return min(parallel_runs)[1]
-
-
-class InstructionBenchmark(Benchmark):
-    def __init__(self, instruction='addq $1, $0',
-                 dst_operands=(),
-                 dstsrc_operands=(('r', 'i64', '0'),),
-                 src_operands=(('i', 'i64', '1'),),
-                 parallel=10,
-                 serial=4):
-        """
-        Build LLVM IR for arithmetic instruction benchmark without memory references.
-
-        Currently only one destination (dst) or combined destination and source (dstsrc) operand
-        is allowed. Only instruction's operands ($N) refer to the order of opernads found in
-        dst + dstsrc + src.
-        """
-        Benchmark.__init__(self, parallel=parallel, serial=serial)
-        self.instruction = instruction
-        self.dst_operands = dst_operands
-        self.dstsrc_operands = dstsrc_operands
-        self.src_operands = src_operands
-        self._loop_body = ''
-        if len(dst_operands) + len(dstsrc_operands) != 1:
-            raise NotImplemented("Must have exactly one dst or dstsrc operand.")
-        if not all([op[0] in 'irx'
-                    for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)]):
-            raise NotImplemented("This class only supports register and immediate operands.")
-
-        # Part 1: PHI functions and initializations
-        for i, dstsrc_op in enumerate(dstsrc_operands):
-            # constraint code, llvm type string, initial value
-            if dstsrc_op[0] in 'rx':
-                # register operand
-                for p in range(self.parallel):
-                    self._loop_body += (
-                        '%"dstsrc{index}_{p}" = phi {type} '
-                        '[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n').format(
-                        index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p)
-            else:
-                raise NotImplemented("Operand type in {!r} is not yet supported.".format(dstsrc_op))
-
-        # Part 2: Inline ASM call
-        # Build constraint string from operands
-        constraints = ','.join(
-            ['=' + dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] +
-            [sop[0] for sop in itertools.chain(src_operands)] +
-            ['{}'.format(i + len(dst_operands)) for i in range(len(dstsrc_operands))])
-
-        for i, dstsrc_op in enumerate(dstsrc_operands):
-            # Build instruction from instruction and operands
-            # TODO support multiple dstsrc operands
-            # TODO support dst and dstsrc operands at the same time
-            for p in range(self.parallel):
-                operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands]
-                for j, dop in enumerate(dstsrc_operands):
-                    operands.append('{type} %dstsrc{index}_{p}'.format(type=dop[1], index=j, p=p))
-                args = ', '.join(operands)
-
-                self._loop_body += (
-                    '%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect'
-                    ' "{instruction}", "{constraints}" ({args})\n').format(
-                    index=i,
-                    dst_type=dstsrc_op[1],
-                    instruction='\n'.join([instruction] * self.serial),
-                    constraints=constraints,
-                    args=args,
-                    p=p)
-
-        for i, dst_op in enumerate(dst_operands):
-            # Build instruction from instruction and operands
-            # TODO support multiple dst operands
-            # TODO support dst and dstsrc operands at the same time
-            if self.serial != 1:
-                raise NotImplemented("Serial > 1 and dst operand is not supported.")
-            for p in range(self.parallel):
-                operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands]
-                args = ', '.join(operands)
-
-                self._loop_body += (
-                    '%"dst{index}_{p}.out" = call {dst_type} asm sideeffect'
-                    ' "{instruction}", "{constraints}" ({args})\n').format(
-                    index=i,
-                    dst_type=dst_op[1],
-                    instruction=instruction,
-                    constraints=constraints,
-                    args=args,
-                    p=p)
-
-
-class AddressGenerationBenchmark(Benchmark):
-    def __init__(self,
-                 offset=('i', 'i64', '0x42'),
-                 base=('r', 'i64', '0'),
-                 index=('r', 'i64', '0'),
-                 width=('i', None, '4'),
-                 destination='base',
-                 parallel=10,
-                 serial=4):
-        """
-        Benchmark for address generation modes.
-
-        Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r' (register)
-        or 'i' (immediate) and initial_value a string.
-        E.g., ('r', 'i64', '0') or ('i', None, '4')
-
-        +--------------------------------+-----------------------------+
-        | Mode                           | AT&T                        |
-        +--------------------------------+-----------------------------+
-        | Offset                         | leal           0x0100, %eax | <- no latency support
-        | Base                           | leal           (%esi), %eax |
-        | Offset + Base                  | leal         -8(%ebp), %eax |
-        | Offset + Index*Width           | leal   0x100(,%ebx,4), %eax |
-        | Offset + Base + Index*Width    | leal 0x8(%edx,%ebx,4), %eax |
-        +--------------------------------+-----------------------------+
-        OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width
-        offset: immediate integer (+/-)
-        base: register
-        index: register
-        width: immediate 1,2,4 or 8
-        """
-        Benchmark.__init__(self, parallel=parallel, serial=serial)
-        self.offset = offset
-        self.base = base
-        self.index = index
-        self.width = width
-        self.destination = destination
-        self.parallel = parallel
-        # Sanity checks:
-        if bool(index) ^ bool(width):
-            raise ValueError("Index and width both need to be set, or be None.")
-        elif index and width:
-            if width[0] != 'i' or int(width[2]) not in [1, 2, 4, 8]:
-                raise ValueError("Width may only be immediate 1,2,4 or 8.")
-            if index[0] != 'r':
-                raise ValueError("Index must be a register.")
-
-        if offset and offset[0] != 'i':
-            raise ValueError("Offset must be an immediate.")
-        if base and base[0] != 'r':
-            raise ValueError("Offset must be a register.")
-
-        if not index and not width and not offset and not base:
-            raise ValueError("Must provide at least an offset or base.")
-
-        if destination == 'base' and not base:
-            raise ValueError("Destination may only be set to 'base' if base is set.")
-        elif destination == 'index' and not index:
-            raise ValueError("Destination may only be set to 'index' if index is set.")
-        elif destination not in ['base', 'index']:
-            raise ValueError("Destination must be set to 'base' or 'index'.")
-
-        if not base and not index:
-            raise ValueError("Either base or index must be set for latency test to work.")
-
-        if serial != 1 and not (base or index):
-            raise ValueError("Serial > 1 only works with index and/or base in use.")
-
-        self._loop_body = ''
-
-        ops = ''
-        if offset:
-            ops += offset[2]
-        if base:
-            ops += '($0'
-            if width and index:
-                ops += ',$1,{}'.format(width[2])
-            ops += ')'
-
-            if destination == 'base':
-                ops += ', $0'
-            else:  # destination == 'index'
-                ops += ', $1'
-        else:
-            if width and index:
-                ops += '(,$0,{}), $0'.format(width[2])
-        ops += ' '
-
-        if destination == 'base':
-            destination_reg = base
-        else:  # destination == 'index'
-            destination_reg = index
-
-        # Part 1: PHI function for destination
-        for p in range(parallel):
-            self._loop_body += (
-                '%"{name}_{p}.0" = '
-                'phi {type} [{initial}, %"entry"], [%"{name}_{p}.{s}", %"loop"]\n').format(
-                name=destination, type=destination_reg[1], initial=destination_reg[2], p=p,
-                s=self.serial)
-
-        for p in range(parallel):
-            for s in range(self.serial):
-                constraints = '=r,r'
-                if base and index:
-                    constraints += ',r'
-                    if destination == 'base':
-                        args = '{base_type} %"{base_name}_{p}.{s_in}", {index_type} {index_value}'.format(
-                            base_type=base[1], base_name=destination,
-                            index_type=index[1], index_value=index[2], p=p, s_in=s)
-                    else:  # destination == 'index':
-                        args = '{base_type} {base_value}, {index_type} %"{index_name}_{p}.{s_in}"'.format(
-                            base_type=base[1], base_value=base[2],
-                            index_type=index[1], index_name=destination, p=p, s_in=s)
-                else:
-                    args = '{type} %"{name}_{p}.{s_in}"'.format(
-                        type=destination_reg[1], name=destination, p=p, s_in=s)
-
-                self._loop_body += (
-                    '%"{name}_{p}.{s_out}" = call {type} asm sideeffect'
-                    ' "lea {ops}", "{constraints}" ({args})\n').format(
-                    name=destination,
-                    type=destination_reg[1],
-                    ops=ops,
-                    constraints=constraints,
-                    args=args,
-                    p=p,
-                    s_out=s + 1)
-
-
-class LoadBenchmark(Benchmark):
-    def __init__(self, chain_length=2048, structure='linear', parallel=6, serial=4):
-        """
-        Benchmark for L1 load using pointer chasing.
-
-        *chain_length* is the number of pointers to place in memory.
-        *structure* may be 'linear' (1-offsets) or 'random'.
-        """
-        Benchmark.__init__(self, parallel=parallel, serial=serial)
-        self._loop_body = ''
-        element_type = ctypes.POINTER(ctypes.c_int)
-        self._function_ctype = ctypes.CFUNCTYPE(
-            ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int)
-        self.chain_length = chain_length
-        self.parallel = parallel
-        self.structure = structure
-        self._pointer_field = (element_type * chain_length)()
-        if chain_length % serial != 0:
-            raise ValueError(
-                "chain_length ({}) needs to be divisible by serial factor ({}).".format(
-                    chain_length, serial))
-
-        # Initialize pointer field
-        # Field must represent a ring of pointers
-        if structure == 'linear':
-            for i in range(chain_length):
-                self._pointer_field[i] = ctypes.cast(
-                    ctypes.pointer(self._pointer_field[(i + 1) % chain_length]), element_type)
-        elif structure == 'random':
-            shuffled_indices = list(range(chain_length))
-            random.shuffle(shuffled_indices)
-            for i in range(chain_length):
-                self._pointer_field[shuffled_indices[i]] = ctypes.cast(
-                    ctypes.pointer(self._pointer_field[shuffled_indices[(i + 1) % chain_length]]),
-                    element_type)
-        else:
-            raise ValueError("Given structure is not supported. Supported are: "
-                             "linear and random.")
-
-    def prepare_arguments(self, previous_args=None, time_factor=1.0):
-        """Build argument tuple, to be passed to low level function."""
-        if previous_args is None:
-            return self._pointer_field, 100
-        else:
-            return previous_args[0], int(previous_args[1] * time_factor)
-
-    def get_iterations(self, args):
-        """Return number of iterations performed, based on lower level function arguments."""
-        return self.chain_length * args[1]
-
-    def get_ir(self):
-        """
-        Return LLVM IR equivalent of (in case of parallel == 1 and serial == 1):
-
-        int test(int** ptrf, int repeat) {
-            int** p0 = (int**)ptrf[0];
-            int i = 0;
-            while(i < N) {
-                int** p = (int**)*p0;
-                while(p != p0) {
-                    p = (int**)*p;
-                }
-                i++;
-            }
-            return i;
-        }
-        """
-        ret = textwrap.dedent('''
-        define i32 @test(i32** %"ptrf_0", i32 %"repeats") {
-        entry:
-        ''')
-        # Load pointer to ptrf[p] and p0
-        for p in range(self.parallel):
-            if p > 0:
-                ret += '  %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(p=p)
-            ret += (
-                '  %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n'
-                '  %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p)
-
-        ret += textwrap.dedent('''
-            %"cmp.entry" = icmp sgt i32 %"repeats", 0
-            br i1 %"cmp.entry", label %"loop0", label %"end"
-
-        loop0:
-            br label %"loop1"
-
-        loop1:
-            %"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ]
-            br label %"loop2"
-
-        loop2:\n''')
-
-        for p in range(self.parallel):
-            ret += ('  %"p_{p}.0" = phi i32** '
-                    '[ %"p0_{p}", %"loop1" ], [ %"p_{p}.{s_max}", %"loop2" ]\n').format(
-                p=p, s_max=self.serial)
-
-        # load p, compare to p0 and or-combine results
-        for p in range(self.parallel):
-            for s in range(self.serial):
-                ret += ('  %"pp_{p}.{s}" = bitcast i32** %"p_{p}.{s_prev}" to i32***\n'
-                        '  %"p_{p}.{s}" = load i32**, i32*** %"pp_{p}.{s}", align 8\n').format(
-                    p=p, s=s + 1, s_prev=s)
-
-            # Compare is needed for all registers, for llvm not to remove unused 
-            # instructions:
-            ret += '  %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.{s_max}", %"p0_{p}"\n'.format(
-                p=p, s_max=self.serial)
-
-        # TODO tree reduce cmp to make use of all cmp_* values
-
-        # It is sufficient to use only one compare, all others will be eliminated
-        ret += '  br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n'
-
-        ret += textwrap.dedent('''
-        loop3:
-            %"i.1" = add i32 %"i", 1
-            %"cmp.loop3" = icmp eq i32 %"i.1", %"repeats"
-            br i1 %"cmp.loop3", label %"end", label %"loop1"
-
-        end:
-            %"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ]
-            ret i32 %"ret"
-        }''')
-        return ret
-
-
-if __name__ == '__main__':
-    llvm.initialize()
-    llvm.initialize_native_target()
-    llvm.initialize_native_asmprinter()
-    llvm.initialize_native_asmparser()
-
-    modules = collections.OrderedDict()
-
-    # immediate source
-    modules['add i64 r64 LAT'] = InstructionBenchmark(
-        instruction='addq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('i', 'i64', '1'),),
-        parallel=1,
-        serial=5)
-
-    # register source
-    modules['add r64 r64 LAT'] = InstructionBenchmark(
-        instruction='addq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('r', 'i64', '1'),),
-        parallel=1,
-        serial=5)
-
-    # multiple instructions
-    modules['4xadd i64 r64 LAT'] = InstructionBenchmark(
-        instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('i', 'i64', '1'),),
-        parallel=1,
-        serial=5)
-
-    # immediate source
-    modules['add i64 r64 TP'] = InstructionBenchmark(
-        instruction='addq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('i', 'i64', '1'),),
-        parallel=10,
-        serial=5)
-
-    # register source
-    modules['add r64 r64 TP'] = InstructionBenchmark(
-        instruction='addq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('r', 'i64', '1'),),
-        parallel=10,
-        serial=5)
-
-    # multiple instructions
-    modules['4xadd i64 r64 TP'] = InstructionBenchmark(
-        instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('i', 'i64', '1'),),
-        parallel=10,
-        serial=1)
-
-    modules['lea base LAT'] = AddressGenerationBenchmark(
-        offset=None,
-        base=('r', 'i64', '666'),
-        index=None,
-        width=None,
-        destination='base',
-        parallel=1,
-        serial=5)
-
-    modules['lea base+offset LAT'] = AddressGenerationBenchmark(
-        offset=('i', None, '23'),
-        base=('r', 'i64', '666'),
-        index=None,
-        width=None,
-        destination='base',
-        parallel=1,
-        serial=5)
-
-    modules['lea index*width LAT'] = AddressGenerationBenchmark(
-        offset=None,
-        base=None,
-        index=('r', 'i64', '1'),
-        width=('i', None, '4'),
-        destination='index',
-        parallel=1,
-        serial=5)
-
-    modules['lea offset+index*width LAT'] = AddressGenerationBenchmark(
-        offset=('i', 'i64', '-0x8'),
-        base=None,
-        index=('r', 'i64', '51'),
-        width=('i', None, '4'),
-        destination='index',
-        parallel=1,
-        serial=5)
-
-    modules['lea base+index*width LAT'] = AddressGenerationBenchmark(
-        offset=None,
-        base=('r', 'i64', '23'),
-        index=('r', 'i64', '12'),
-        width=('i', None, '4'),
-        destination='base',
-        parallel=1,
-        serial=5)
-
-    modules['lea base+offset+index*width LAT'] = AddressGenerationBenchmark(
-        offset=('i', None, '42'),
-        base=('r', 'i64', '23'),
-        index=('r', 'i64', '12'),
-        width=('i', None, '4'),
-        destination='base',
-        parallel=1,
-        serial=5)
-
-    modules['lea base TP'] = AddressGenerationBenchmark(
-        offset=None,
-        base=('r', 'i64', '666'),
-        index=None,
-        width=None,
-        destination='base',
-        parallel=10,
-        serial=1)
-
-    modules['lea base+offset TP'] = AddressGenerationBenchmark(
-        offset=('i', None, '23'),
-        base=('r', 'i64', '666'),
-        index=None,
-        width=None,
-        destination='base',
-        parallel=10,
-        serial=1)
-
-    modules['lea index*width TP'] = AddressGenerationBenchmark(
-        offset=None,
-        base=None,
-        index=('r', 'i64', '1'),
-        width=('i', None, '4'),
-        destination='index',
-        parallel=10,
-        serial=1)
-
-    modules['lea offset+index*width TP'] = AddressGenerationBenchmark(
-        offset=('i', 'i64', '-0x8'),
-        base=None,
-        index=('r', 'i64', '51'),
-        width=('i', None, '4'),
-        destination='index',
-        parallel=10,
-        serial=1)
-
-    modules['lea base+index*width TP'] = AddressGenerationBenchmark(
-        offset=None,
-        base=('r', 'i64', '23'),
-        index=('r', 'i64', '12'),
-        width=('i', None, '4'),
-        destination='base',
-        parallel=10,
-        serial=1)
-
-    modules['lea base+offset+index*width TP'] = AddressGenerationBenchmark(
-        offset=('i', None, '42'),
-        base=('r', 'i64', '23'),
-        index=('r', 'i64', '12'),
-        width=('i', None, '4'),
-        destination='base',
-        parallel=10,
-        serial=1)
-
-    modules['LD linear LAT'] = LoadBenchmark(
-        chain_length=2048,  # 2048 * 8B = 16kB
-        structure='linear',
-        parallel=1,
-        serial=2)
-
-    modules['LD random LAT'] = LoadBenchmark(
-        chain_length=2048,  # 2048 * 8B = 16kB
-        structure='random',
-        parallel=1,
-        serial=2)
-
-    modules['LD linear TP'] = LoadBenchmark(
-        chain_length=2048,  # 2048 * 8B = 16kB
-        structure='linear',
-        parallel=4,
-        serial=2)
-
-    modules['LD random TP'] = LoadBenchmark(
-        chain_length=2048,  # 2048 * 8B = 16kB
-        structure='random',
-        parallel=4,
-        serial=2)
-
-    modules['vaddpd x<4 x double> x<4 x double> x<4 x double> LAT'] = InstructionBenchmark(
-        instruction='vaddpd $1, $0, $0',
-        dst_operands=(),
-        dstsrc_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4))),),
-        src_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4))),),
-        parallel=1,
-        serial=5)
-
-    modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) LAT'] = InstructionBenchmark(
-        instruction='vmulpd $1, $0, $0',
-        dst_operands=(),
-        dstsrc_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4))),),
-        src_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4))),),
-        parallel=1,
-        serial=5)
-
-    # This is actually a TP benchmark with parallel=1, because there are no inter-loop depencies:
-    modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) TP'] = InstructionBenchmark(
-        instruction='vmulpd $1, $2, $0',
-        dst_operands=(),
-        dstsrc_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4))),),
-        src_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4))),),
-        parallel=10,
-        serial=1)
-
-    # modules = collections.OrderedDict([(k, v) for k,v in modules.items() if k.startswith('lea base LAT')])
-
-    verbose = 2 if '-v' in sys.argv else 0
-    for key, module in modules.items():
-        if verbose > 0:
-            print("=== Benchmark")
-            print(repr(module))
-            print("=== LLVM")
-            print(module.get_ir())
-            print("=== Assembly")
-            print(module.get_assembly())
-        r = module.build_and_execute(repeat=3)
-        if verbose > 0:
-            print("=== Result")
-            pprint.pprint(r)
-
-        cy_per_it = min(r['runtimes']) * r['frequency'] / (
-                    r['iterations'] * module.parallel * module.serial)
-        print('{key:<32} {cy_per_it:.3f} cy/It with {runtime_sum:.4f}s'.format(
-            key=key,
-            module=module,
-            cy_per_it=cy_per_it,
-            runtime_sum=sum(r['runtimes'])))
-
-    # InstructionBenchmark.get_latency(
-    #    instruction='vmulpd $1, $0, $0',
-    #    dst_operands=(),
-    #    dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
-    #    src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
-    #                  ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
-    #    print_table=True)
-    # InstructionBenchmark.get_throughput(
-    #    instruction='vmulpd $1, $0, $0',
-    #    dst_operands=(),
-    #    dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
-    #    src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
-    #                  ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
-    #    print_table=True)
-    #
-    # InstructionBenchmark.get_latency(
-    #    instruction='nop',
-    #    dst_operands=(),
-    #    dstsrc_operands=(('r','i8', '0'),),
-    #    src_operands=(),
-    #    print_table=True)
-    # InstructionBenchmark.get_throughput(
-    #    instruction='nop',
-    #    dst_operands=(),
-    #    dstsrc_operands=(('r','i8', '0'),),
-    #    src_operands=(),
-    #    print_table=True)
--- a/asmjit/op.py
+++ b/asmjit/op.py
@@ -1,501 +0,0 @@
-#!/usr/bin/env python3
-import re
-
-# TODO use abc to force implementation of interface requirements
-
-init_value_by_llvm_type = {'i' + bits: '3' for bits in ['1', '8', '16', '32', '64']}
-# LLVM requires floating point constants to have a non-repeating binary representation
-# See http://llvm.org/docs/LangRef.html#simple-constants for details
-init_value_by_llvm_type.update({fp_type: str(1+1/2**10)
-                                for fp_type in ['float', 'double', 'fp128']})
-# For vector-types we reuse the scalar values
-init_value_by_llvm_type.update(
-    {'<{} x {}>'.format(vec, t): '<' + ', '.join([t + ' ' + v] * vec) + '>'
-     for t, v in init_value_by_llvm_type.items()
-     for vec in [2, 4, 8, 16, 32, 64]})
-
-
-class NotSerializableError(Exception):
-    pass
-
-class Operand:
-    def __init__(self, llvm_type):
-        self.llvm_type = llvm_type
-
-    def get_constraint_char(self):
-        raise NotImplementedError()
-
-    def __repr__(self):
-        return '{}({})'.format(
-            self.__class__.__name__,
-            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
-                       if not k.startswith('_')]))
-
-    @staticmethod
-    def from_string(s):
-        options = [Register.from_string, Immediate.from_string, MemoryReference.from_string]
-        for o in options:
-            try:
-                return o(s)
-            except ValueError:
-                continue
-        raise ValueError("No matching operand type found for '{}'.".format(s))
-
-
-class Immediate(Operand):
-    def __init__(self, llvm_type, value):
-        Operand.__init__(self, llvm_type)
-        self.value = value
-
-    def get_constraint_char(self):
-        return 'i'
-
-    @classmethod
-    def from_string(cls, s):
-        """
-        Create Immediate object from string.
-
-        :param s: must have the form: "llvm_type:value"
-        """
-        llvm_type, value = s.split(':', 1)
-        value_regex = r'(0x[0-9a-fA-F]+|[0-9]+(\.[0-9]+)?)'
-        if not re.match(value_regex, value):
-            raise ValueError("Invalid immediate value, must match {!r}".format(value_regex))
-        return cls(llvm_type, value)
-
-
-class MemoryReference(Operand):
-    """
-    offset + base + index*width
-
-    OFFSET(BASE, INDEX, WIDTH) in AT&T assembly
-
-    Possible operand values:
-        offset: immediate integer (+/-)
-        base: register
-        index: register
-        width: immediate 1,2,4 or 8
-    """
-
-    def __init__(self, llvm_type, offset=None, base=None, index=None, width=None):
-        super().__init__(llvm_type)
-        self.offset = offset
-        self.base = base
-        self.index = index
-        self.width = width
-
-        # Sanity checks:
-        if bool(index) ^ bool(width):
-            raise ValueError("Index and width both need to be set, or None.")
-        elif index and width:
-            if not (isinstance(width, Immediate) and int(width.value) in [1, 2, 4, 8]):
-                raise ValueError("Width may only be immediate 1,2,4 or 8.")
-            if not isinstance(index, Register):
-                raise ValueError("Index must be a register.")
-
-        if offset and not isinstance(offset, Immediate):
-            raise ValueError("Offset must be an immediate.")
-        if base and not isinstance(base, Register):
-            raise ValueError("Offset must be a register.")
-
-        if not index and not width and not offset and not base:
-            raise ValueError("Must provide at least an offset or base.")
-
-    def get_constraint_char(self):
-        return 'm'
-
-    def get_registers(self):
-        if self.base:
-            yield self.base
-        if self.index:
-            yield self.index
-
-    @classmethod
-    def from_string(cls, s):
-        """
-        Create MemoryReference from string.
-
-        :param s: must fulfill the regex: "mem:[bdis]+"
-        """
-        m = re.match(r"\*([^:]+):([obiw]+)", s)
-        if not m:
-            raise ValueError("Invalid format, must match 'mem:[obiw]+'.")
-        else:
-            llvm_type, features = m.groups()
-            offset = None
-            if 'o' in features:
-                offset = Immediate('i32', 8)
-            base = None
-            if 'b' in features:
-                base = Register('i64', 'r')
-            index = None
-            if 'i' in features:
-                index = Register('i64', 'r')
-            width = None
-            if 'w' in features:
-                width = Immediate('i32', 8)
-            return cls(llvm_type, offset=offset, base=base, index=index, width=width)
-
-
-class Register(Operand):
-    def __init__(self, llvm_type, constraint_char='r'):
-        super().__init__(llvm_type)
-        self.constraint_char = constraint_char
-
-    def get_constraint_char(self):
-        return self.constraint_char
-
-    @classmethod
-    def from_string(cls, s):
-        """
-        Create Register object from string.
-
-        :param s: must have the form: "llvm_type:constraint_char"
-        """
-        llvm_type, constraint_char = s.split(':', 1)
-        valid_cc = 'rx'
-        if constraint_char not in valid_cc:
-            raise ValueError("Invalid constraint character, must be one of {!r}".format(valid_cc))
-        return cls(llvm_type, constraint_char)
-
-
-class Synthable:
-    def __init__(self):
-        pass
-
-    def build_ir(self, dst_reg_names, src_reg_names, used_registers):
-        raise NotImplementedError()
-
-    def get_source_registers(self):
-        raise NotImplementedError()
-
-    def get_destination_registers(self):
-        raise NotImplementedError()
-
-    @staticmethod
-    def _get_unused_reg_name(used_registers):
-        name = None
-        i = 0
-        while name in used_registers or name is None:
-            name = '%"reg.{}"'.format(i)
-            i += 1
-        used_registers.add(name)
-        return name
-
-    def get_default_init_values(self):
-        return [init_value_by_llvm_type[reg.llvm_type] for reg in self.get_source_registers()]
-
-    def __repr__(self):
-        return '{}({})'.format(
-            self.__class__.__name__,
-            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
-                       if not k.startswith('_')]))
-
-
-class Operation(Synthable):
-    """Base class for operations."""
-
-
-class Instruction(Operation):
-    def __init__(self, instruction, destination_operand, source_operands):
-        super().__init__()
-        self.instruction = instruction
-        self.destination_operand = destination_operand
-        assert isinstance(destination_operand, Register), "Destination needs to be a register."
-        self.source_operands = source_operands
-
-    def get_source_registers(self):
-        sop_types = set()
-        sr = []
-        for sop in self.source_operands:
-            if isinstance(sop, Register):
-                if sop.llvm_type not in sop_types:
-                    sop_types.add(sop.llvm_type)
-                    sr.append(sop)
-            elif isinstance(sop, MemoryReference):
-                sr += list(sop.get_registers())
-
-        return sr
-
-    def get_destination_registers(self):
-        if isinstance(self.destination_operand, Register):
-            return [self.destination_operand]
-        else:
-            return []
-
-    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
-        """
-        Build IR string based on in and out operand names and types.
-        """
-        if used_registers is None:
-            used_registers = set(dst_reg_names + src_reg_names)
-
-        # Build constraint string from operands
-        constraints = ','.join(
-            ['=' + self.destination_operand.get_constraint_char()] +
-            [sop.get_constraint_char() for sop in self.source_operands])
-
-        # Build argument string from operands and register names
-        operands = []
-        sop_types = {}
-        i = 0
-        for sop in self.source_operands:
-            if isinstance(sop, Immediate):
-                operands.append('{type} {repr}'.format(
-                    type=sop.llvm_type,
-                    repr=sop.value))
-            elif isinstance(sop, Register):
-                if sop.llvm_type in sop_types:
-                    operands.append('{type} {repr}'.format(
-                        type=sop.llvm_type,
-                        repr=src_reg_names[sop_types[sop.llvm_type]]))
-                else:
-                    sop_types[sop.llvm_type] = i
-                    operands.append('{type} {repr}'.format(
-                        type=sop.llvm_type,
-                        repr=src_reg_names[i]))
-                    i += 1
-            elif isinstance(sop, MemoryReference):
-                operands.append('{type} {repr}'.format(
-                    type=sop.llvm_type,
-                    repr=src_reg_names[i]))
-                i += 1
-            else:
-                raise NotImplementedError("Only register and immediate operands are supported.")
-        args = ', '.join(operands)
-
-        # Build instruction from instruction and operands
-        return ('{dst_reg} = call {dst_type} asm '
-                ' "{instruction}", "{constraints}" ({args})').format(
-            dst_reg=dst_reg_names[0],
-            dst_type=self.destination_operand.llvm_type,
-            instruction=self.instruction,
-            constraints=constraints,
-            args=args)
-
-    @classmethod
-    def from_string(cls, s):
-        """
-        Create Instruction object from string.
-
-        :param s: must have the form:
-                  "asm_instruction_name ({(src|dst|srcdst):llvm_type:constraint_char})+"
-        """
-        instruction = s
-        # It is important that the match objects are in reverse order, to allow string replacements
-        # based on original match group locations
-        operands = list(reversed(list(re.finditer(r"\{((?:src|dst)+):([^\}]+)\}", s))))
-        # Destination indices start at 0
-        dst_index = 0
-        # Source indices at "number of destination operands"
-        src_index = ['dst' in o.group(1) for o in operands].count(True)
-
-        dst_ops = []
-        src_ops = []
-        for m in operands:
-            direction, operand_string = m.group(1, 2)
-            operand = Operand.from_string(operand_string)
-            if 'src' in direction and not 'dst' in direction:
-                src_ops.append(operand)
-                # replace with index string
-                instruction = (instruction[:m.start()] + "${}".format(src_index)
-                               + instruction[m.end():])
-                src_index += 1
-            if 'dst' in direction:
-                dst_ops.append(operand)
-                # replace with index string
-                instruction = (instruction[:m.start()] + "${}".format(dst_index)
-                               + instruction[m.end():])
-                if 'src' in direction:
-                    src_ops.append(Register(operand_string.split(':', 1)[0], str(dst_index)))
-                    src_index += 1
-                dst_index += 1
-
-        if len(dst_ops) != 1:
-            raise ValueError("Instruction supports only single destinations.")
-        return cls(instruction, dst_ops[0], src_ops)
-
-
-class Load(Operation):
-    def __init__(self, chain_length, structure='linear'):
-        """
-        *chain_length* is the number of pointers to place in memory.
-        *structure* may be 'linear' (1-offsets) or 'random'.
-        """
-        super().__init__()
-        self.chain_length = chain_length
-        self.structure = structure
-    # TODO
-
-
-class AddressGeneration(Operation):
-    def __init__(self, offset, base, index, width, destination='base'):
-        super().__init__()
-        self.offset = offset
-        self.base = base
-        self.index = index
-        self.width = width
-        self.destination = destination
-        raise NotImplementedError()
-
-
-class Serialized(Synthable):
-    def __init__(self, synths):
-        super().__init__()
-        self.synths = synths
-        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"
-
-    def get_source_registers(self):
-        if self.synths:
-            return self.synths[0].get_source_registers()
-        else:
-            return []
-
-    def get_destination_registers(self):
-        if self.synths:
-            return self.synths[-1].get_destination_registers()
-        else:
-            return []
-
-    @staticmethod
-    def match(source_registers, destination_registers):
-        """
-        Find maximum number of matches from source (previous destinations) to
-        destination (current source) registers.
-
-        Return list of two-tuples of matches (src_idx, dst_idx)
-        """
-        matched_pairs = []
-        unmatched_dests = set(destination_registers)
-        for dst_idx, dst in enumerate(destination_registers):
-            for src_idx, src in enumerate(source_registers):
-                if src.llvm_type == dst.llvm_type:
-                    matched_pairs.append((src_idx, dst_idx))
-                    unmatched_dests.discard(dst)
-
-        return matched_pairs, unmatched_dests
-
-    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
-        reg_naming_out = []
-        dst_naming = []
-        last_s = None
-        for i, s in enumerate(self.synths):
-            if i == 0:
-                # first source is passed in from outside
-                src_naming = src_reg_names
-            else:
-                # match with previous destinations
-                src_naming = []
-                match = False
-                for src in s.get_source_registers():
-                    # Find matching destination from previous synths
-                    src_match = False
-                    for dst_idx, dst in enumerate(last_s.get_destination_registers()):
-                        if dst.llvm_type == src.llvm_type:
-                            match = src_match = True
-                            src_naming.append(dst_naming[dst_idx])
-                    # If source could not be matched, use constant value instead
-                    if not src_match:
-                        src_naming.append(init_value_by_llvm_type[src.llvm_type])
-                if not match:
-                    raise NotSerializableError("Unable to find match.")
-
-            if i == len(self.synths) - 1:
-                # last destination is passed in from outside
-                dst_naming = dst_reg_names
-            else:
-                # noinspection PyUnusedLocal
-                dst_naming = [self._get_unused_reg_name(used_registers)
-                              for j in s.get_destination_registers()]
-
-            reg_naming_out.append((dst_naming, src_naming))
-            last_s = s
-        return reg_naming_out, used_registers
-
-    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
-        if used_registers is None:
-            used_registers = set(dst_reg_names + src_reg_names)
-        reg_names, used_registers = self.generate_register_naming(
-            dst_reg_names, src_reg_names, used_registers)
-        code = []
-        for s, r in zip(self.synths, reg_names):
-            code.append(s.build_ir(*r, used_registers))
-        return '\n'.join(code)
-
-
-class Parallelized(Synthable):
-    def __init__(self, synths):
-        super().__init__()
-        self.synths = synths
-        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"
-
-    def get_source_registers(self):
-        sources = []
-        for s in self.synths:
-            sources += s.get_source_registers()
-        return sources
-
-    def get_destination_registers(self):
-        destinations = []
-        for s in self.synths:
-            destinations += s.get_destination_registers()
-        return destinations
-
-    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
-        # Split reg_naming among all synths
-        reg_naming_out = []
-        for s in self.synths:
-            n_dsts = len(s.get_destination_registers())
-            n_srcs = len(s.get_source_registers())
-            reg_naming_out.append((dst_reg_names[:n_dsts], src_reg_names[:n_srcs]))
-            dst_reg_names, src_reg_names = (dst_reg_names[n_dsts:], src_reg_names[n_srcs:])
-        return reg_naming_out, used_registers
-
-    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
-        if used_registers is None:
-            used_registers = set(dst_reg_names + src_reg_names)
-        reg_names, used_registers = self.generate_register_naming(
-            dst_reg_names, src_reg_names, used_registers)
-        code = []
-        for s, r in zip(self.synths, reg_names):
-            code.append(s.build_ir(*r, used_registers))
-        return '\n'.join(code)
-
-
-if __name__ == '__main__':
-    i1 = Instruction(
-        instruction='add $2, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Immediate('i64', '1')])
-    i2 = Instruction(
-        instruction='sub $2, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Immediate('i64', '1')])
-    i3 = Instruction(
-        instruction='mul $1, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Register('i64', 'r')])
-    i4 = Instruction(
-        instruction='div $2, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Immediate('i64', '23')])
-    i5 = Instruction(
-        instruction='mul $2, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Immediate('i64', '23')])
-    i6 = Instruction(
-        instruction='inc $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r')])
-    s1 = Serialized([i1, i2])
-    s2 = Serialized([s1, i3])
-    print(s1.build_ir(['%out'], ['%in']), '\n')
-    print(s2.build_ir(['%out'], ['%in']), '\n')
-    s3 = Serialized([i4, i5])
-    p1 = Parallelized([i6, s2, s3])
-    print(p1.build_ir(['%out.0', '%out.1', '%out.2'], ['%in.0', '%in.1', '%in.2']), '\n')
-
-    s4 = Serialized([i1, i2, i3, i4, i5, i6])
-    print(s4.build_ir(['%out'], ['%in']), '\n')
-
-    print(Instruction.from_string("add {src:i64:r} {srcdst:i64:r}"))
--- a/asmjit/sc18src.py
+++ b/asmjit/sc18src.py
@@ -1,243 +0,0 @@
-#!/usr/bin/env python3
-import collections
-import itertools
-import socket
-
-import numpy
-import matplotlib.pyplot as plt
-import matplotlib as mpl
-
-from asmjit import op, bench
-from asmjit import oldjit
-
-
-def jit_based_benchs():
-    modules = collections.OrderedDict()
-    modules['lea_b'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=('r', 'i64', '666'),
-            index=None,
-            width=None,
-            destination='base',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=('r', 'i64', '666'),
-            index=None,
-            width=None,
-            destination='base',
-            parallel=10,
-            serial=1))
-
-    modules['lea_b+off'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', None, '23'),
-            base=('r', 'i64', '666'),
-            index=None,
-            width=None,
-            destination='base',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', None, '23'),
-            base=('r', 'i64', '666'),
-            index=None,
-            width=None,
-            destination='base',
-            parallel=10,
-            serial=1))
-
-    modules['lea_idx*w'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=None,
-            index=('r', 'i64', '1'),
-            width=('i', None, '4'),
-            destination='index',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=None,
-            index=('r', 'i64', '1'),
-            width=('i', None, '4'),
-            destination='index',
-            parallel=10,
-            serial=1))
-
-    modules['lea_off+idx*w'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', 'i64', '-0x8'),
-            base=None,
-            index=('r', 'i64', '51'),
-            width=('i', None, '4'),
-            destination='index',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', 'i64', '-0x8'),
-            base=None,
-            index=('r', 'i64', '51'),
-            width=('i', None, '4'),
-            destination='index',
-            parallel=10,
-            serial=1))
-
-    modules['lea_b+idx*w'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=('r', 'i64', '23'),
-            index=('r', 'i64', '12'),
-            width=('i', None, '4'),
-            destination='base',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=('r', 'i64', '23'),
-            index=('r', 'i64', '12'),
-            width=('i', None, '4'),
-            destination='base',
-            parallel=10,
-            serial=1))
-
-    modules['lea_b+off+idx*w'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', None, '42'),
-            base=('r', 'i64', '23'),
-            index=('r', 'i64', '12'),
-            width=('i', None, '4'),
-            destination='base',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', None, '42'),
-            base=('r', 'i64', '23'),
-            index=('r', 'i64', '12'),
-            width=('i', None, '4'),
-            destination='base',
-            parallel=10,
-            serial=1))
-
-    modules['LD_linear'] = (
-        oldjit.LoadBenchmark(
-            chain_length=2048,  # 2048 * 8B = 16kB
-            structure='linear',
-            parallel=1,
-            serial=2),
-        oldjit.LoadBenchmark(
-            chain_length=2048,  # 2048 * 8B = 16kB
-            structure='linear',
-            parallel=4,
-            serial=2))
-
-    modules['LD_random'] = (
-        oldjit.LoadBenchmark(
-            chain_length=2048,  # 2048 * 8B = 16kB
-            structure='random',
-            parallel=1,
-            serial=2),
-        oldjit.LoadBenchmark(
-            chain_length=2048,  # 2048 * 8B = 16kB
-            structure='random',
-            parallel=4,
-            serial=2))
-
-    for name, mods in modules.items():
-        lat_module, tp_module = mods
-        r_lat = lat_module.build_and_execute(repeat=3)
-        cy_per_it_lat = min(r_lat['runtimes']) * r_lat['frequency'] / (
-                    r_lat['iterations'] * lat_module.parallel * lat_module.serial)
-        r_tp = tp_module.build_and_execute(repeat=3)
-        cy_per_it_tp = min(r_tp['runtimes']) * r_tp['frequency'] / (
-                    r_tp['iterations'] * tp_module.parallel * tp_module.serial)
-        print('{key:<16} LAT {cy_per_it_lat:.3f} cy  TP {cy_per_it_tp:.3f} cy'.format(
-            key=name,
-            cy_per_it_lat=cy_per_it_lat,
-            cy_per_it_tp=cy_per_it_tp))
-
-def plot_combined(single_measured, combined_measured):
-    instructions = list(single_measured.keys())
-    d = numpy.ndarray((len(single_measured), len(single_measured)))
-    d.fill(float('nan'))
-    for k, v in combined_measured.items():
-        i1, i2 = [instructions.index(i) for i in [c[0] for c in k]]
-        d[i1, i2] = v[2]
-    cmap = mpl.cm.get_cmap('plasma', 5)
-    cmap.set_bad('w') # default value is 'k'
-    fig = plt.figure(figsize=(10,10))
-    ax1 = fig.add_subplot(111)
-    cax = ax1.imshow(d, interpolation="nearest", cmap=cmap, norm=mpl.colors.Normalize(vmin=-.5, vmax=1.5))
-    ax1.set_xticks(range(len(instructions)))
-    ax1.set_xticklabels(instructions, rotation=90)
-    ax1.set_yticks(range(len(instructions)))
-    ax1.set_yticklabels(instructions)
-    ax1.set_title(socket.gethostname())
-    ax1.grid()
-    cb = fig.colorbar(cax, shrink=0.65)
-    cb.set_ticks([-.5, 0, 1, 1.5])
-    cb.set_ticklabels(['< -0.5', '0.0 (complete overlap)', '1.0 (no overlap)', '> 1.5'])
-    cb.set_label('inverse parallel overlap')
-    fig.tight_layout()
-    plt.show()
-
-
-if __name__ == '__main__':
-    bench.setup_llvm()
-    instructions = [
-        (i[0], i[1], op.Instruction.from_string(i[1]))
-        for i in [
-            ('ADD32ri', 'add {src:i32:1}, {srcdst:i32:r}'),
-            ('ADD64ri32', 'add {src:i32:1}, {srcdst:i64:r}'),
-            ('INC64r', 'inc {srcdst:i64:r}'),
-            ('SUB32ri', 'sub {src:i32:1}, {srcdst:i64:r}'),
-            ('MOV64ri32', 'mov {src:i32:1}, {srcdst:i64:r}'),
-            ('VINSERTF128rr', 'vinsertf128 {src:i8:0}, {src:<2 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
-            ('VCVTSI642SSrr', 'vcvtsi2ss {src:i64:r}, {src:float:x}, {dst:float:x}'),
-            ('VADDPDYrr', 'vaddpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
-            ('VADDSDrr', 'vaddsd {src:double:x}, {src:double:x}, {dst:double:x}'),
-            ('VADDSSrr', 'vaddss {src:float:x}, {src:float:x}, {dst:float:x}'),
-            ('VFMADD213PDYr', 'vfmadd213pd {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
-            ('VFMADD213PDr', 'vfmadd213pd {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
-            ('VFMADD213PSYr', 'vfmadd213ps {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
-            ('VFMADD213PSr', 'vfmadd213ps {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
-            ('VFMADD213SDr', 'vfmadd213sd {src:double:x}, {src:double:x}, {srcdst:double:x}'),
-            ('VFMADD213SSr', 'vfmadd213ss {src:float:x}, {src:float:x}, {srcdst:float:x}'),
-            ('VMULPDYrr', 'vmulpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
-            ('VMULSDrr', 'vmulsd {src:double:x}, {src:double:x}, {dst:double:x}'),
-            ('VMULSSrr', 'vmulss {src:float:x}, {src:float:x}, {dst:float:x}'),
-            ('VSUBSDrr', 'vsubsd {src:double:x}, {src:double:x}, {dst:double:x}'),
-            ('VSUBSSrr', 'vsubss {src:float:x}, {src:float:x}, {dst:float:x}'),
-            ('VDIVPDYrr', 'vdivpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
-            ('VDIVSDrr', 'vdivsd {src:double:x}, {src:double:x}, {dst:double:x}'),
-            ('VDIVSSrr', 'vdivss {src:float:x}, {src:float:x}, {dst:float:x}'),
-            ]
-    ]
-    instructions_measured = collections.OrderedDict()
-    for llvm_name, i_str, i in instructions:
-        lat, tp = bench.bench_instructions(
-            [i],
-            serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
-            verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)
-        print('{:<16}  LAT {:.3f} cy  TP {:.3f} cy'.format(llvm_name, lat, tp))
-        instructions_measured[llvm_name] = (lat, tp)
-
-    jit_based_benchs()
-
-    two_combinations_measured = collections.OrderedDict()
-
-    for a, b in itertools.combinations_with_replacement(instructions, 2):
-        lat, tp = bench.bench_instructions(
-            [a[2], b[2]],
-            serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
-            verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)
-        same_port_metric = ((
-            tp-max(instructions_measured[a[0]][1], instructions_measured[b[0]][1])) /
-            min(instructions_measured[a[0]][1], instructions_measured[b[0]][1]))
-        print('{:<16} {:<16}  LAT {:.3f} cy  TP {:.3f} cy  SPM {:>5.2f}'.format(
-            a[0], b[0], lat, tp, same_port_metric))
-        two_combinations_measured[(a[0], a[1]), (b[0], b[1])] = (lat, tp, same_port_metric)
-
-    plot_combined(instructions_measured, two_combinations_measured)
--- a/doc/sc18src_artifact_appendix.md
+++ b/doc/sc18src_artifact_appendix.md
@@ -8,7 +8,7 @@

 In order to construct an accurate instruction execution model for modern out-of-order micro architectures, an accurate description of instruction latency and throughput, as well as resource conflicts is indispensable. Already existing resources and vendor provided information is neither complete nor detailed enough and sometimes faulty. We therefore proclaim to deduct this information through runtime instruction benchmarking of single and composite instructions, and present a framework to support such investigations based on LLVM's just-in-time and cross-platform compilation capabilities.

-`pyasmjit` abstracts instructions, registers, immediates, memory operands and dependency chains, to easily construct benchmarks. The synthesized code is interactively compiled and executed using the `llvmlite` library, which in turn is based on the stable LLVM C-API. `pyasmjit` offers a command line as well as a programming interface.
+`asmbench` abstracts instructions, registers, immediates, memory operands and dependency chains to make benchmarks easy to construct. The synthesized code is interactively compiled and executed using the `llvmlite` library, which in turn is based on the stable LLVM C-API. `asmbench` offers a command-line interface as well as a programming interface.

 Unlike other approaches, we do not rely on model specific performance counters and focus on interoperability and automation to support quick modeling of many microarchitectures.

@@ -23,7 +23,7 @@ Unlike other approaches, we do not rely on model specific performance counters a

 ### A.2.2 How software can be obtained (if available)

-Check out https://github.com/RRZE-HPC/pyasmjit
+Check out https://github.com/RRZE-HPC/asmbench

 ### A.2.3 Hardware dependencies

@@ -47,15 +47,15 @@ None required, all included.

 ## A.3 Installation

-To install `asmjit` in the correct version and all its dependencies into the users home directory, execute: `pip3 install --user asmjit[sc18src]==0.1.2`.
+To install `asmbench` in the correct version and all its dependencies into the user's home directory, execute: `pip3 install --user asmbench[sc18src]==0.1.2`.

-Alternatively clone https://github.com/RRZE-HPC/pyasmjit with commit hash 515b28cb4e44426239e6161dc3a79d888a9e0e21 and install using included `setup.py`.
+Alternatively, clone https://github.com/RRZE-HPC/asmbench, check out commit 515b28cb4e44426239e6161dc3a79d888a9e0e21, and install using the included `setup.py`.

 ## A.4 Experiment workflow

 1. Fix frequency, e.g., using likwid: `likwid-setFrequencies -f <FREQ>`. `<FREQ>` should be the base clock for the specific model used.
 2. Disable turbo mode, e.g., using likwid: `likwid-setFrequencies -t 0`.
-3. Run `asmjit.sc18src` module: `python3 -m asmjit.sc18src`.
+3. Run `asmbench.sc18src` module: `python3 -m asmbench.sc18src`.

 ## A.5 Evaluation and expected result

--- a/run_SC18_SRC.py
+++ b/run_SC18_SRC.py
@@ -2,7 +2,7 @@
 import collections
 import itertools

-from asmjit import op, bench, oldjit
+from asmbench import op, bench, oldjit


 def jit_based_benchs():
--- a/setup.py
+++ b/setup.py
@@ -5,13 +5,13 @@ with open('README.md') as f:
    long_description = f.read()

 setup(
-    name='asmjit',
-    version='0.1.2',
+    name='asmbench',
+    version='0.1.0',
    packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
    url='',
    license='AGPLv3',
    author='Julian Hammer',
-    author_email='julian.hammer@u-sys.org',
+    author_email='julian.hammer@fau.de',
    description='A Benchmark Toolkit for Assembly Instructions Using the LLVM JIT',
    long_description_content_type='text/markdown',
    long_description=long_description,
--- a/tablegen.py
+++ b/tablegen.py
@@ -10,7 +10,7 @@ import argparse
 import random
 from pprint import pprint

-from asmjit import op, bench
+from asmbench import op, bench


 def split_list(raw):