Revert "fixed up to work with latest kerncraft"

This reverts commit 2ccfb0c9ea.
2025-07-20 20:21:05 +02:00 · 2020-05-27 14:09:13 +02:00
parent 2ccfb0c9ea
commit 9c511f9ddf
80 changed files with 4 additions and 2327784 deletions
--- a/.idea/asmbench.iml
+++ b/.idea/asmbench.iml
@@ -1,12 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<module type="PYTHON_MODULE" version="4">
-  <component name="NewModuleRootManager">
-    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
-    <orderEntry type="sourceFolder" forTests="false" />
-  </component>
-  <component name="TestRunnerService">
-    <option name="projectConfiguration" value="Nosetests" />
-    <option name="PROJECT_TEST_RUNNER" value="Nosetests" />
-  </component>
-</module>
--- a/.idea/encodings.xml
+++ b/.idea/encodings.xml
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
-</project>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="JavaScriptSettings">
-    <option name="languageLevel" value="ES6" />
-  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.4 (/opt/local/bin/python)" project-jdk-type="Python SDK" />
-</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectModuleManager">
-    <modules>
-      <module fileurl="file://$PROJECT_DIR$/.idea/asmbench.iml" filepath="$PROJECT_DIR$/.idea/asmbench.iml" />
-    </modules>
-  </component>
-</project>
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -1,469 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ChangeListManager">
-    <list default="true" id="ce9d0a71-6676-44f6-88f0-52583274be24" name="Default" comment="">
-      <change beforePath="$PROJECT_DIR$/.idea/vcs.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/vcs.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/doc/asmbench-SC18SRC-poster/sc18-src-poster.ai" beforeDir="false" afterPath="$PROJECT_DIR$/doc/asmbench-SC18SRC-poster/sc18-src-poster.ai" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/tablegen.py" beforeDir="false" afterPath="$PROJECT_DIR$/tablegen.py" afterDir="false" />
-    </list>
-    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
-    <option name="SHOW_DIALOG" value="false" />
-    <option name="HIGHLIGHT_CONFLICTS" value="true" />
-    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
-    <option name="LAST_RESOLUTION" value="IGNORE" />
-  </component>
-  <component name="FileEditorManager">
-    <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
-      <file pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/asmbench/oldjit.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="12135">
-              <caret line="820" column="33" selection-start-line="820" selection-start-column="33" selection-end-line="820" selection-end-column="33" />
-              <folding>
-                <element signature="e#23#36#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/asmbench/sc18src.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="120">
-              <caret line="17" column="21" selection-start-line="17" selection-start-column="21" selection-end-line="17" selection-end-column="21" />
-              <folding>
-                <element signature="e#23#41#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/tablegen.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="1350">
-              <caret line="100" column="28" selection-start-line="100" selection-start-column="28" selection-end-line="100" selection-end-column="28" />
-              <folding>
-                <element signature="e#24#34#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file pinned="false" current-in-tab="true">
-        <entry file="file://$PROJECT_DIR$/asmbench/streams.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="1230">
-              <caret line="82" lean-forward="true" selection-start-line="82" selection-end-line="82" />
-              <folding>
-                <element signature="e#24#42#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/asmbench/op.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="3900">
-              <caret line="260" column="35" selection-start-line="260" selection-start-column="35" selection-end-line="260" selection-end-column="35" />
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/asmbench/bench.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="45">
-              <caret line="3" column="15" selection-start-line="3" selection-end-line="4" />
-            </state>
-          </provider>
-        </entry>
-      </file>
-    </leaf>
-  </component>
-  <component name="FileTemplateManagerImpl">
-    <option name="RECENT_TEMPLATES">
-      <list>
-        <option value="Setup Script" />
-        <option value="Python Script" />
-      </list>
-    </option>
-  </component>
-  <component name="FindInProjectRecents">
-    <findStrings>
-      <find>build_ir</find>
-      <find>combined_instructions</find>
-      <find>random</find>
-      <find>serial</find>
-      <find>IntegerLoopBenchmark</find>
-      <find>latenchy</find>
-      <find>iaca_markers</find>
-      <find>setup_llvm</find>
-      <find>get_iaca_analysis</find>
-      <find>get_target_machine</find>
-      <find>foo</find>
-      <find>instructions_ret_type</find>
-      <find>iaca</find>
-      <find>get_registers</find>
-      <find>sop_t</find>
-      <find>AddressGenerationBenchmark</find>
-      <find>lea</find>
-      <find>,)</find>
-      <find>prepare_arguments</find>
-      <find>VSUBSSrr</find>
-      <find>build_and_execute</find>
-      <find>jit.</find>
-      <find>fn</find>
-      <find>asmjit</find>
-      <find>ValueError</find>
-      <find>split_llvm_vector_type</find>
-      <find>get_default_init_values</find>
-      <find>llvm</find>
-      <find>self.init_val</find>
-      <find>i64</find>
-    </findStrings>
-    <replaceStrings>
-      <replace>generate_register_nameing</replace>
-      <replace>naming</replace>
-      <replace>iaca_marker</replace>
-      <replace>jit.AddressGenerationBenchmark</replace>
-      <replace>)</replace>
-      <replace>oldjit.</replace>
-      <replace>asmbench</replace>
-    </replaceStrings>
-  </component>
-  <component name="Git.Settings">
-    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
-  </component>
-  <component name="IdeDocumentHistory">
-    <option name="CHANGED_PATHS">
-      <list>
-        <option value="$PROJECT_DIR$/debug_avx_feature.py" />
-        <option value="$PROJECT_DIR$/op.py" />
-        <option value="$PROJECT_DIR$/bench.py" />
-        <option value="$PROJECT_DIR$/asmjit/__init__.py" />
-        <option value="$PROJECT_DIR$/jit.py" />
-        <option value="$PROJECT_DIR$/tablegen.py" />
-        <option value="$PROJECT_DIR$/dev_test/reproduce.py" />
-        <option value="$PROJECT_DIR$/asmjit/__main__.py" />
-        <option value="$PROJECT_DIR$/asmjit/op.py" />
-        <option value="$PROJECT_DIR$/README.md" />
-        <option value="$PROJECT_DIR$/asmjit/bench.py" />
-        <option value="$PROJECT_DIR$/run_SC18_SRC.py" />
-        <option value="$PROJECT_DIR$/asmjit/sc18src.py" />
-        <option value="$PROJECT_DIR$/README.md" />
-        <option value="$PROJECT_DIR$/doc/sc18src_artifact_appendix.md" />
-        <option value="$PROJECT_DIR$/README.rst" />
-        <option value="$PROJECT_DIR$/MANIFEST.in" />
-        <option value="$PROJECT_DIR$/setup.py" />
-        <option value="$PROJECT_DIR$/setup.py" />
-        <option value="$PROJECT_DIR$/asmbench/bench.py" />
-        <option value="$PROJECT_DIR$/asmbench/op.py" />
-        <option value="$APPLICATION_CONFIG_DIR$/scratches/scratch.py" />
-        <option value="$PROJECT_DIR$/asmbench/streams.py" />
-      </list>
-    </option>
-  </component>
-  <component name="ProjectFrameBounds">
-    <option name="x" value="971" />
-    <option name="y" value="-1669" />
-    <option name="width" value="1241" />
-    <option name="height" value="1669" />
-  </component>
-  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
-  <component name="ProjectView">
-    <navigator proportions="" version="1">
-      <foldersAlwaysOnTop value="true" />
-    </navigator>
-    <panes>
-      <pane id="Scope" />
-      <pane id="ProjectPane">
-        <subPane>
-          <expand>
-            <path>
-              <item name="asmbench" type="b2602c69:ProjectViewProjectNode" />
-              <item name="asmbench" type="462c0819:PsiDirectoryNode" />
-            </path>
-            <path>
-              <item name="asmbench" type="b2602c69:ProjectViewProjectNode" />
-              <item name="asmbench" type="462c0819:PsiDirectoryNode" />
-              <item name="asmbench" type="462c0819:PsiDirectoryNode" />
-            </path>
-          </expand>
-          <select />
-        </subPane>
-      </pane>
-    </panes>
-  </component>
-  <component name="PropertiesComponent">
-    <property name="WebServerToolWindowFactoryState" value="false" />
-    <property name="com.intellij.ide.scratch.LRUPopupBuilder$1/New Scratch File" value="Python" />
-    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
-    <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
-    <property name="nodejs_npm_path_reset_for_default_project" value="true" />
-    <property name="run.code.analysis.last.selected.profile" value="pProject Default" />
-    <property name="settings.editor.selected.configurable" value="editor.preferences.completion" />
-  </component>
-  <component name="PyConsoleOptionsProvider">
-    <option name="myPythonConsoleState">
-      <console-settings is-module-sdk="true">
-        <option name="myUseModuleSdk" value="true" />
-      </console-settings>
-    </option>
-  </component>
-  <component name="RecentsManager">
-    <key name="MoveFile.RECENT_KEYS">
-      <recent name="$PROJECT_DIR$/asmjit" />
-    </key>
-  </component>
-  <component name="RunDashboard">
-    <option name="ruleStates">
-      <list>
-        <RuleState>
-          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
-        </RuleState>
-        <RuleState>
-          <option name="name" value="StatusDashboardGroupingRule" />
-        </RuleState>
-      </list>
-    </option>
-  </component>
-  <component name="SvnConfiguration">
-    <configuration />
-  </component>
-  <component name="TaskManager">
-    <task active="true" id="Default" summary="Default task">
-      <changelist id="ce9d0a71-6676-44f6-88f0-52583274be24" name="Default" comment="" />
-      <created>1528185911695</created>
-      <option name="number" value="Default" />
-      <option name="presentableId" value="Default" />
-      <updated>1528185911695</updated>
-      <workItem from="1549014553562" duration="10478000" />
-      <workItem from="1549470823118" duration="191000" />
-      <workItem from="1549577395449" duration="719000" />
-      <workItem from="1549629861489" duration="622000" />
-      <workItem from="1549636051326" duration="400000" />
-      <workItem from="1550675127118" duration="4866000" />
-      <workItem from="1553613650758" duration="756000" />
-    </task>
-    <servers />
-  </component>
-  <component name="TimeTrackingManager">
-    <option name="totallyTimeSpent" value="18032000" />
-  </component>
-  <component name="ToolWindowManager">
-    <frame x="971" y="-1669" width="1241" height="1669" extended-state="0" />
-    <editor active="true" />
-    <layout>
-      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25771475" />
-      <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
-      <window_info id="Favorites" order="2" side_tool="true" />
-      <window_info anchor="bottom" id="Message" order="0" />
-      <window_info anchor="bottom" id="Find" order="1" weight="0.32980832" />
-      <window_info anchor="bottom" id="Run" order="2" />
-      <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
-      <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
-      <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
-      <window_info anchor="bottom" id="TODO" order="6" />
-      <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
-      <window_info anchor="bottom" id="Database Changes" order="8" show_stripe_button="false" />
-      <window_info anchor="bottom" id="Terminal" order="9" />
-      <window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
-      <window_info anchor="bottom" id="Version Control" order="11" />
-      <window_info anchor="bottom" id="Messages" order="12" />
-      <window_info anchor="bottom" id="Python Console" order="13" />
-      <window_info active="true" anchor="bottom" id="Inspection Results" order="14" visible="true" weight="0.32980832" />
-      <window_info anchor="right" id="Commander" order="0" weight="0.4" />
-      <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
-      <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
-      <window_info anchor="right" id="SciView" order="3" />
-      <window_info anchor="right" id="Database" order="4" />
-    </layout>
-    <layout-to-restore>
-      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.23436196" />
-      <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
-      <window_info id="Favorites" order="2" side_tool="true" />
-      <window_info anchor="bottom" id="Message" order="0" />
-      <window_info anchor="bottom" id="Find" order="1" weight="0.32980832" />
-      <window_info anchor="bottom" id="Run" order="2" />
-      <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
-      <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
-      <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
-      <window_info anchor="bottom" id="TODO" order="6" />
-      <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
-      <window_info anchor="bottom" id="Version Control" order="8" />
-      <window_info anchor="bottom" id="Database Changes" order="9" show_stripe_button="false" />
-      <window_info anchor="bottom" id="Python Console" order="10" />
-      <window_info anchor="bottom" id="Terminal" order="11" />
-      <window_info anchor="bottom" id="Event Log" order="12" side_tool="true" />
-      <window_info anchor="bottom" id="Messages" order="13" />
-      <window_info active="true" anchor="bottom" id="Inspection Results" order="14" visible="true" weight="0.32980832" />
-      <window_info anchor="right" id="Commander" order="0" weight="0.4" />
-      <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
-      <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
-      <window_info anchor="right" id="SciView" order="3" />
-      <window_info anchor="right" id="Database" order="4" />
-    </layout-to-restore>
-  </component>
-  <component name="TypeScriptGeneratedFilesManager">
-    <option name="version" value="1" />
-  </component>
-  <component name="XDebuggerManager">
-    <breakpoint-manager>
-      <breakpoints>
-        <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
-          <url>file://$PROJECT_DIR$/asmbench/bench.py</url>
-          <line>1</line>
-          <option name="timeStamp" value="3" />
-        </line-breakpoint>
-      </breakpoints>
-    </breakpoint-manager>
-  </component>
-  <component name="editorHistoryManager">
-    <entry file="file://$PROJECT_DIR$/debug_avx_feature.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="30">
-          <caret line="2" selection-start-line="2" selection-end-line="2" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/dev_test/reproduce.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="705">
-          <caret line="46" column="33" selection-start-line="46" selection-start-column="33" selection-end-line="46" selection-end-column="33" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_skylapesp2.txt" />
-    <entry file="file://$PROJECT_DIR$/SC18_SRC_skylapesp2.txt">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="135">
-          <caret line="9" column="43" lean-forward="true" selection-start-line="9" selection-start-column="43" selection-end-line="9" selection-end-column="43" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_naples1.txt" />
-    <entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_skylakesp2.txt" />
-    <entry file="file://$PROJECT_DIR$/README.md" />
-    <entry file="file://$PROJECT_DIR$/MANIFEST.in">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="15">
-          <caret line="1" column="18" selection-start-line="1" selection-start-column="18" selection-end-line="1" selection-end-column="18" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/doc/sc18src_artifact_appendix.md">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="15">
-          <caret line="1" lean-forward="true" selection-start-line="1" selection-end-line="1" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/README.rst">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="105">
-          <caret line="7" selection-start-line="7" selection-end-line="7" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/setup.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="420">
-          <caret line="28" lean-forward="true" selection-start-line="28" selection-end-line="28" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/build/lib/asmbench/bench.py" />
-    <entry file="file://$PROJECT_DIR$/build/lib/asmjit/bench.py" />
-    <entry file="file://$PROJECT_DIR$/asmbench/__init__.py">
-      <provider selected="true" editor-type-id="text-editor" />
-    </entry>
-    <entry file="file://$APPLICATION_CONFIG_DIR$/scratches/scratch.py">
-      <provider selected="true" editor-type-id="text-editor" />
-    </entry>
-    <entry file="file://$PROJECT_DIR$/asmbench/__main__.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="255">
-          <caret line="21" column="21" selection-start-line="21" selection-start-column="21" selection-end-line="21" selection-end-column="21" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/run_SC18_SRC.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="-2084">
-          <caret line="2" column="16" selection-start-line="2" selection-start-column="16" selection-end-line="2" selection-end-column="16" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/asmbench/oldjit.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="12135">
-          <caret line="820" column="33" selection-start-line="820" selection-start-column="33" selection-end-line="820" selection-end-column="33" />
-          <folding>
-            <element signature="e#23#36#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/asmbench/sc18src.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="120">
-          <caret line="17" column="21" selection-start-line="17" selection-start-column="21" selection-end-line="17" selection-end-column="21" />
-          <folding>
-            <element signature="e#23#41#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/tablegen.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="1350">
-          <caret line="100" column="28" selection-start-line="100" selection-start-column="28" selection-end-line="100" selection-end-column="28" />
-          <folding>
-            <element signature="e#24#34#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/asmbench/op.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="3900">
-          <caret line="260" column="35" selection-start-line="260" selection-start-column="35" selection-end-line="260" selection-end-column="35" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/asmbench/bench.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="45">
-          <caret line="3" column="15" selection-start-line="3" selection-end-line="4" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/asmbench/streams.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="1230">
-          <caret line="82" lean-forward="true" selection-start-line="82" selection-end-line="82" />
-          <folding>
-            <element signature="e#24#42#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-  </component>
-  <component name="masterDetails">
-    <states>
-      <state key="ScopeChooserConfigurable.UI">
-        <settings>
-          <splitter-proportions>
-            <option name="proportions">
-              <list>
-                <option value="0.2" />
-              </list>
-            </option>
-          </splitter-proportions>
-        </settings>
-      </state>
-    </states>
-  </component>
-</project>
--- a/SC18_SRC_naples1.txt
+++ b/SC18_SRC_naples1.txt
--- a/SC18_SRC_skylapesp2.txt
+++ b/SC18_SRC_skylapesp2.txt
@@ -1,261 +0,0 @@
-ADD32ri           LAT 1.001 cy  TP 0.293 cy
-ADD64ri32         LAT 1.001 cy  TP 0.295 cy
-INC64r            LAT 1.000 cy  TP 0.314 cy
-MOV64ri32         LAT 0.535 cy  TP 0.354 cy
-SUB32ri           LAT 1.001 cy  TP 0.330 cy
-VADDPDYrr         LAT 4.002 cy  TP 0.523 cy
-VADDSDrr          LAT 4.002 cy  TP 0.523 cy
-VADDSSrr          LAT 4.002 cy  TP 0.523 cy
-VCVTSI642SSrr     LAT 2.001 cy  TP 2.001 cy
-VFMADD213PDYr     LAT 4.002 cy  TP 0.523 cy
-VFMADD213PDr      LAT 4.002 cy  TP 0.523 cy
-VFMADD213PSYr     LAT 4.002 cy  TP 0.523 cy
-VFMADD213PSr      LAT 4.002 cy  TP 0.523 cy
-VFMADD213SDr      LAT 4.002 cy  TP 0.523 cy
-VFMADD213SSr      LAT 4.002 cy  TP 0.523 cy
-VINSERTF128rr     LAT 3.001 cy  TP 1.000 cy
-VMULPDYrr         LAT 4.002 cy  TP 0.523 cy
-VMULSDrr          LAT 4.002 cy  TP 0.523 cy
-VMULSSrr          LAT 4.002 cy  TP 0.523 cy
-VSUBSDrr          LAT 4.002 cy  TP 0.523 cy
-VSUBSSrr          LAT 4.002 cy  TP 0.523 cy
-lea_b            LAT 0.600 cy  TP 0.550 cy
-lea_b+off        LAT 0.600 cy  TP 0.550 cy
-lea_idx*w        LAT 0.600 cy  TP 0.550 cy
-lea_off+idx*w    LAT 0.600 cy  TP 0.550 cy
-lea_b+idx*w      LAT 1.000 cy  TP 0.601 cy
-lea_b+off+idx*w  LAT 3.001 cy  TP 1.000 cy
-LD_linear        LAT 2.006 cy  TP 0.502 cy
-LD_random        LAT 2.006 cy  TP 0.502 cy
-ADD32ri          ADD32ri           LAT 1.086 cy  TP 0.614 cy  SPM  1.09
-ADD32ri          ADD64ri32         LAT 1.086 cy  TP 0.614 cy  SPM  1.09
-ADD32ri          INC64r            LAT 1.086 cy  TP 0.629 cy  SPM  1.08
-ADD32ri          MOV64ri32         LAT 1.000 cy  TP 0.603 cy  SPM  0.85
-ADD32ri          SUB32ri           LAT 1.086 cy  TP 0.614 cy  SPM  0.97
-ADD32ri          VADDPDYrr         LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VADDSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VADDSSrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VCVTSI642SSrr     LAT 2.001 cy  TP 2.001 cy  SPM  0.00
-ADD32ri          VFMADD213PDYr     LAT 4.002 cy  TP 0.581 cy  SPM  0.20
-ADD32ri          VFMADD213PDr      LAT 4.002 cy  TP 0.582 cy  SPM  0.20
-ADD32ri          VFMADD213PSYr     LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VFMADD213PSr      LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VFMADD213SDr      LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VFMADD213SSr      LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VINSERTF128rr     LAT 3.001 cy  TP 1.000 cy  SPM -0.00
-ADD32ri          VMULPDYrr         LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VMULSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VMULSSrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VSUBSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD32ri          VSUBSSrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        ADD64ri32         LAT 1.086 cy  TP 0.611 cy  SPM  1.07
-ADD64ri32        INC64r            LAT 1.086 cy  TP 0.605 cy  SPM  0.99
-ADD64ri32        MOV64ri32         LAT 1.000 cy  TP 0.578 cy  SPM  0.76
-ADD64ri32        SUB32ri           LAT 1.086 cy  TP 0.611 cy  SPM  0.95
-ADD64ri32        VADDPDYrr         LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VADDSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VADDSSrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VCVTSI642SSrr     LAT 2.001 cy  TP 1.000 cy  SPM -3.39
-ADD64ri32        VFMADD213PDYr     LAT 4.002 cy  TP 0.581 cy  SPM  0.20
-ADD64ri32        VFMADD213PDr      LAT 4.002 cy  TP 0.581 cy  SPM  0.20
-ADD64ri32        VFMADD213PSYr     LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VFMADD213PSr      LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VFMADD213SDr      LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VFMADD213SSr      LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VINSERTF128rr     LAT 3.002 cy  TP 1.001 cy  SPM  0.00
-ADD64ri32        VMULPDYrr         LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VMULSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VMULSSrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VSUBSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-ADD64ri32        VSUBSSrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.23
-INC64r           INC64r            LAT 1.086 cy  TP 0.611 cy  SPM  0.95
-INC64r           MOV64ri32         LAT 1.000 cy  TP 0.588 cy  SPM  0.74
-INC64r           SUB32ri           LAT 1.086 cy  TP 0.609 cy  SPM  0.89
-INC64r           VADDPDYrr         LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VADDSDrr          LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VADDSSrr          LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VCVTSI642SSrr     LAT 2.001 cy  TP 1.000 cy  SPM -3.19
-INC64r           VFMADD213PDYr     LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VFMADD213PDr      LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VFMADD213PSYr     LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VFMADD213PSr      LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VFMADD213SDr      LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VFMADD213SSr      LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VINSERTF128rr     LAT 3.001 cy  TP 1.000 cy  SPM  0.00
-INC64r           VMULPDYrr         LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VMULSDrr          LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VMULSSrr          LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VSUBSDrr          LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-INC64r           VSUBSSrr          LAT 4.002 cy  TP 0.564 cy  SPM  0.13
-MOV64ri32        MOV64ri32         LAT 0.657 cy  TP 0.578 cy  SPM  0.63
-MOV64ri32        SUB32ri           LAT 1.000 cy  TP 0.578 cy  SPM  0.68
-MOV64ri32        VADDPDYrr         LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VADDSDrr          LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VADDSSrr          LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VCVTSI642SSrr     LAT 2.001 cy  TP 1.001 cy  SPM -2.83
-MOV64ri32        VFMADD213PDYr     LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VFMADD213PDr      LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VFMADD213PSYr     LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VFMADD213PSr      LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VFMADD213SDr      LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VFMADD213SSr      LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VINSERTF128rr     LAT 3.001 cy  TP 1.001 cy  SPM  0.00
-MOV64ri32        VMULPDYrr         LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VMULSDrr          LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VMULSSrr          LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VSUBSDrr          LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-MOV64ri32        VSUBSSrr          LAT 4.002 cy  TP 0.557 cy  SPM  0.10
-SUB32ri          SUB32ri           LAT 1.086 cy  TP 0.611 cy  SPM  0.85
-SUB32ri          VADDPDYrr         LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VADDSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VADDSSrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VCVTSI642SSrr     LAT 2.001 cy  TP 1.000 cy  SPM -3.03
-SUB32ri          VFMADD213PDYr     LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VFMADD213PDr      LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VFMADD213PSYr     LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VFMADD213PSr      LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VFMADD213SDr      LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VFMADD213SSr      LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VINSERTF128rr     LAT 3.001 cy  TP 1.000 cy  SPM -0.00
-SUB32ri          VMULPDYrr         LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VMULSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VMULSSrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VSUBSDrr          LAT 4.002 cy  TP 0.592 cy  SPM  0.21
-SUB32ri          VSUBSSrr          LAT 4.002 cy  TP 0.961 cy  SPM  1.33
-VADDPDYrr        VADDPDYrr         LAT 4.002 cy  TP 1.036 cy  SPM  0.98
-VADDPDYrr        VADDSDrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VADDPDYrr        VADDSSrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VADDPDYrr        VCVTSI642SSrr     LAT 4.002 cy  TP 2.001 cy  SPM -0.00
-VADDPDYrr        VFMADD213PDYr     LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDPDYrr        VFMADD213PDr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDPDYrr        VFMADD213PSYr     LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDPDYrr        VFMADD213PSr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDPDYrr        VFMADD213SDr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDPDYrr        VFMADD213SSr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDPDYrr        VINSERTF128rr     LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VADDPDYrr        VMULPDYrr         LAT 4.002 cy  TP 1.036 cy  SPM  0.98
-VADDPDYrr        VMULSDrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VADDPDYrr        VMULSSrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VADDPDYrr        VSUBSDrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VADDPDYrr        VSUBSSrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VADDSDrr         VADDSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VADDSDrr         VADDSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VADDSDrr         VCVTSI642SSrr     LAT 4.002 cy  TP 2.001 cy  SPM  0.00
-VADDSDrr         VFMADD213PDYr     LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSDrr         VFMADD213PDr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSDrr         VFMADD213PSYr     LAT 4.002 cy  TP 1.030 cy  SPM  0.97
-VADDSDrr         VFMADD213PSr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSDrr         VFMADD213SDr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSDrr         VFMADD213SSr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSDrr         VINSERTF128rr     LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VADDSDrr         VMULPDYrr         LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VADDSDrr         VMULSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VADDSDrr         VMULSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VADDSDrr         VSUBSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VADDSDrr         VSUBSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VADDSSrr         VADDSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VADDSSrr         VCVTSI642SSrr     LAT 4.002 cy  TP 2.001 cy  SPM  0.00
-VADDSSrr         VFMADD213PDYr     LAT 4.002 cy  TP 1.030 cy  SPM  0.97
-VADDSSrr         VFMADD213PDr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSSrr         VFMADD213PSYr     LAT 4.002 cy  TP 1.030 cy  SPM  0.97
-VADDSSrr         VFMADD213PSr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSSrr         VFMADD213SDr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSSrr         VFMADD213SSr      LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VADDSSrr         VINSERTF128rr     LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VADDSSrr         VMULPDYrr         LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VADDSSrr         VMULSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VADDSSrr         VMULSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VADDSSrr         VSUBSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VADDSSrr         VSUBSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VCVTSI642SSrr    VCVTSI642SSrr     LAT 4.002 cy  TP 4.002 cy  SPM  1.00
-VCVTSI642SSrr    VFMADD213PDYr     LAT 4.002 cy  TP 2.001 cy  SPM -0.00
-VCVTSI642SSrr    VFMADD213PDr      LAT 4.002 cy  TP 2.001 cy  SPM -0.00
-VCVTSI642SSrr    VFMADD213PSYr     LAT 4.002 cy  TP 2.001 cy  SPM -0.00
-VCVTSI642SSrr    VFMADD213PSr      LAT 4.002 cy  TP 2.001 cy  SPM  0.00
-VCVTSI642SSrr    VFMADD213SDr      LAT 4.002 cy  TP 2.001 cy  SPM -0.00
-VCVTSI642SSrr    VFMADD213SSr      LAT 4.002 cy  TP 2.147 cy  SPM  0.28
-VCVTSI642SSrr    VINSERTF128rr     LAT 3.002 cy  TP 3.001 cy  SPM  1.00
-VCVTSI642SSrr    VMULPDYrr         LAT 4.002 cy  TP 2.001 cy  SPM -0.00
-VCVTSI642SSrr    VMULSDrr          LAT 4.002 cy  TP 2.001 cy  SPM  0.00
-VCVTSI642SSrr    VMULSSrr          LAT 4.002 cy  TP 2.001 cy  SPM -0.00
-VCVTSI642SSrr    VSUBSDrr          LAT 4.002 cy  TP 2.001 cy  SPM  0.00
-VCVTSI642SSrr    VSUBSSrr          LAT 4.002 cy  TP 2.001 cy  SPM -0.00
-VFMADD213PDYr    VFMADD213PDYr     LAT 4.002 cy  TP 1.047 cy  SPM  1.00
-VFMADD213PDYr    VFMADD213PDr      LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VFMADD213PDYr    VFMADD213PSYr     LAT 4.002 cy  TP 1.047 cy  SPM  1.00
-VFMADD213PDYr    VFMADD213PSr      LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VFMADD213PDYr    VFMADD213SDr      LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VFMADD213PDYr    VFMADD213SSr      LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VFMADD213PDYr    VINSERTF128rr     LAT 4.002 cy  TP 1.001 cy  SPM  0.00
-VFMADD213PDYr    VMULPDYrr         LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PDYr    VMULSDrr          LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VFMADD213PDYr    VMULSSrr          LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VFMADD213PDYr    VSUBSDrr          LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VFMADD213PDYr    VSUBSSrr          LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VFMADD213PDr     VFMADD213PDr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213PDr     VFMADD213PSYr     LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VFMADD213PDr     VFMADD213PSr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213PDr     VFMADD213SDr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213PDr     VFMADD213SSr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213PDr     VINSERTF128rr     LAT 4.002 cy  TP 0.675 cy  SPM -0.62
-VFMADD213PDr     VMULPDYrr         LAT 4.002 cy  TP 1.026 cy  SPM  0.96
-VFMADD213PDr     VMULSDrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PDr     VMULSSrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PDr     VSUBSDrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PDr     VSUBSSrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PSYr    VFMADD213PSYr     LAT 4.002 cy  TP 1.047 cy  SPM  1.00
-VFMADD213PSYr    VFMADD213PSr      LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VFMADD213PSYr    VFMADD213SDr      LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VFMADD213PSYr    VFMADD213SSr      LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VFMADD213PSYr    VINSERTF128rr     LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VFMADD213PSYr    VMULPDYrr         LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PSYr    VMULSDrr          LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VFMADD213PSYr    VMULSSrr          LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VFMADD213PSYr    VSUBSDrr          LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VFMADD213PSYr    VSUBSSrr          LAT 4.002 cy  TP 1.029 cy  SPM  0.97
-VFMADD213PSr     VFMADD213PSr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213PSr     VFMADD213SDr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213PSr     VFMADD213SSr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213PSr     VINSERTF128rr     LAT 4.002 cy  TP 0.675 cy  SPM -0.62
-VFMADD213PSr     VMULPDYrr         LAT 4.002 cy  TP 1.026 cy  SPM  0.96
-VFMADD213PSr     VMULSDrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PSr     VMULSSrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PSr     VSUBSDrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213PSr     VSUBSSrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213SDr     VFMADD213SDr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213SDr     VFMADD213SSr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213SDr     VINSERTF128rr     LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VFMADD213SDr     VMULPDYrr         LAT 4.002 cy  TP 1.026 cy  SPM  0.96
-VFMADD213SDr     VMULSDrr          LAT 4.002 cy  TP 1.156 cy  SPM  1.21
-VFMADD213SDr     VMULSSrr          LAT 4.002 cy  TP 1.156 cy  SPM  1.21
-VFMADD213SDr     VSUBSDrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213SDr     VSUBSSrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213SSr     VFMADD213SSr      LAT 4.002 cy  TP 1.046 cy  SPM  1.00
-VFMADD213SSr     VINSERTF128rr     LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VFMADD213SSr     VMULPDYrr         LAT 4.002 cy  TP 1.026 cy  SPM  0.96
-VFMADD213SSr     VMULSDrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213SSr     VMULSSrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213SSr     VSUBSDrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VFMADD213SSr     VSUBSSrr          LAT 4.002 cy  TP 1.028 cy  SPM  0.97
-VINSERTF128rr    VINSERTF128rr     LAT 3.001 cy  TP 2.001 cy  SPM  1.00
-VINSERTF128rr    VMULPDYrr         LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VINSERTF128rr    VMULSDrr          LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VINSERTF128rr    VMULSSrr          LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VINSERTF128rr    VSUBSDrr          LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VINSERTF128rr    VSUBSSrr          LAT 4.002 cy  TP 1.000 cy  SPM -0.00
-VMULPDYrr        VMULPDYrr         LAT 4.002 cy  TP 1.036 cy  SPM  0.98
-VMULPDYrr        VMULSDrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VMULPDYrr        VMULSSrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VMULPDYrr        VSUBSDrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VMULPDYrr        VSUBSSrr          LAT 4.002 cy  TP 1.045 cy  SPM  1.00
-VMULSDrr         VMULSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VMULSDrr         VMULSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VMULSDrr         VSUBSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VMULSDrr         VSUBSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VMULSSrr         VMULSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VMULSSrr         VSUBSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VMULSSrr         VSUBSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VSUBSDrr         VSUBSDrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-VSUBSDrr         VSUBSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.98
-VSUBSSrr         VSUBSSrr          LAT 4.002 cy  TP 1.038 cy  SPM  0.99
-[1;34m[likwid-pin] Main PID -> core 0 - OK[0m
--- a/SC18_SRC_summitridge1.txt
+++ b/SC18_SRC_summitridge1.txt
--- a/pycache/tablegen.cpython-35.pyc
+++ b/pycache/tablegen.cpython-35.pyc
--- a/a.out
+++ b/a.out
--- a/a.out.dSYM/Contents/Info.plist
+++ b/a.out.dSYM/Contents/Info.plist
@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-	<dict>
-		<key>CFBundleDevelopmentRegion</key>
-		<string>English</string>
-		<key>CFBundleIdentifier</key>
-		<string>com.apple.xcode.dsym.a.out</string>
-		<key>CFBundleInfoDictionaryVersion</key>
-		<string>6.0</string>
-		<key>CFBundlePackageType</key>
-		<string>dSYM</string>
-		<key>CFBundleSignature</key>
-		<string>????</string>
-		<key>CFBundleShortVersionString</key>
-		<string>1.0</string>
-		<key>CFBundleVersion</key>
-		<string>1</string>
-	</dict>
-</plist>
--- a/a.out.dSYM/Contents/Resources/DWARF/a.out
+++ b/a.out.dSYM/Contents/Resources/DWARF/a.out
--- a/asmbench.egg-info/PKG-INFO
+++ b/asmbench.egg-info/PKG-INFO
@@ -1,29 +0,0 @@
-Metadata-Version: 2.1
-Name: asmbench
-Version: 0.1.4
-Summary: A Benchmark Toolkit for Assembly Instructions Using the LLVM JIT
-Home-page: https://github.com/RRZE-HPC/asmbench
-Author: Julian Hammer
-Author-email: julian.hammer@fau.de
-License: AGPLv3
-Description: asmbench
-        ========
-        
-        A benchmark toolkit for assembly instructions using the LLVM JIT.
-        
-        Usage
-        =====
-        
-        To benchmark latency and throughput of a 64bit integer add use the following command:
-        
-        ``python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}'``
-        
-        To benchmark two instructions interleaved use this:
-        
-        ``python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}' 'sub {src:i64:r}, {srcdst:i64:r}'``
-        
-        To find out more add `-h` for help and `-v` for verbose mode.
-        
-Platform: UNKNOWN
-Provides-Extra: iaca
-Provides-Extra: sc18src
--- a/asmbench.egg-info/SOURCES.txt
+++ b/asmbench.egg-info/SOURCES.txt
@@ -1,17 +0,0 @@
-LICENSE
-MANIFEST.in
-README.rst
-setup.py
-asmbench/__init__.py
-asmbench/__main__.py
-asmbench/bench.py
-asmbench/oldjit.py
-asmbench/op.py
-asmbench/sc18src.py
-asmbench/streams.py
-asmbench.egg-info/PKG-INFO
-asmbench.egg-info/SOURCES.txt
-asmbench.egg-info/dependency_links.txt
-asmbench.egg-info/entry_points.txt
-asmbench.egg-info/requires.txt
-asmbench.egg-info/top_level.txt
--- a/asmbench.egg-info/dependency_links.txt
+++ b/asmbench.egg-info/dependency_links.txt
@@ -1 +0,0 @@
-
--- a/asmbench.egg-info/entry_points.txt
+++ b/asmbench.egg-info/entry_points.txt
@@ -1,3 +0,0 @@
-[console_scripts]
-asmbench = asmbench.__main__:main
-
--- a/asmbench.egg-info/requires.txt
+++ b/asmbench.egg-info/requires.txt
@@ -1,9 +0,0 @@
-llvmlite>=0.23.2
-psutil
-
-[iaca]
-kerncraft
-
-[sc18src]
-numpy
-matplotlib
--- a/asmbench.egg-info/top_level.txt
+++ b/asmbench.egg-info/top_level.txt
@@ -1 +0,0 @@
-asmbench
--- a/asmbench/pycache/init.cpython-35.pyc
+++ b/asmbench/pycache/init.cpython-35.pyc
--- a/asmbench/pycache/init.cpython-37.pyc
+++ b/asmbench/pycache/init.cpython-37.pyc
--- a/asmbench/pycache/main.cpython-35.pyc
+++ b/asmbench/pycache/main.cpython-35.pyc
--- a/asmbench/pycache/main.cpython-37.pyc
+++ b/asmbench/pycache/main.cpython-37.pyc
--- a/asmbench/pycache/bench.cpython-35.pyc
+++ b/asmbench/pycache/bench.cpython-35.pyc
--- a/asmbench/pycache/bench.cpython-37.pyc
+++ b/asmbench/pycache/bench.cpython-37.pyc
--- a/asmbench/pycache/op.cpython-35.pyc
+++ b/asmbench/pycache/op.cpython-35.pyc
--- a/asmbench/pycache/op.cpython-37.pyc
+++ b/asmbench/pycache/op.cpython-37.pyc
--- a/asmbench/bench.py
+++ b/asmbench/bench.py
@@ -12,9 +12,9 @@ import sys
 import llvmlite.binding as llvm
 import psutil
 try:
-    from kerncraft import incode_model
+    from kerncraft import iaca
 except ImportError:
-    incode_model = None
+    iaca = None

 from . import op

@@ -87,13 +87,13 @@ class Benchmark:

    def get_iaca_analysis(self, arch):
        """Compile and return IACA analysis."""
-        if incode_model is None:
+        if iaca is None:
            raise ValueError("kerncraft not installed. IACA analysis is not supported.")
        tm = self.get_target_machine()
        tmpf = tempfile.NamedTemporaryFile("wb")
        tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
        tmpf.flush()
-        return incode_model.iaca_analyse_instrumented_binary(tmpf.name, arch)
+        return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        # Compile the module to machine code using MCJIT
--- a/asmbench/streams.py
+++ b/asmbench/streams.py
@@ -1,82 +0,0 @@
-#!/usr/bin/env python3
-
-import collections
-import itertools
-import socket
-import textwrap
-
-import numpy
-import matplotlib.pyplot as plt
-import matplotlib as mpl
-
-from asmbench import op, bench
-from asmbench import oldjit
-
-
-type_size = {
-    'i32': 4,
-    'i64': 8,
-    'f32': 4,
-    'float': 4,
-    'f64': 8,
-    'double': 8,
-}
-
-
-class StreamsBenchmark(bench.Benchmark):
-    def __init__(self,
-                 read_streams=0, read_write_streams=0, write_streams=0,
-                 stream_byte_length=0,
-                 element_type='i64'):
-        super().__init__()
-        self.read_streams = read_streams
-        self.read_write_streams = read_write_streams
-        self.write_streams = write_streams
-        self.stream_byte_length = stream_byte_length
-        self.element_type = element_type
-
-    def build_ir(self, iaca_marker=False):
-        if iaca_marker:
-            iaca_start_marker = textwrap.dedent('''\
-                call void asm "movl    $$111,%ebx", ""()
-                call void asm ".byte   100,103,144", ""()''')
-            iaca_stop_marker = textwrap.dedent('''\
-                call void asm "movl    $$222,%ebx", ""()
-                call void asm ".byte   100,103,144", ""()''')
-        else:
-            iaca_start_marker = ''
-            iaca_stop_marker = ''
-
-        ir = textwrap.dedent('''\
-            define i64 @"test"(i64 %"N"{pointer_arguments})
-            {{
-            entry:
-              %"loop_cond" = icmp slt i64 0, %"N"
-              br i1 %"loop_cond", label %"loop", label %"end"
-
-            loop:
-              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-            {iaca_start_marker}
-            {loop_body}
-              %"loop_counter.1" = add i64 %"loop_counter", 1
-              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-              br i1 %"loop_cond.1", label %"loop", label %"end"
-
-            end:
-              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
-            {iaca_stop_marker}
-              ret i64 %"ret"
-            }}
-            ''').format(
-            pointer_arguments='',
-            loop_body='',
-            iaca_start_marker=iaca_start_marker,
-            iaca_stop_marker=iaca_stop_marker)
-
-        return ir
-
-if __name__ == '__main__':
-    bench.setup_llvm()
-    sb = StreamsBenchmark()
-    print(sb.build_and_execute())
-
--- a/build/lib/asmbench/init.py
+++ b/build/lib/asmbench/init.py
@@ -1 +0,0 @@
-__version__ = '0.1.4'
--- a/build/lib/asmbench/main.py
+++ b/build/lib/asmbench/main.py
@@ -1,48 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-
-import psutil
-import llvmlite.binding as llvm
-
-from . import op, bench
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Assembly Instruction Benchmark Toolkit')
-    # parser.add_argument('mode', metavar='MODE', type=str, choices=['latency', 'throughput'])
-    parser.add_argument('instructions', metavar='INSTR', type=op.Instruction.from_string, nargs='+',
-                        help='instruction declaration, e.g., "add {src:i32:r}, {srcdst:i32:r}"')
-    parser.add_argument('--serialize', action='store_true',
-                        help='Serialize instructions.')
-    parser.add_argument('--latency-serial', '-l', type=int, default=8,
-                         help='length of serial chain for each instruction in latency benchmark')
-    parser.add_argument('--parallel', '-p',type=int, default=10,
-                        help='number of parallel instances of serial chains in throughput '
-                             'benchmark')
-    parser.add_argument('--throughput-serial', '-t', type=int, default=8,
-                        help='length of serial instances of serial chains in throughput benchmark')
-    parser.add_argument('--iaca', type=str, default=None,
-                        help='Compare throughput measurement with IACA analysis, pass '
-                             'micro-architecuture abbreviation. (i.e. SNB, IVB, HSW, SKL, SKX)')
-    parser.add_argument("--verbose", "-v", action="count", default=0,
-                        help="increase output verbosity")
-    parser.add_argument('-f', '--frequency', type=float, required=psutil.cpu_freq() is None,
-                        help='Provided (in GHz), if psutil.cpu_freq() does report anything.')
-    args = parser.parse_args()
-    if args.frequency:
-        args.frequency *= 1e9
-
-    bench.setup_llvm()
-    lat, tp = bench.bench_instructions(args.instructions,
-                                       serial_factor=args.latency_serial,
-                                       parallel_factor=args.parallel,
-                                       throughput_serial_factor=args.throughput_serial,
-                                       serialize=args.serialize,
-                                       verbosity=args.verbose,
-                                       iaca_comparison=args.iaca,
-                                       frequency=args.frequency)
-    print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))
-
-
-if __name__ == "__main__":
-    main()
--- a/build/lib/asmbench/bench.py
+++ b/build/lib/asmbench/bench.py
@@ -1,399 +0,0 @@
-#!/usr/bin/env python3
-import ctypes
-import time
-import textwrap
-import itertools
-import re
-from pprint import pprint
-import tempfile
-import subprocess
-import sys
-
-import llvmlite.binding as llvm
-import psutil
-try:
-    from kerncraft import iaca
-except ImportError:
-    iaca = None
-
-from . import op
-
-
-def setup_llvm():
-    llvm.initialize()
-    llvm.initialize_native_target()
-    llvm.initialize_native_asmprinter()
-    llvm.initialize_native_asmparser()
-
-
-def uniquify(l):
-    # Uniquify list while preserving order
-    seen = set()
-    return [x for x in l if x not in seen and not seen.add(x)]
-
-
-class Benchmark:
-    def __init__(self, frequency=None):
-        self.frequency = frequency or psutil.cpu_freq().max * 1e6
-
-    def __repr__(self):
-        return '{}({})'.format(
-            self.__class__.__name__,
-            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
-                       if not k.startswith('_')]))
-
-    @staticmethod
-    def prepare_arguments(previous_args=None, time_factor=1.0):
-        """Build argument tuple, to be passed to low level function."""
-        if previous_args is None:
-            return 10000000,
-        else:
-            try:
-                return int(previous_args[0] * time_factor),
-            except OverflowError:
-                return previous_args[0]*10,
-
-    @staticmethod
-    def get_iterations(args) -> int:
-        """Return number of iterations performed, based on lower level function arguments."""
-        return args[0]
-
-    def build_ir(self):
-        raise NotImplementedError()
-
-    def get_llvm_module(self, iaca_marker=False):
-        """Build and return LLVM module from LLVM IR code."""
-        ir = self.build_ir(iaca_marker=iaca_marker)
-        return llvm.parse_assembly(ir)
-
-    def get_target_machine(self):
-        """Instantiate and return target machine."""
-        features = llvm.get_host_cpu_features().flatten()
-        cpu = '' # llvm.get_host_cpu_name()  # Work around until ryzen problems are fixed
-        return llvm.Target.from_default_triple().create_target_machine(
-             cpu=cpu, features=features, opt=3)
-
-    def get_assembly(self, iaca_marker=False):
-        """Compile and return assembly from LLVM module."""
-        tm = self.get_target_machine()
-        tm.set_asm_verbosity(0)
-        asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
-        # Remove double comments
-        asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
-        return asm
-
-    def get_function_ctype(self):
-        return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
-
-    def get_iaca_analysis(self, arch):
-        """Compile and return IACA analysis."""
-        if iaca is None:
-            raise ValueError("kerncraft not installed. IACA analysis is not supported.")
-        tm = self.get_target_machine()
-        tmpf = tempfile.NamedTemporaryFile("wb")
-        tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
-        tmpf.flush()
-        return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)
-
-    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
-        # Compile the module to machine code using MCJIT
-        tm = self.get_target_machine()
-        runtimes = []
-        return_values = []
-        args = self.prepare_arguments()
-        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
-            ee.finalize_object()
-
-            # Obtain a pointer to the compiled 'sum' - it's the address of its JITed
-            # code in memory.
-            cfptr = ee.get_function_address('test')
-
-            # To convert an address to an actual callable thing we have to use
-            # CFUNCTYPE, and specify the arguments & return type.
-            cfunc = self.get_function_ctype()(cfptr)
-
-            # Now 'cfunc' is an actual callable we can invoke
-            # TODO replace time.clock with a C implemententation for less overhead
-            # TODO return result in machine readable format
-            fixed_args = False
-            for i in range(repeat):
-                tries = 0
-                while True:
-                    if tries > 10:
-                        raise RuntimeError("Unable to measure non-zero runtime.")
-                    tries += 1
-                    start = time.perf_counter()
-                    ret = cfunc(*args)
-                    end = time.perf_counter()
-                    elapsed = end - start
-                    if ret != args[0]-1:
-                        raise RuntimeError(
-                            "Return value {} is invalid, should have been {}.".format(ret, args[0]-1))
-                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
-                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
-                        factor = target_elapsed / elapsed
-                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
-                        continue
-                    else:
-                        # After we have the right argument choice, we keep it.
-                        fixed_args = True
-                        break
-                return_values.append(ret)
-                runtimes.append(elapsed)
-        return {'iterations': self.get_iterations(args),
-                'arguments': args,
-                'runtimes': runtimes,
-                'frequency': self.frequency,
-                'returned': return_values}
-
-
-class LoopBenchmark(Benchmark):
-    def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True, **kwargs):
-        super().__init__(**kwargs)
-        self.root_synth = root_synth
-        self.init_values = init_values or root_synth.get_default_init_values()
-        self.loop_carried_dependencies = loop_carried_dependencies
-
-        if len(root_synth.get_source_registers()) != len(self.init_values):
-            raise ValueError("Number of init values and source registers do not match.")
-
-    def get_source_names(self):
-        return ['%in.{}'.format(i) for i in range(len(self.root_synth.get_source_registers()))]
-
-    def get_destination_names(self):
-        return ['%out.{}'.format(i) for i in
-                range(len(self.root_synth.get_destination_registers()))]
-
-    def get_phi_code(self):
-        if not self.loop_carried_dependencies:
-            return ''
-        # Compile loop carried dependencies
-        lcd = []
-        # Change in naming (src <-> dst) is on purpose!
-        srcs = self.root_synth.get_destination_registers()
-        dsts = self.root_synth.get_source_registers()
-        # cycle iterator is used to not only reuse a single destination, but go through all of them
-        srcs_it = itertools.cycle(enumerate(srcs))
-        matched = False
-        last_match_idx = len(srcs) - 1
-        for dst_idx, dst in enumerate(dsts):
-            for src_idx, src in srcs_it:
-                if src.llvm_type == dst.llvm_type:
-                    lcd.append([dst,
-                                self.get_source_names()[dst_idx],
-                                self.init_values[dst_idx],
-                                src,
-                                self.get_destination_names()[src_idx]])
-                    matched = True
-                    last_match_idx = src_idx
-                    break
-                # since srcs_it is an infinity iterator, we need to abort after a complete cycle
-                if src_idx == last_match_idx:
-                    break
-        if not matched:
-            raise ValueError("Unable to match source to any destination.")
-
-        code = ''
-        for dst_reg, dst_name, init_value, src_reg, src_name in lcd:
-            assert dst_reg.llvm_type == src_reg.llvm_type, \
-                "Source and destination types do not match"
-            code += ('{dst_name} = phi {llvm_type} [{init_value}, %"entry"], '
-                     '[{src_name}, %"loop"]\n').format(
-                llvm_type=dst_reg.llvm_type,
-                dst_name=dst_name,
-                init_value=init_value,
-                src_name=src_name)
-
-        # Add extra phi for constant values. Assuming LLVM will optimize them "away"
-        for dst_idx, dst in enumerate(dsts):
-            if dst not in [d for d, dn, i, s, sn in lcd]:
-                code += ('{dst_reg} = phi {llvm_type} [{init_value}, %"entry"], '
-                         '[{init_value}, %"loop"]\n').format(
-                    llvm_type=dst.llvm_type,
-                    dst_reg=self.get_source_names()[dst_idx],
-                    init_value=self.init_values[dst_idx])
-
-        return code
-
-    def build_ir(self):
-        raise NotImplementedError()
-
-
-class IntegerLoopBenchmark(LoopBenchmark):
-    def build_ir(self, iaca_marker=False):
-        if iaca_marker:
-            iaca_start_marker = textwrap.dedent('''\
-                call void asm "movl    $$111,%ebx", ""()
-                call void asm ".byte   100,103,144", ""()''')
-            iaca_stop_marker = textwrap.dedent('''\
-                call void asm "movl    $$222,%ebx", ""()
-                call void asm ".byte   100,103,144", ""()''')
-        else:
-            iaca_start_marker = ''
-            iaca_stop_marker = ''
-
-        ir = textwrap.dedent('''\
-            define i64 @"test"(i64 %"N")
-            {{
-            entry:
-              %"loop_cond" = icmp slt i64 0, %"N"
-              br i1 %"loop_cond", label %"loop", label %"end"
-
-            loop:
-              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-            {phi}
-            {iaca_start_marker}
-            {loop_body}
-              %"loop_counter.1" = add i64 %"loop_counter", 1
-              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-              br i1 %"loop_cond.1", label %"loop", label %"end"
-            
-            end:
-              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
-            {iaca_stop_marker}
-              ret i64 %"ret"
-            }}
-            ''').format(
-            loop_body=textwrap.indent(
-                self.root_synth.build_ir(self.get_destination_names(),
-                                         self.get_source_names()), '  '),
-            phi=textwrap.indent(self.get_phi_code(), '  '),
-            iaca_start_marker=iaca_start_marker,
-            iaca_stop_marker=iaca_stop_marker)
-
-        return ir
-
-
-def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
-                       serialize=False, verbosity=0, iaca_comparison=None,
-                       repeat=4, min_elapsed=0.1, max_elapsed=0.2, frequency=None):
-    not_serializable = False
-    try:
-        # Latency Benchmark
-        if verbosity > 0:
-            print('## Latency Benchmark')
-        p_instrs = []
-        if not serialize:
-            for i in instructions:
-                p_instrs.append(op.Serialized([i] * serial_factor))
-        else:
-            p_instrs = [op.Serialized(instructions * serial_factor)]
-        p = op.Parallelized(p_instrs)
-        b = IntegerLoopBenchmark(p, frequency=frequency)
-        if verbosity >= 3:
-            print('### LLVM IR')
-            print(b.build_ir())
-        if verbosity >= 2:
-            print('### Assembly')
-            print(b.get_assembly())
-        if verbosity >= 3:
-            print('### IACA Analysis')
-            try:
-                print(b.get_iaca_analysis('SKL')['output'])
-            except ValueError as e:
-                print("Unable to perform IACA analysis (skipping): ", e)
-            except FileNotFoundError as e:
-                print("IACA binary not found by kerncraft. Run iaca_get to install.", e)
-                
-        result = b.build_and_execute(
-            repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
-        lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations']
-                    for t in result['runtimes']])
-        result['latency'] = lat
-        if verbosity > 0:
-            print('### Detailed Results')
-            pprint(result)
-            print()
-    except op.NotSerializableError as e:
-        print("Latency measurement not possible:", e)
-        not_serializable = True
-
-    if not_serializable:
-        throughput_serial_factor = 1
-        print("WARNING: throughput_serial_factor has be set to 1.")
-
-    # Throughput Benchmark
-    if verbosity > 0:
-        print('## Throughput Benchmark')
-    p_instrs = []
-    if not serialize:
-        for i in instructions:
-            p_instrs.append(op.Serialized([i] * throughput_serial_factor))
-    else:
-        p_instrs = [op.Serialized(instructions * throughput_serial_factor)]
-    p = op.Parallelized(p_instrs * parallel_factor, interleave=True)
-    b = IntegerLoopBenchmark(p, frequency=frequency)
-    if verbosity >= 3:
-        print('### LLVM IR')
-        print(b.build_ir())
-    if verbosity >= 2:
-        print('### Assembly')
-        print(b.get_assembly())
-    if verbosity >= 3:
-        print('### IACA Analysis')
-        try:
-            print(b.get_iaca_analysis('SKL')['output'])
-        except ValueError as e:
-            print("Unable to perform IACA analysis (skipping): ", e)
-        except FileNotFoundError as e:
-            print("IACA binary not found by kerncraft. Run iaca_get to install.", e)
-    result = b.build_and_execute(
-        repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
-    tp = min(
-        [(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
-         for t in result['runtimes']])
-    result['throughput'] = tp
-    if iaca_comparison is not None:
-        iaca_analysis = b.get_iaca_analysis(iaca_comparison)
-        result['iaca throughput'] = iaca_analysis['throughput']/(
-                parallel_factor * throughput_serial_factor)
-    if verbosity > 0:
-        print('### Detailed Results')
-        pprint(result)
-        print()
-    if verbosity > 1 and iaca_comparison is not None:
-        print('### IACA Results')
-        print(iaca_analysis['output'])
-        print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
-            throughput_serial_factor, parallel_factor))
-
-    # Result compilation
-    return lat, tp
-
-
-if __name__ == '__main__':
-    setup_llvm()
-
-    i1 = op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])
-    i2 = op.Instruction(
-        instruction='sub $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])
-    s = op.Serialized([i1, i2])
-    i3 = op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Register('i64', 'r')])
-    i4 = op.Instruction(
-        instruction='sub $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '23')])
-    i5 = op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Immediate('i64', '23')])
-    i6 = op.Instruction(
-        instruction='add $2, $0',
-        destination_operand=op.Register('i64', 'r'),
-        source_operands=[op.Register('i64', '0'), op.Register('i64', 'r')])
-    s1 = op.Serialized([i1, i2])
-    s2 = op.Serialized([s1, i3])
-    s3 = op.Serialized([i4, i5])
-    p1 = op.Parallelized([i6, s2, s3])
-    init_values = ['1' for r in p1.get_source_registers()]
-    b = IntegerLoopBenchmark(p1, init_values)
-    print(b.build_ir())
-    print(b.get_assembly())
--- a/build/lib/asmbench/oldjit.py
+++ b/build/lib/asmbench/oldjit.py
@@ -1,897 +0,0 @@
-#!/usr/bin/env python3
-import ctypes
-import sys
-import time
-import textwrap
-import itertools
-import random
-import collections
-import pprint
-import math
-import argparse
-
-import llvmlite.binding as llvm
-import psutil
-
-
-# TODOs
-# * API to create test scenarios
-#   * DSL?
-# * Test cases:
-#   * Instructions:
-#     * [x] arithmetics \w reg and/or imm.
-#       * scalar
-#       * packed
-#     * [x] lea
-#     * [x] LOAD / mov \w mem
-#     * [TODO] STORE / mov to mem
-#   * [x] Single Latency
-#   * [x] Single Throughput
-#   * [TODO] Combined Throughput
-#   * [TODO] Random Throughput
-# * [TODO] Automated TP, Lat, #pipeline analysis
-# * [TODO] IACA marked binary output generation
-# * [TODO] Fuzzing algorithm
-# * [TODO] CLI
-# * C based timing routine? As an extension?
-# * make sanity checks during runtime, check for fixed frequency and pinning
-
-def floor_harmonic_fraction(n, error=0.1):
-    """
-    Finds closest floored integer or inverse integer and returns error.
-
-    (numerator, denominator, relative error) where either numerator or denominator is exactly one.
-    """
-    floor_n = math.floor(n)
-    if floor_n > 0:
-        return floor_n, 1, 1 - floor_n / n
-    else:
-        i = 2
-        while (1 / i) > n:
-            i += 1
-
-        return 1, i, 1 - (1 / i) / n
-
-
-class Benchmark:
-    def __init__(self, parallel=1, serial=5, frequency=None):
-        self._function_ctype = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
-        self.parallel = parallel
-        self.serial = serial
-        self.frequency = frequency or psutil.cpu_freq().current * 1e6
-
-        # Do interesting work
-        self._loop_body = textwrap.dedent('''\
-            %"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]
-            %"checksum.1" = call i64 asm sideeffect "
-                add $1, $0",
-                "=r,i,r" (i64 1, i64 %"checksum")\
-            ''')
-
-    def __repr__(self):
-        return '{}({})'.format(
-            self.__class__.__name__,
-            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
-                       if not k.startswith('_')]))
-
-    def get_ir(self):
-        # FP add loop - may have issues
-        # return textwrap.dedent('''\
-        #    define i64 @"test"(i64 %"N")
-        #    {{
-        #    entry:
-        #      %"N.fp" = sitofp i64 %"N" to double
-        #      %"loop_cond" = fcmp olt double 0.0, %"N.fp"
-        #      br i1 %"loop_cond", label %"loop", label %"end"
-        #
-        #    loop:
-        #      %"loop_counter" = phi double [0.0, %"entry"], [%"loop_counter.1", %"loop"]
-        #    {loop_body}
-        #      %"loop_counter.1" = fadd double %"loop_counter", 1.0
-        #      %"loop_cond.1" = fcmp olt double %"loop_counter.1", %"N.fp"
-        #      br i1 %"loop_cond.1", label %"loop", label %"end"
-        #
-        #    end:
-        #      %"ret.fp" = phi double [0.0, %"entry"], [%"loop_counter", %"loop"]
-        #      %"ret" = fptosi double %"ret.fp" to i64
-        #      ret i64 %"ret"
-        #    }}
-        #    ''').format(
-        #        loop_body=textwrap.indent(self._loop_body, '  '))
-        return textwrap.dedent('''\
-            define i64 @"test"(i64 %"N")
-            {{
-            entry:
-              %"loop_cond" = icmp slt i64 0, %"N"
-              br i1 %"loop_cond", label %"loop", label %"end"
-
-            loop:
-              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-            {loop_body}
-              %"loop_counter.1" = add i64 %"loop_counter", 1
-              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-              br i1 %"loop_cond.1", label %"loop", label %"end"
-
-            end:
-              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
-              ret i64 %"ret"
-            }}
-            ''').format(
-            loop_body=textwrap.indent(self._loop_body, '  '))
-
-    def prepare_arguments(self, previous_args=None, time_factor=1.0):
-        """Build argument tuple, to be passed to low level function."""
-        if previous_args is None:
-            return 100,
-        else:
-            return int(previous_args[0] * time_factor),
-
-    def get_iterations(self, args):
-        """Return number of iterations performed, based on lower level function arguments."""
-        return args[0]
-
-    def get_llvm_module(self):
-        """Build and return LLVM module from LLVM IR code."""
-        if not hasattr(self, '_llvm_module'):
-            self._llvm_module = llvm.parse_assembly(self.get_ir())
-            self._llvm_module.verify()
-        return self._llvm_module
-
-    def get_target_machine(self):
-        """Instantiate and return target machine."""
-        if not hasattr(self, '_llvm_module'):
-            features = llvm.get_host_cpu_features().flatten()
-            cpu = llvm.get_host_cpu_name()
-            self._tm = llvm.Target.from_default_triple().create_target_machine(
-                cpu=cpu, features=features, opt=1)
-        return self._tm
-
-    def get_assembly(self):
-        """Compile and return assembly from LLVM module."""
-        tm = self.get_target_machine()
-        tm.set_asm_verbosity(0)
-        return tm.emit_assembly(self.get_llvm_module())
-
-    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
-        # Compile the module to machine code using MCJIT
-        tm = self.get_target_machine()
-        runtimes = []
-        args = self.prepare_arguments()
-        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
-            ee.finalize_object()
-
-            # Obtain a pointer to the compiled 'sum' - it's the address of its JITed
-            # code in memory.
-            cfptr = ee.get_function_address('test')
-
-            # To convert an address to an actual callable thing we have to use
-            # CFUNCTYPE, and specify the arguments & return type.
-            cfunc = self._function_ctype(cfptr)
-
-            # Now 'cfunc' is an actual callable we can invoke
-            # TODO replace time.clock with a C implemententation for less overhead
-            # TODO return result in machine readable format
-            fixed_args = False
-            for i in range(repeat):
-                while True:
-                    start = time.perf_counter()
-                    res = cfunc(*args)
-                    end = time.perf_counter()
-                    elapsed = end - start
-                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
-                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
-                        factor = target_elapsed / elapsed
-                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
-                        continue
-                    else:
-                        # After we have the right argument choice, we keep it.
-                        fixed_args = True
-                        break
-
-                runtimes.append(elapsed)
-
-        return {'iterations': self.get_iterations(args),
-                'arguments': args,
-                'runtimes': runtimes,
-                'frequency': self.frequency}
-
-    @classmethod
-    def get_latency(cls, max_serial=6, print_table=False, **kwargs):
-        if print_table:
-            print(' s |' + ''.join([' {:^5}'.format(i) for i in range(1, max_serial)]))
-            print('   | ', end='')
-        serial_runs = []
-        for s in range(1, max_serial):
-            m = cls(serial=s, parallel=1, **kwargs)
-            r = m.build_and_execute(repeat=1)
-            cy_per_it = min(r['runtimes']) * r['frequency'] / (
-                        r['iterations'] * m.parallel * m.serial)
-            if print_table:
-                print('{:.3f} '.format(cy_per_it), end='')
-            sys.stdout.flush()
-
-            serial_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
-
-        if print_table:
-            print()
-            print('LAT: {lat[0]}/{lat[1]}cy (min. error {lat[2]:.1%})'.format(
-                lat=min(serial_runs)[1]))
-
-        return min(serial_runs)[1]
-
-    @classmethod
-    def get_throughput(cls, max_serial=6, max_parallel=17, print_table=False, **kwargs):
-        if print_table:
-            print('s\p |' + ''.join([' {:^5}'.format(i) for i in range(2, max_parallel)]))
-        parallel_runs = []
-        for s in range(1, max_serial):
-            if print_table:
-                print('{:>3} | '.format(s), end='')
-            for p in range(2, max_parallel):
-                m = cls(serial=s, parallel=p, **kwargs)
-                r = m.build_and_execute(repeat=1)
-                cy_per_it = min(r['runtimes']) * r['frequency'] / (
-                            r['iterations'] * m.parallel * m.serial)
-                if print_table:
-                    print('{:.3f} '.format(cy_per_it), end='')
-                sys.stdout.flush()
-                parallel_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
-            if print_table:
-                print()
-
-        if print_table:
-            print('TP: {tp[0]}/{tp[1]}cy (min. error {tp[2]:.1%});'.format(
-                tp=min(parallel_runs)[1]))
-
-        return min(parallel_runs)[1]
-
-
-class InstructionBenchmark(Benchmark):
-    def __init__(self, instruction='addq $1, $0',
-                 dst_operands=(),
-                 dstsrc_operands=(('r', 'i64', '0'),),
-                 src_operands=(('i', 'i64', '1'),),
-                 parallel=10,
-                 serial=4,
-                 **kwargs):
-        """
-        Build LLVM IR for arithmetic instruction benchmark without memory references.
-
-        Currently only one destination (dst) or combined destination and source (dstsrc) operand
-        is allowed. Only instruction's operands ($N) refer to the order of opernads found in
-        dst + dstsrc + src.
-        """
-        Benchmark.__init__(self, parallel=parallel, serial=serial, **kwargs)
-        self.instruction = instruction
-        self.dst_operands = dst_operands
-        self.dstsrc_operands = dstsrc_operands
-        self.src_operands = src_operands
-        self._loop_body = ''
-        if len(dst_operands) + len(dstsrc_operands) != 1:
-            raise NotImplemented("Must have exactly one dst or dstsrc operand.")
-        if not all([op[0] in 'irx'
-                    for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)]):
-            raise NotImplemented("This class only supports register and immediate operands.")
-
-        # Part 1: PHI functions and initializations
-        for i, dstsrc_op in enumerate(dstsrc_operands):
-            # constraint code, llvm type string, initial value
-            if dstsrc_op[0] in 'rx':
-                # register operand
-                for p in range(self.parallel):
-                    self._loop_body += (
-                        '%"dstsrc{index}_{p}" = phi {type} '
-                        '[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n').format(
-                        index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p)
-            else:
-                raise NotImplemented("Operand type in {!r} is not yet supported.".format(dstsrc_op))
-
-        # Part 2: Inline ASM call
-        # Build constraint string from operands
-        constraints = ','.join(
-            ['=' + dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] +
-            [sop[0] for sop in itertools.chain(src_operands)] +
-            ['{}'.format(i + len(dst_operands)) for i in range(len(dstsrc_operands))])
-
-        for i, dstsrc_op in enumerate(dstsrc_operands):
-            # Build instruction from instruction and operands
-            # TODO support multiple dstsrc operands
-            # TODO support dst and dstsrc operands at the same time
-            for p in range(self.parallel):
-                operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands]
-                for j, dop in enumerate(dstsrc_operands):
-                    operands.append('{type} %dstsrc{index}_{p}'.format(type=dop[1], index=j, p=p))
-                args = ', '.join(operands)
-
-                self._loop_body += (
-                    '%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect'
-                    ' "{instruction}", "{constraints}" ({args})\n').format(
-                    index=i,
-                    dst_type=dstsrc_op[1],
-                    instruction='\n'.join([instruction] * self.serial),
-                    constraints=constraints,
-                    args=args,
-                    p=p)
-
-        for i, dst_op in enumerate(dst_operands):
-            # Build instruction from instruction and operands
-            # TODO support multiple dst operands
-            # TODO support dst and dstsrc operands at the same time
-            if self.serial != 1:
-                raise NotImplemented("Serial > 1 and dst operand is not supported.")
-            for p in range(self.parallel):
-                operands = ['{type} {val}'.format(type=sop[1], val=sop[2]) for sop in src_operands]
-                args = ', '.join(operands)
-
-                self._loop_body += (
-                    '%"dst{index}_{p}.out" = call {dst_type} asm sideeffect'
-                    ' "{instruction}", "{constraints}" ({args})\n').format(
-                    index=i,
-                    dst_type=dst_op[1],
-                    instruction=instruction,
-                    constraints=constraints,
-                    args=args,
-                    p=p)
-
-
-class AddressGenerationBenchmark(Benchmark):
-    """Benchmark that emits chains of ``lea`` instructions for one addressing mode."""
-
-    def __init__(self,
-                 offset=('i', 'i64', '0x42'),
-                 base=('r', 'i64', '0'),
-                 index=('r', 'i64', '0'),
-                 width=('i', None, '4'),
-                 destination='base',
-                 parallel=10,
-                 serial=4,
-                 **kwargs):
-        """
-        Benchmark for address generation modes.
-
-        Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r' (register)
-        or 'i' (immediate) and initial_value a string.
-        E.g., ('r', 'i64', '0') or ('i', None, '4')
-
-        +--------------------------------+-----------------------------+
-        | Mode                           | AT&T                        |
-        +--------------------------------+-----------------------------+
-        | Offset                         | leal           0x0100, %eax | <- no latency support
-        | Base                           | leal           (%esi), %eax |
-        | Offset + Base                  | leal         -8(%ebp), %eax |
-        | Offset + Index*Width           | leal   0x100(,%ebx,4), %eax |
-        | Offset + Base + Index*Width    | leal 0x8(%edx,%ebx,4), %eax |
-        +--------------------------------+-----------------------------+
-        OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width
-        offset: immediate integer (+/-)
-        base: register
-        index: register
-        width: immediate 1,2,4 or 8
-
-        *destination* selects which register ('base' or 'index') receives the result and
-        thereby carries the dependency from one ``lea`` to the next.
-        *parallel* is the number of independent chains, *serial* the number of dependent
-        ``lea`` instructions per chain and loop iteration.
-
-        Raises ValueError if the combination of arguments is inconsistent.
-        """
-        Benchmark.__init__(self, parallel=parallel, serial=serial, **kwargs)
-        self.offset = offset
-        self.base = base
-        self.index = index
-        self.width = width
-        self.destination = destination
-        self.parallel = parallel
-        # Sanity checks:
-        if bool(index) ^ bool(width):
-            raise ValueError("Index and width both need to be set, or be None.")
-        elif index and width:
-            if width[0] != 'i' or int(width[2]) not in [1, 2, 4, 8]:
-                raise ValueError("Width may only be immediate 1,2,4 or 8.")
-            if index[0] != 'r':
-                raise ValueError("Index must be a register.")
-
-        if offset and offset[0] != 'i':
-            raise ValueError("Offset must be an immediate.")
-        if base and base[0] != 'r':
-            raise ValueError("Base must be a register.")
-
-        if not index and not width and not offset and not base:
-            raise ValueError("Must provide at least an offset or base.")
-
-        if destination == 'base' and not base:
-            raise ValueError("Destination may only be set to 'base' if base is set.")
-        elif destination == 'index' and not index:
-            raise ValueError("Destination may only be set to 'index' if index is set.")
-        elif destination not in ['base', 'index']:
-            raise ValueError("Destination must be set to 'base' or 'index'.")
-
-        # A register is required to carry the dependency chain. This check also covers the
-        # former "serial > 1 needs base and/or index" test, which could never be reached.
-        if not base and not index:
-            raise ValueError("Either base or index must be set for latency test to work.")
-
-        self._loop_body = ''
-
-        # Assemble the operand string in AT&T syntax: OFFSET(BASE,INDEX,WIDTH), DESTINATION
-        ops = ''
-        if offset:
-            ops += offset[2]
-        if base:
-            ops += '($0'
-            if width and index:
-                ops += ',$1,{}'.format(width[2])
-            ops += ')'
-
-            if destination == 'base':
-                ops += ', $0'
-            else:  # destination == 'index'
-                ops += ', $1'
-        else:
-            if width and index:
-                ops += '(,$0,{}), $0'.format(width[2])
-        ops += ' '
-
-        if destination == 'base':
-            destination_reg = base
-        else:  # destination == 'index'
-            destination_reg = index
-
-        # Part 1: PHI function for destination
-        for p in range(parallel):
-            self._loop_body += (
-                '%"{name}_{p}.0" = '
-                'phi {type} [{initial}, %"entry"], [%"{name}_{p}.{s}", %"loop"]\n').format(
-                name=destination, type=destination_reg[1], initial=destination_reg[2], p=p,
-                s=self.serial)
-
-        # Part 2: one inline assembly call per chain (p) and chain element (s). The result of
-        # element s is the destination input of element s+1.
-        for p in range(parallel):
-            for s in range(self.serial):
-                constraints = '=r,r'
-                if base and index:
-                    constraints += ',r'
-                    # The non-destination register is passed as a constant initial value
-                    if destination == 'base':
-                        args = '{base_type} %"{base_name}_{p}.{s_in}", {index_type} {index_value}'.format(
-                            base_type=base[1], base_name=destination,
-                            index_type=index[1], index_value=index[2], p=p, s_in=s)
-                    else:  # destination == 'index':
-                        args = '{base_type} {base_value}, {index_type} %"{index_name}_{p}.{s_in}"'.format(
-                            base_type=base[1], base_value=base[2],
-                            index_type=index[1], index_name=destination, p=p, s_in=s)
-                else:
-                    args = '{type} %"{name}_{p}.{s_in}"'.format(
-                        type=destination_reg[1], name=destination, p=p, s_in=s)
-
-                self._loop_body += (
-                    '%"{name}_{p}.{s_out}" = call {type} asm sideeffect'
-                    ' "lea {ops}", "{constraints}" ({args})\n').format(
-                    name=destination,
-                    type=destination_reg[1],
-                    ops=ops,
-                    constraints=constraints,
-                    args=args,
-                    p=p,
-                    s_out=s + 1)
-
-
-class LoadBenchmark(Benchmark):
-    # Pointer-chasing load benchmark: a ring of pointers is built in a ctypes array and the
-    # generated LLVM IR function walks that ring.
-    def __init__(self, chain_length=2048, structure='linear', parallel=6, serial=4, **kwargs):
-        """
-        Benchmark for L1 load using pointer chasing.
-
-        *chain_length* is the number of pointers to place in memory.
-        *structure* may be 'linear' (1-offsets) or 'random'.
-        *parallel* is the number of pointer chains followed side by side.
-        *serial* is the number of dependent loads emitted per chain and inner-loop iteration;
-        *chain_length* must be divisible by it.
-
-        Raises ValueError if *chain_length* is not divisible by *serial* or if *structure* is
-        neither 'linear' nor 'random'.
-        """
-        # The base class is initialized with serial=1; the requested serial factor is kept in
-        # self._serial and only used by get_ir().
-        Benchmark.__init__(self, parallel=parallel, serial=1, **kwargs)
-        self._serial = serial
-        self._loop_body = ''
-        element_type = ctypes.POINTER(ctypes.c_int)
-        # Signature of the compiled function: int test(int** ptrf, int repeats)
-        self._function_ctype = ctypes.CFUNCTYPE(
-            ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int)
-        self.chain_length = chain_length
-        self.parallel = parallel
-        self.structure = structure
-        self._pointer_field = (element_type * chain_length)()
-        if chain_length % serial != 0:
-            raise ValueError(
-                "chain_length ({}) needs to be divisible by serial factor ({}).".format(
-                    chain_length, serial))
-
-        # Initialize pointer field
-        # Field must represent a ring of pointers
-        if structure == 'linear':
-            # Element i points to element i+1, the last element points back to the first
-            for i in range(chain_length):
-                self._pointer_field[i] = ctypes.cast(
-                    ctypes.pointer(self._pointer_field[(i + 1) % chain_length]), element_type)
-        elif structure == 'random':
-            # Same ring, but visiting the elements in a shuffled order
-            shuffled_indices = list(range(chain_length))
-            random.shuffle(shuffled_indices)
-            for i in range(chain_length):
-                self._pointer_field[shuffled_indices[i]] = ctypes.cast(
-                    ctypes.pointer(self._pointer_field[shuffled_indices[(i + 1) % chain_length]]),
-                    element_type)
-        else:
-            raise ValueError("Given structure is not supported. Supported are: "
-                             "linear and random.")
-
-    def prepare_arguments(self, previous_args=None, time_factor=1.0):
-        """
-        Build argument tuple, to be passed to low level function.
-
-        Without *previous_args* the pointer field and a repeat count of 100 are returned.
-        Otherwise the previous repeat count is scaled by *time_factor* (truncated to int).
-        """
-        if previous_args is None:
-            return self._pointer_field, 100
-        else:
-            return previous_args[0], int(previous_args[1] * time_factor)
-
-    def get_iterations(self, args):
-        """
-        Return number of iterations performed, based on lower level function arguments.
-
-        This is chain_length multiplied by the repeat count (args[1]).
-        """
-        return self.chain_length * args[1]
-
-    def get_ir(self):
-        """
-        Return LLVM IR equivalent of (in case of parallel == 1 and serial == 1):
-
-        int test(int** ptrf, int repeat) {
-            int** p0 = (int**)ptrf[0];
-            int i = 0;
-            while(i < repeat) {
-                int** p = (int**)*p0;
-                while(p != p0) {
-                    p = (int**)*p;
-                }
-                i++;
-            }
-            return i;
-        }
-        """
-        ret = textwrap.dedent('''
-        define i32 @test(i32** %"ptrf_0", i32 %"repeats") {
-        entry:
-        ''')
-        # Load pointer to ptrf[p] and p0
-        for p in range(self.parallel):
-            if p > 0:
-                ret += '  %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(p=p)
-            ret += (
-                '  %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n'
-                '  %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p)
-
-        # loop1 counts the repeats (outer loop), loop2 follows the pointer chains (inner loop)
-        ret += textwrap.dedent('''
-            %"cmp.entry" = icmp sgt i32 %"repeats", 0
-            br i1 %"cmp.entry", label %"loop0", label %"end"
-
-        loop0:
-            br label %"loop1"
-
-        loop1:
-            %"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ]
-            br label %"loop2"
-
-        loop2:\n''')
-
-        # Each chain starts at its p0 and continues from the last load of the previous iteration
-        for p in range(self.parallel):
-            ret += ('  %"p_{p}.0" = phi i32** '
-                    '[ %"p0_{p}", %"loop1" ], [ %"p_{p}.{s_max}", %"loop2" ]\n').format(
-                p=p, s_max=self._serial)
-
-        # load p, compare to p0 and or-combine results
-        for p in range(self.parallel):
-            # self._serial dependent loads per chain: each load uses the previous result as address
-            for s in range(self._serial):
-                ret += ('  %"pp_{p}.{s}" = bitcast i32** %"p_{p}.{s_prev}" to i32***\n'
-                        '  %"p_{p}.{s}" = load i32**, i32*** %"pp_{p}.{s}", align 8\n').format(
-                    p=p, s=s + 1, s_prev=s)
-
-            # Compare is needed for all registers, for llvm not to remove unused
-            # instructions:
-            ret += '  %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.{s_max}", %"p0_{p}"\n'.format(
-                p=p, s_max=self._serial)
-
-        # TODO tree reduce cmp to make use of all cmp_* values
-
-        # It is sufficient to use only one compare, all others will be eliminated
-        ret += '  br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n'
-
-        # loop3 increments the repeat counter; the function returns 0 if no repeat was requested
-        # and the repeat count otherwise
-        ret += textwrap.dedent('''
-        loop3:
-            %"i.1" = add i32 %"i", 1
-            %"cmp.loop3" = icmp eq i32 %"i.1", %"repeats"
-            br i1 %"cmp.loop3", label %"end", label %"loop1"
-
-        end:
-            %"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ]
-            ret i32 %"ret"
-        }''')
-        return ret
-
-
-if __name__ == '__main__':
-    # Command line interface: the frequency is only mandatory if psutil cannot report it
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-v', '--verbose', action='count', default=0)
-    parser.add_argument('-f', '--frequency', type=float, required=psutil.cpu_freq() is None,
-        help='Provided (in GHz), if psutil.cpu_freq() does not report anything.')
-    args = parser.parse_args()
-    if args.frequency:
-        # Convert GHz to Hz
-        args.frequency *= 1e9
-
-    llvm.initialize()
-    llvm.initialize_native_target()
-    llvm.initialize_native_asmprinter()
-    llvm.initialize_native_asmparser()
-
-    # Benchmarks by display name, kept in definition order
-    modules = collections.OrderedDict()
-
-    # immediate source
-    modules['add i64 r64 LAT'] = InstructionBenchmark(
-        instruction='addq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('i', 'i64', '1'),),
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    # register source
-    modules['add r64 r64 LAT'] = InstructionBenchmark(
-        instruction='addq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('r', 'i64', '1'),),
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    # multiple instructions
-    modules['4xadd i64 r64 LAT'] = InstructionBenchmark(
-        instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('i', 'i64', '1'),),
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    # immediate source
-    modules['add i64 r64 TP'] = InstructionBenchmark(
-        instruction='addq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('i', 'i64', '1'),),
-        parallel=10,
-        serial=5,
-        frequency=args.frequency)
-
-    # register source
-    modules['add r64 r64 TP'] = InstructionBenchmark(
-        instruction='addq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('r', 'i64', '1'),),
-        parallel=10,
-        serial=5,
-        frequency=args.frequency)
-
-    # multiple instructions
-    modules['4xadd i64 r64 TP'] = InstructionBenchmark(
-        instruction='addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0',
-        dst_operands=(),
-        dstsrc_operands=(('r', 'i64', '0'),),
-        src_operands=(('i', 'i64', '1'),),
-        parallel=10,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['lea base LAT'] = AddressGenerationBenchmark(
-        offset=None,
-        base=('r', 'i64', '666'),
-        index=None,
-        width=None,
-        destination='base',
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    modules['lea base+offset LAT'] = AddressGenerationBenchmark(
-        offset=('i', None, '23'),
-        base=('r', 'i64', '666'),
-        index=None,
-        width=None,
-        destination='base',
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    modules['lea index*width LAT'] = AddressGenerationBenchmark(
-        offset=None,
-        base=None,
-        index=('r', 'i64', '1'),
-        width=('i', None, '4'),
-        destination='index',
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    modules['lea offset+index*width LAT'] = AddressGenerationBenchmark(
-        offset=('i', 'i64', '-0x8'),
-        base=None,
-        index=('r', 'i64', '51'),
-        width=('i', None, '4'),
-        destination='index',
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    modules['lea base+index*width LAT'] = AddressGenerationBenchmark(
-        offset=None,
-        base=('r', 'i64', '23'),
-        index=('r', 'i64', '12'),
-        width=('i', None, '4'),
-        destination='base',
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    modules['lea base+offset+index*width LAT'] = AddressGenerationBenchmark(
-        offset=('i', None, '42'),
-        base=('r', 'i64', '23'),
-        index=('r', 'i64', '12'),
-        width=('i', None, '4'),
-        destination='base',
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    modules['lea base TP'] = AddressGenerationBenchmark(
-        offset=None,
-        base=('r', 'i64', '666'),
-        index=None,
-        width=None,
-        destination='base',
-        parallel=10,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['lea base+offset TP'] = AddressGenerationBenchmark(
-        offset=('i', None, '23'),
-        base=('r', 'i64', '666'),
-        index=None,
-        width=None,
-        destination='base',
-        parallel=10,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['lea index*width TP'] = AddressGenerationBenchmark(
-        offset=None,
-        base=None,
-        index=('r', 'i64', '1'),
-        width=('i', None, '4'),
-        destination='index',
-        parallel=10,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['lea offset+index*width TP'] = AddressGenerationBenchmark(
-        offset=('i', 'i64', '-0x8'),
-        base=None,
-        index=('r', 'i64', '51'),
-        width=('i', None, '4'),
-        destination='index',
-        parallel=10,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['lea base+index*width TP'] = AddressGenerationBenchmark(
-        offset=None,
-        base=('r', 'i64', '23'),
-        index=('r', 'i64', '12'),
-        width=('i', None, '4'),
-        destination='base',
-        parallel=10,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['lea base+offset+index*width TP'] = AddressGenerationBenchmark(
-        offset=('i', None, '42'),
-        base=('r', 'i64', '23'),
-        index=('r', 'i64', '12'),
-        width=('i', None, '4'),
-        destination='base',
-        parallel=10,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['LD linear LAT'] = LoadBenchmark(
-        chain_length=2048,  # 2048 * 8B = 16kB
-        structure='linear',
-        parallel=1,
-        serial=8,
-        frequency=args.frequency)
-
-    modules['LD random LAT'] = LoadBenchmark(
-        chain_length=2048,  # 2048 * 8B = 16kB
-        structure='random',
-        parallel=1,
-        serial=8,
-        frequency=args.frequency)
-
-    modules['LD linear TP'] = LoadBenchmark(
-        chain_length=2048,  # 2048 * 8B = 16kB
-        structure='linear',
-        parallel=16,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['LD random TP'] = LoadBenchmark(
-        chain_length=2048,  # 2048 * 8B = 16kB
-        structure='random',
-        parallel=16,
-        serial=1,
-        frequency=args.frequency)
-
-    modules['vaddpd x<4 x double> x<4 x double> x<4 x double> LAT'] = InstructionBenchmark(
-        instruction='vaddpd $1, $0, $0',
-        dst_operands=(),
-        dstsrc_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4))),),
-        src_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4))),),
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) LAT'] = InstructionBenchmark(
-        instruction='vmulpd $1, $0, $0',
-        dst_operands=(),
-        dstsrc_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4))),),
-        src_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4))),),
-        parallel=1,
-        serial=5,
-        frequency=args.frequency)
-
-    # This is actually a TP benchmark with parallel=1, because there are no inter-loop dependencies:
-    modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) TP'] = InstructionBenchmark(
-        instruction='vmulpd $1, $2, $0',
-        dst_operands=(),
-        dstsrc_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4))),),
-        src_operands=(('x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4))),),
-        parallel=10,
-        serial=1,
-        frequency=args.frequency)
-
-    # NOTE: only the load benchmarks (keys starting with 'LD ') are kept and executed below
-    modules = collections.OrderedDict([(k, v) for k,v in modules.items() if k.startswith('LD ')])
-
-    for key, module in modules.items():
-        if args.verbose > 0:
-            print("=== Benchmark")
-            print(repr(module))
-            print("=== LLVM")
-            print(module.get_ir())
-            print("=== Assembly")
-            print(module.get_assembly())
-        r = module.build_and_execute(repeat=3)
-        if args.verbose > 0:
-            print("=== Result")
-            pprint.pprint(r)
-
-        # Best (minimum) runtime converted to cycles, divided by the number of executed
-        # operations (iterations * parallel * serial)
-        cy_per_it = min(r['runtimes']) * r['frequency'] / (
-                    r['iterations'] * module.parallel * module.serial)
-        print('{key:<32} {cy_per_it:.3f} cy/It with {runtime_sum:.4f}s'.format(
-            key=key,
-            cy_per_it=cy_per_it,
-            runtime_sum=sum(r['runtimes'])))
-
-    # InstructionBenchmark.get_latency(
-    #    instruction='vmulpd $1, $0, $0',
-    #    dst_operands=(),
-    #    dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
-    #    src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
-    #                  ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
-    #    print_table=True)
-    # InstructionBenchmark.get_throughput(
-    #    instruction='vmulpd $1, $0, $0',
-    #    dst_operands=(),
-    #    dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
-    #    src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
-    #                  ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
-    #    print_table=True)
-    #
-    # InstructionBenchmark.get_latency(
-    #    instruction='nop',
-    #    dst_operands=(),
-    #    dstsrc_operands=(('r','i8', '0'),),
-    #    src_operands=(),
-    #    print_table=True)
-    # InstructionBenchmark.get_throughput(
-    #    instruction='nop',
-    #    dst_operands=(),
-    #    dstsrc_operands=(('r','i8', '0'),),
-    #    src_operands=(),
-    #    print_table=True)
--- a/build/lib/asmbench/op.py
+++ b/build/lib/asmbench/op.py
@@ -1,514 +0,0 @@
-#!/usr/bin/env python3
-import re
-from itertools import zip_longest
-
-# TODO use abc to force implementation of interface requirements
-
-def _build_init_values():
-    """Return the mapping from LLVM type name to its default initial value string."""
-    values = {}
-    # Integer types all start at 3
-    for bits in ['1', '8', '16', '32', '64']:
-        values['i' + bits] = '3'
-    # LLVM requires floating point constants to have a non-repeating binary representation
-    # See http://llvm.org/docs/LangRef.html#simple-constants for details
-    fp_value = str(1+1/2**10)
-    for fp_type in ['float', 'double', 'fp128']:
-        values[fp_type] = fp_value
-    # For vector-types we reuse the scalar values
-    for scalar_type, scalar_value in list(values.items()):
-        element = scalar_type + ' ' + scalar_value
-        for length in [2, 4, 8, 16, 32, 64]:
-            vector_type = '<{} x {}>'.format(length, scalar_type)
-            values[vector_type] = '<' + ', '.join([element] * length) + '>'
-    return values
-
-
-init_value_by_llvm_type = _build_init_values()
-
-
-class NotSerializableError(Exception):
-    # NOTE(review): not raised anywhere in the visible part of this module; presumably signals
-    # that operations cannot be chained serially -- confirm against the callers.
-    pass
-
-class Operand:
-    """Common base of all operand kinds; stores the operand's LLVM type."""
-
-    def __init__(self, llvm_type):
-        self.llvm_type = llvm_type
-
-    def get_constraint_char(self):
-        """Return the inline assembly constraint character; subclasses must override this."""
-        raise NotImplementedError()
-
-    def __repr__(self):
-        # Attributes with a leading underscore are considered private and are left out
-        public_attributes = []
-        for attr_name, attr_value in self.__dict__.items():
-            if attr_name.startswith('_'):
-                continue
-            public_attributes.append('{}={!r}'.format(attr_name, attr_value))
-        return '{}({})'.format(self.__class__.__name__, ', '.join(public_attributes))
-
-    @staticmethod
-    def from_string(s):
-        """
-        Parse *s* with the first operand type that accepts it.
-
-        Register, Immediate and MemoryReference are tried in that order. Raises ValueError if
-        none of them can parse *s*.
-        """
-        for parse in (Register.from_string, Immediate.from_string, MemoryReference.from_string):
-            try:
-                operand = parse(s)
-            except ValueError:
-                continue
-            return operand
-        raise ValueError("No matching operand type found for '{}'.".format(s))
-
-
-class Immediate(Operand):
-    """Immediate (constant) operand, made up of an LLVM type and a literal value."""
-
-    def __init__(self, llvm_type, value):
-        Operand.__init__(self, llvm_type)
-        self.value = value
-
-    def get_constraint_char(self):
-        """Return the inline assembly constraint character for immediates ('i')."""
-        return 'i'
-
-    @classmethod
-    def from_string(cls, s):
-        """
-        Create Immediate object from string.
-
-        :param s: must have the form: "llvm_type:value", where value is a hexadecimal number
-                  (0x...) or a decimal number with optional sign, fraction and exponent
-        :raises ValueError: if *s* contains no ':' or value is not a valid number
-        """
-        llvm_type, value = s.split(':', 1)
-        value_regex = r'(0x[0-9a-fA-F]+|[+-]?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?)'
-        # The whole value has to match; re.match would accept trailing garbage such as "12abc"
-        if not re.fullmatch(value_regex, value):
-            raise ValueError("Invalid immediate value, must match {!r}".format(value_regex))
-        return cls(llvm_type, value)
-
-
-class MemoryReference(Operand):
-    """
-    offset + base + index*width
-
-    OFFSET(BASE, INDEX, WIDTH) in AT&T assembly
-
-    Possible operand values:
-        offset: immediate integer (+/-)
-        base: register
-        index: register
-        width: immediate 1,2,4 or 8
-    """
-
-    def __init__(self, llvm_type, offset=None, base=None, index=None, width=None):
-        """
-        Store the address components and check that they form a valid combination.
-
-        :raises ValueError: if index and width are not given together, if a component has the
-                            wrong operand type or if no component is given at all
-        """
-        super().__init__(llvm_type)
-        self.offset = offset
-        self.base = base
-        self.index = index
-        self.width = width
-
-        # Sanity checks:
-        if bool(index) ^ bool(width):
-            raise ValueError("Index and width both need to be set, or None.")
-        elif index and width:
-            if not (isinstance(width, Immediate) and int(width.value) in [1, 2, 4, 8]):
-                raise ValueError("Width may only be immediate 1,2,4 or 8.")
-            if not isinstance(index, Register):
-                raise ValueError("Index must be a register.")
-
-        if offset and not isinstance(offset, Immediate):
-            raise ValueError("Offset must be an immediate.")
-        if base and not isinstance(base, Register):
-            raise ValueError("Base must be a register.")
-
-        if not index and not width and not offset and not base:
-            raise ValueError("Must provide at least an offset or base.")
-
-    def get_constraint_char(self):
-        """Return the inline assembly constraint character for memory operands ('m')."""
-        return 'm'
-
-    def get_registers(self):
-        """Yield the registers used for address generation: base first, then index (if set)."""
-        if self.base:
-            yield self.base
-        if self.index:
-            yield self.index
-
-    @classmethod
-    def from_string(cls, s):
-        """
-        Create MemoryReference from string.
-
-        :param s: must start with "*llvm_type:" followed by one or more of the feature
-                  characters o (offset), b (base), i (index) and w (width), e.g. "*i64:ob"
-        :raises ValueError: if *s* does not have this form or the selected features are not a
-                            valid combination (e.g. index without width)
-        """
-        m = re.match(r"\*([^:]+):([obiw]+)", s)
-        if not m:
-            raise ValueError("Invalid format, must match '*llvm_type:[obiw]+'.")
-        else:
-            llvm_type, features = m.groups()
-            # Every requested feature is filled with a fixed placeholder operand
-            offset = None
-            if 'o' in features:
-                offset = Immediate('i32', 8)
-            base = None
-            if 'b' in features:
-                base = Register('i64', 'r')
-            index = None
-            if 'i' in features:
-                index = Register('i64', 'r')
-            width = None
-            if 'w' in features:
-                width = Immediate('i32', 8)
-            return cls(llvm_type, offset=offset, base=base, index=index, width=width)
-
-
-class Register(Operand):
-    """Register operand, made up of an LLVM type and an inline assembly constraint."""
-
-    def __init__(self, llvm_type, constraint_char='r'):
-        super().__init__(llvm_type)
-        self.constraint_char = constraint_char
-
-    def get_constraint_char(self):
-        """Return the constraint this register was created with."""
-        return self.constraint_char
-
-    @classmethod
-    def from_string(cls, s):
-        """
-        Create Register object from string.
-
-        :param s: must have the form: "llvm_type:constraint_char"
-        :raises ValueError: if *s* contains no ':' or constraint_char is not exactly one of the
-                            valid constraint characters
-        """
-        llvm_type, constraint_char = s.split(':', 1)
-        valid_cc = 'rx'
-        # A plain substring test would also accept '' and 'rx', so the length is checked as well
-        if len(constraint_char) != 1 or constraint_char not in valid_cc:
-            raise ValueError("Invalid constraint character, must be one of {!r}".format(valid_cc))
-        return cls(llvm_type, constraint_char)
-
-
-class Synthable:
-    """Interface of everything that can be turned into LLVM IR."""
-
-    def __init__(self):
-        pass
-
-    def build_ir(self, dst_reg_names, src_reg_names, used_registers):
-        """Build the IR string; must be implemented by subclasses."""
-        raise NotImplementedError()
-
-    def get_source_registers(self):
-        """Return the source registers; must be implemented by subclasses."""
-        raise NotImplementedError()
-
-    def get_destination_registers(self):
-        """Return the destination registers; must be implemented by subclasses."""
-        raise NotImplementedError()
-
-    @staticmethod
-    def _get_unused_reg_name(used_registers):
-        """Return the first '%"reg.N"' name not in *used_registers* and add it to that set."""
-        counter = 0
-        candidate = '%"reg.{}"'.format(counter)
-        while candidate in used_registers:
-            counter += 1
-            candidate = '%"reg.{}"'.format(counter)
-        used_registers.add(candidate)
-        return candidate
-
-    def get_default_init_values(self):
-        """
-        Return the default initial value of every source register, in register order.
-
-        Raises ValueError for a register whose LLVM type has no default value.
-        """
-        init_values = []
-        for source_reg in self.get_source_registers():
-            try:
-                value = init_value_by_llvm_type[source_reg.llvm_type]
-            except KeyError:
-                raise ValueError(
-                    "Invalid or unsupported LLVM type {!r}.".format(source_reg.llvm_type))
-            init_values.append(value)
-        return init_values
-
-    def __repr__(self):
-        # Attributes with a leading underscore are considered private and are left out
-        public_attributes = []
-        for attr_name, attr_value in self.__dict__.items():
-            if attr_name.startswith('_'):
-                continue
-            public_attributes.append('{}={!r}'.format(attr_name, attr_value))
-        return '{}({})'.format(self.__class__.__name__, ', '.join(public_attributes))
-
-
-class Operation(Synthable):
-    """Base class for operations (subclassed by Instruction, Load and AddressGeneration)."""
-
-
-class Instruction(Operation):
-    # A single inline assembly instruction with one destination register and a list of source
-    # operands. In the instruction string operands are referenced by index ($0, $1, ...).
-    def __init__(self, instruction, destination_operand, source_operands):
-        super().__init__()
-        self.instruction = instruction
-        self.destination_operand = destination_operand
-        assert isinstance(destination_operand, Register), "Destination needs to be a register."
-        self.source_operands = source_operands
-
-    def get_source_registers(self):
-        """
-        Return list of registers this instruction reads from.
-
-        Of the Register operands only the first one of each LLVM type is returned. For
-        MemoryReference operands their base and index registers are returned.
-        """
-        sop_types = set()
-        sr = []
-        for sop in self.source_operands:
-            if isinstance(sop, Register):
-                if sop.llvm_type not in sop_types:
-                    sop_types.add(sop.llvm_type)
-                    sr.append(sop)
-            elif isinstance(sop, MemoryReference):
-                sr += list(sop.get_registers())
-
-        return sr
-
-    def get_destination_registers(self):
-        """Return list with the destination operand, or an empty list if it is no Register."""
-        if isinstance(self.destination_operand, Register):
-            return [self.destination_operand]
-        else:
-            return []
-
-    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
-        """
-        Build IR string based on in and out operand names and types.
-
-        *dst_reg_names* holds the name the result is assigned to (only the first entry is used),
-        *src_reg_names* the names of the values passed to the source operands.
-        *used_registers* is defaulted here but not used any further by this method.
-        """
-        if used_registers is None:
-            used_registers = set(dst_reg_names + src_reg_names)
-
-        # Build constraint string from operands
-        constraints = ','.join(
-            ['=' + self.destination_operand.get_constraint_char()] +
-            [sop.get_constraint_char() for sop in self.source_operands])
-
-        # Build argument string from operands and register names
-        operands = []
-        # Maps an LLVM type to the index in src_reg_names used for registers of that type
-        sop_types = {}
-        # Index of the next unused entry of src_reg_names
-        i = 0
-        for sop in self.source_operands:
-            if isinstance(sop, Immediate):
-                # Immediates are passed by value and consume no register name
-                operands.append('{type} {repr}'.format(
-                    type=sop.llvm_type,
-                    repr=sop.value))
-            elif isinstance(sop, Register):
-                if sop.llvm_type in sop_types:
-                    # Registers of an already seen type reuse the name of the first one
-                    operands.append('{type} {repr}'.format(
-                        type=sop.llvm_type,
-                        repr=src_reg_names[sop_types[sop.llvm_type]]))
-                else:
-                    sop_types[sop.llvm_type] = i
-                    operands.append('{type} {repr}'.format(
-                        type=sop.llvm_type,
-                        repr=src_reg_names[i]))
-                    i += 1
-            elif isinstance(sop, MemoryReference):
-                operands.append('{type} {repr}'.format(
-                    type=sop.llvm_type,
-                    repr=src_reg_names[i]))
-                i += 1
-            else:
-                raise NotImplementedError("Only register and immediate operands are supported.")
-        args = ', '.join(operands)
-
-        # Build instruction from instruction and operands
-        return ('{dst_reg} = call {dst_type} asm '
-                ' "{instruction}", "{constraints}" ({args})').format(
-            dst_reg=dst_reg_names[0],
-            dst_type=self.destination_operand.llvm_type,
-            instruction=self.instruction,
-            constraints=constraints,
-            args=args)
-
-    @classmethod
-    def from_string(cls, s):
-        """
-        Create Instruction object from string.
-
-        :param s: must have the form:
-                  "asm_instruction_name ({(src|dst|srcdst):llvm_type:constraint_char})+"
-
-        Each "{direction:operand}" placeholder is parsed by Operand.from_string and replaced by
-        its operand index ("$0", "$1", ...). Raises ValueError unless exactly one placeholder
-        has a direction containing "dst".
-        """
-        instruction = s
-        # It is important that the match objects are in reverse order, to allow string replacements
-        # based on original match group locations
-        operands = list(reversed(list(re.finditer(r"\{((?:src|dst)+):([^\}]+)\}", s))))
-        # Destination indices start at 0
-        dst_index = 0
-        # Source indices at "number of destination operands"
-        src_index = ['dst' in o.group(1) for o in operands].count(True)
-
-        dst_ops = []
-        src_ops = []
-        for m in operands:
-            direction, operand_string = m.group(1, 2)
-            operand = Operand.from_string(operand_string)
-            if 'src' in direction and not 'dst' in direction:
-                src_ops.append(operand)
-                # replace with index string
-                instruction = (instruction[:m.start()] + "${}".format(src_index)
-                               + instruction[m.end():])
-                src_index += 1
-            if 'dst' in direction:
-                dst_ops.append(operand)
-                # replace with index string
-                instruction = (instruction[:m.start()] + "${}".format(dst_index)
-                               + instruction[m.end():])
-                if 'src' in direction:
-                    # A combined source/destination additionally becomes a source register whose
-                    # constraint is the index of the destination operand
-                    src_ops.append(Register(operand_string.split(':', 1)[0], str(dst_index)))
-                    src_index += 1
-                dst_index += 1
-
-        if len(dst_ops) != 1:
-            raise ValueError("Instruction supports only single destinations.")
-        return cls(instruction, dst_ops[0], src_ops)
-
-
-class Load(Operation):
-    # Placeholder for a load operation: only stores its parameters, none of the Synthable
-    # methods are implemented yet (see TODO below).
-    def __init__(self, chain_length, structure='linear'):
-        """
-        *chain_length* is the number of pointers to place in memory.
-        *structure* may be 'linear' (1-offsets) or 'random'.
-
-        Neither argument is validated here.
-        """
-        super().__init__()
-        self.chain_length = chain_length
-        self.structure = structure
-    # TODO
-
-
-class AddressGeneration(Operation):
-    # Not implemented yet: the constructor stores its arguments and then always raises
-    # NotImplementedError, so no instance can be created.
-    def __init__(self, offset, base, index, width, destination='base'):
-        super().__init__()
-        self.offset = offset
-        self.base = base
-        self.index = index
-        self.width = width
-        self.destination = destination
-        raise NotImplementedError()
-
-
-class Serialized(Synthable):
-    def __init__(self, synths):
-        super().__init__()
-        self.synths = synths
-        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"
-
-    def get_source_registers(self):
-        if self.synths:
-            return self.synths[0].get_source_registers()
-        else:
-            return []
-
-    def get_destination_registers(self):
-        if self.synths:
-            return self.synths[-1].get_destination_registers()
-        else:
-            return []
-
-    @staticmethod
-    def match(source_registers, destination_registers):
-        """
-        Find maximum number of matches from source (previous destinations) to
-        destination (current source) registers.
-
-        Return list of two-tuples of matches (src_idx, dst_idx)
-        """
-        matched_pairs = []
-        unmatched_dests = set(destination_registers)
-        for dst_idx, dst in enumerate(destination_registers):
-            for src_idx, src in enumerate(source_registers):
-                if src.llvm_type == dst.llvm_type:
-                    matched_pairs.append((src_idx, dst_idx))
-                    unmatched_dests.discard(dst)
-
-        return matched_pairs, unmatched_dests
-
-    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
-        reg_naming_out = []
-        dst_naming = []
-        last_s = None
-        for i, s in enumerate(self.synths):
-            if i == 0:
-                # first source is passed in from outside
-                src_naming = src_reg_names
-            else:
-                # match with previous destinations
-                src_naming = []
-                match = False
-                for src in s.get_source_registers():
-                    # Find matching destination from previous synths
-                    src_match = False
-                    for dst_idx, dst in enumerate(last_s.get_destination_registers()):
-                        if dst.llvm_type == src.llvm_type:
-                            match = src_match = True
-                            src_naming.append(dst_naming[dst_idx])
-                    # If source could not be matched, use constant value instead
-                    if not src_match:
-                        src_naming.append(init_value_by_llvm_type[src.llvm_type])
-                if not match:
-                    raise NotSerializableError("Unable to find match.")
-
-            if i == len(self.synths) - 1:
-                # last destination is passed in from outside
-                dst_naming = dst_reg_names
-            else:
-                # noinspection PyUnusedLocal
-                dst_naming = [self._get_unused_reg_name(used_registers)
-                              for j in s.get_destination_registers()]
-
-            reg_naming_out.append((dst_naming, src_naming))
-            last_s = s
-        return reg_naming_out, used_registers
-
-    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
-        if used_registers is None:
-            used_registers = set(dst_reg_names + src_reg_names)
-        reg_names, used_registers = self.generate_register_naming(
-            dst_reg_names, src_reg_names, used_registers)
-        code = []
-        for s, r in zip(self.synths, reg_names):
-            code.append(s.build_ir(*r, used_registers))
-        return '\n'.join(code)
-
-
-class Parallelized(Synthable):
-    def __init__(self, synths, interleave=False):
-        super().__init__()
-        self.synths = synths
-        self.interleave = interleave
-        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"
-
-    def get_source_registers(self):
-        sources = []
-        for s in self.synths:
-            sources += s.get_source_registers()
-        return sources
-
-    def get_destination_registers(self):
-        destinations = []
-        for s in self.synths:
-            destinations += s.get_destination_registers()
-        return destinations
-
-    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
-        # Split reg_naming among all synths
-        reg_naming_out = []
-        for s in self.synths:
-            n_dsts = len(s.get_destination_registers())
-            n_srcs = len(s.get_source_registers())
-            reg_naming_out.append((dst_reg_names[:n_dsts], src_reg_names[:n_srcs]))
-            dst_reg_names, src_reg_names = (dst_reg_names[n_dsts:], src_reg_names[n_srcs:])
-        return reg_naming_out, used_registers
-
-    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
-        if used_registers is None:
-            used_registers = set(dst_reg_names + src_reg_names)
-        reg_names, used_registers = self.generate_register_naming(
-            dst_reg_names, src_reg_names, used_registers)
-        code = []
-        for s, r in zip(self.synths, reg_names):
-            code.append(s.build_ir(*r, used_registers))
-
-        # Interleave parallelized sequences
-        if self.interleave:
-            code = ['\n'.join(filter(None.__ne__, c))
-                    for c in list(zip_longest(*[c.split('\n') for c in code]))]
-        return '\n'.join(code)
-
-
-if __name__ == '__main__':
-    i1 = Instruction(
-        instruction='add $2, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Immediate('i64', '1')])
-    i2 = Instruction(
-        instruction='sub $2, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Immediate('i64', '1')])
-    i3 = Instruction(
-        instruction='mul $1, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Register('i64', 'r')])
-    i4 = Instruction(
-        instruction='div $2, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Immediate('i64', '23')])
-    i5 = Instruction(
-        instruction='mul $2, $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r'), Immediate('i64', '23')])
-    i6 = Instruction(
-        instruction='inc $0',
-        destination_operand=Register('i64', 'r'),
-        source_operands=[Register('i64', 'r')])
-    s1 = Serialized([i1, i2])
-    s2 = Serialized([s1, i3])
-    print(s1.build_ir(['%out'], ['%in']), '\n')
-    print(s2.build_ir(['%out'], ['%in']), '\n')
-    s3 = Serialized([i4, i5])
-    p1 = Parallelized([i6, s2, s3])
-    print(p1.build_ir(['%out.0', '%out.1', '%out.2'], ['%in.0', '%in.1', '%in.2']), '\n')
-
-    s4 = Serialized([i1, i2, i3, i4, i5, i6])
-    print(s4.build_ir(['%out'], ['%in']), '\n')
-
-    print(Instruction.from_string("add {src:i64:r} {srcdst:i64:r}"))
--- a/build/lib/asmbench/sc18src.py
+++ b/build/lib/asmbench/sc18src.py
@@ -1,243 +0,0 @@
-#!/usr/bin/env python3
-import collections
-import itertools
-import socket
-
-import numpy
-import matplotlib.pyplot as plt
-import matplotlib as mpl
-
-from asmbench import op, bench
-from asmbench import oldjit
-
-
-def jit_based_benchs():
-    modules = collections.OrderedDict()
-    modules['lea_b'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=('r', 'i64', '666'),
-            index=None,
-            width=None,
-            destination='base',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=('r', 'i64', '666'),
-            index=None,
-            width=None,
-            destination='base',
-            parallel=10,
-            serial=1))
-
-    modules['lea_b+off'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', None, '23'),
-            base=('r', 'i64', '666'),
-            index=None,
-            width=None,
-            destination='base',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', None, '23'),
-            base=('r', 'i64', '666'),
-            index=None,
-            width=None,
-            destination='base',
-            parallel=10,
-            serial=1))
-
-    modules['lea_idx*w'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=None,
-            index=('r', 'i64', '1'),
-            width=('i', None, '4'),
-            destination='index',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=None,
-            index=('r', 'i64', '1'),
-            width=('i', None, '4'),
-            destination='index',
-            parallel=10,
-            serial=1))
-
-    modules['lea_off+idx*w'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', 'i64', '-0x8'),
-            base=None,
-            index=('r', 'i64', '51'),
-            width=('i', None, '4'),
-            destination='index',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', 'i64', '-0x8'),
-            base=None,
-            index=('r', 'i64', '51'),
-            width=('i', None, '4'),
-            destination='index',
-            parallel=10,
-            serial=1))
-
-    modules['lea_b+idx*w'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=('r', 'i64', '23'),
-            index=('r', 'i64', '12'),
-            width=('i', None, '4'),
-            destination='base',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=None,
-            base=('r', 'i64', '23'),
-            index=('r', 'i64', '12'),
-            width=('i', None, '4'),
-            destination='base',
-            parallel=10,
-            serial=1))
-
-    modules['lea_b+off+idx*w'] = (
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', None, '42'),
-            base=('r', 'i64', '23'),
-            index=('r', 'i64', '12'),
-            width=('i', None, '4'),
-            destination='base',
-            parallel=1,
-            serial=5),
-        oldjit.AddressGenerationBenchmark(
-            offset=('i', None, '42'),
-            base=('r', 'i64', '23'),
-            index=('r', 'i64', '12'),
-            width=('i', None, '4'),
-            destination='base',
-            parallel=10,
-            serial=1))
-
-    modules['LD_linear'] = (
-        oldjit.LoadBenchmark(
-            chain_length=2048,  # 2048 * 8B = 16kB
-            structure='linear',
-            parallel=1,
-            serial=2),
-        oldjit.LoadBenchmark(
-            chain_length=2048,  # 2048 * 8B = 16kB
-            structure='linear',
-            parallel=4,
-            serial=2))
-
-    modules['LD_random'] = (
-        oldjit.LoadBenchmark(
-            chain_length=2048,  # 2048 * 8B = 16kB
-            structure='random',
-            parallel=1,
-            serial=2),
-        oldjit.LoadBenchmark(
-            chain_length=2048,  # 2048 * 8B = 16kB
-            structure='random',
-            parallel=4,
-            serial=2))
-
-    for name, mods in modules.items():
-        lat_module, tp_module = mods
-        r_lat = lat_module.build_and_execute(repeat=3)
-        cy_per_it_lat = min(r_lat['runtimes']) * r_lat['frequency'] / (
-                    r_lat['iterations'] * lat_module.parallel * lat_module.serial)
-        r_tp = tp_module.build_and_execute(repeat=3)
-        cy_per_it_tp = min(r_tp['runtimes']) * r_tp['frequency'] / (
-                    r_tp['iterations'] * tp_module.parallel * tp_module.serial)
-        print('{key:<16} LAT {cy_per_it_lat:.3f} cy  TP {cy_per_it_tp:.3f} cy'.format(
-            key=name,
-            cy_per_it_lat=cy_per_it_lat,
-            cy_per_it_tp=cy_per_it_tp))
-
-def plot_combined(single_measured, combined_measured):
-    instructions = list(single_measured.keys())
-    d = numpy.ndarray((len(single_measured), len(single_measured)))
-    d.fill(float('nan'))
-    for k, v in combined_measured.items():
-        i1, i2 = [instructions.index(i) for i in [c[0] for c in k]]
-        d[i1, i2] = v[2]
-    cmap = mpl.cm.get_cmap('plasma', 5)
-    cmap.set_bad('w') # default value is 'k'
-    fig = plt.figure(figsize=(10,10))
-    ax1 = fig.add_subplot(111)
-    cax = ax1.imshow(d, interpolation="nearest", cmap=cmap, norm=mpl.colors.Normalize(vmin=-.5, vmax=1.5))
-    ax1.set_xticks(range(len(instructions)))
-    ax1.set_xticklabels(instructions, rotation=90)
-    ax1.set_yticks(range(len(instructions)))
-    ax1.set_yticklabels(instructions)
-    ax1.set_title(socket.gethostname())
-    ax1.grid()
-    cb = fig.colorbar(cax, shrink=0.65)
-    cb.set_ticks([-.5, 0, 1, 1.5])
-    cb.set_ticklabels(['< -0.5', '0.0 (complete overlap)', '1.0 (no overlap)', '> 1.5'])
-    cb.set_label('inverse parallel overlap')
-    fig.tight_layout()
-    plt.show()
-
-
-if __name__ == '__main__':
-    bench.setup_llvm()
-    instructions = [
-        (i[0], i[1], op.Instruction.from_string(i[1]))
-        for i in [
-            ('ADD32ri', 'add {src:i32:1}, {srcdst:i32:r}'),
-            ('ADD64ri32', 'add {src:i32:1}, {srcdst:i64:r}'),
-            ('INC64r', 'inc {srcdst:i64:r}'),
-            ('SUB32ri', 'sub {src:i32:1}, {srcdst:i64:r}'),
-            ('MOV64ri32', 'mov {src:i32:1}, {srcdst:i64:r}'),
-            ('VINSERTF128rr', 'vinsertf128 {src:i8:0}, {src:<2 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
-            ('VCVTSI642SSrr', 'vcvtsi2ss {src:i64:r}, {src:float:x}, {dst:float:x}'),
-            ('VADDPDYrr', 'vaddpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
-            ('VADDSDrr', 'vaddsd {src:double:x}, {src:double:x}, {dst:double:x}'),
-            ('VADDSSrr', 'vaddss {src:float:x}, {src:float:x}, {dst:float:x}'),
-            ('VFMADD213PDYr', 'vfmadd213pd {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
-            ('VFMADD213PDr', 'vfmadd213pd {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
-            ('VFMADD213PSYr', 'vfmadd213ps {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
-            ('VFMADD213PSr', 'vfmadd213ps {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
-            ('VFMADD213SDr', 'vfmadd213sd {src:double:x}, {src:double:x}, {srcdst:double:x}'),
-            ('VFMADD213SSr', 'vfmadd213ss {src:float:x}, {src:float:x}, {srcdst:float:x}'),
-            ('VMULPDYrr', 'vmulpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
-            ('VMULSDrr', 'vmulsd {src:double:x}, {src:double:x}, {dst:double:x}'),
-            ('VMULSSrr', 'vmulss {src:float:x}, {src:float:x}, {dst:float:x}'),
-            ('VSUBSDrr', 'vsubsd {src:double:x}, {src:double:x}, {dst:double:x}'),
-            ('VSUBSSrr', 'vsubss {src:float:x}, {src:float:x}, {dst:float:x}'),
-            ('VDIVPDYrr', 'vdivpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
-            ('VDIVSDrr', 'vdivsd {src:double:x}, {src:double:x}, {dst:double:x}'),
-            ('VDIVSSrr', 'vdivss {src:float:x}, {src:float:x}, {dst:float:x}'),
-            ]
-    ]
-    instructions_measured = collections.OrderedDict()
-    for llvm_name, i_str, i in instructions:
-        lat, tp = bench.bench_instructions(
-            [i],
-            serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
-            verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)
-        print('{:<16}  LAT {:.3f} cy  TP {:.3f} cy'.format(llvm_name, lat, tp))
-        instructions_measured[llvm_name] = (lat, tp)
-
-    jit_based_benchs()
-
-    two_combinations_measured = collections.OrderedDict()
-
-    for a, b in itertools.combinations_with_replacement(instructions, 2):
-        lat, tp = bench.bench_instructions(
-            [a[2], b[2]],
-            serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
-            verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)
-        same_port_metric = ((
-            tp-max(instructions_measured[a[0]][1], instructions_measured[b[0]][1])) /
-            min(instructions_measured[a[0]][1], instructions_measured[b[0]][1]))
-        print('{:<16} {:<16}  LAT {:.3f} cy  TP {:.3f} cy  SPM {:>5.2f}'.format(
-            a[0], b[0], lat, tp, same_port_metric))
-        two_combinations_measured[(a[0], a[1]), (b[0], b[1])] = (lat, tp, same_port_metric)
-
-    plot_combined(instructions_measured, two_combinations_measured)
--- a/build/lib/asmbench/streams.py
+++ b/build/lib/asmbench/streams.py
@@ -1,82 +0,0 @@
-#!/usr/bin/env python3
-
-import collections
-import itertools
-import socket
-import textwrap
-
-import numpy
-import matplotlib.pyplot as plt
-import matplotlib as mpl
-
-from asmbench import op, bench
-from asmbench import oldjit
-
-
-type_size = {
-    'i32': 4,
-    'i64': 8,
-    'f32': 4,
-    'float': 4,
-    'f64': 8,
-    'double': 8,
-}
-
-
-class StreamsBenchmark(bench.Benchmark):
-    def __init__(self,
-                 read_streams=0, read_write_streams=0, write_streams=0,
-                 stream_byte_length=0,
-                 element_type='i64'):
-        super().__init__()
-        self.read_streams = read_streams
-        self.read_write_streams = read_write_streams
-        self.write_streams = write_streams
-        self.stream_byte_length = stream_byte_length
-        self.element_type = element_type
-
-    def build_ir(self, iaca_marker=False):
-        if iaca_marker:
-            iaca_start_marker = textwrap.dedent('''\
-                call void asm "movl    $$111,%ebx", ""()
-                call void asm ".byte   100,103,144", ""()''')
-            iaca_stop_marker = textwrap.dedent('''\
-                call void asm "movl    $$222,%ebx", ""()
-                call void asm ".byte   100,103,144", ""()''')
-        else:
-            iaca_start_marker = ''
-            iaca_stop_marker = ''
-
-        ir = textwrap.dedent('''\
-            define i64 @"test"(i64 %"N"{pointer_arguments})
-            {{
-            entry:
-              %"loop_cond" = icmp slt i64 0, %"N"
-              br i1 %"loop_cond", label %"loop", label %"end"
-
-            loop:
-              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-            {iaca_start_marker}
-            {loop_body}
-              %"loop_counter.1" = add i64 %"loop_counter", 1
-              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-              br i1 %"loop_cond.1", label %"loop", label %"end"
-
-            end:
-              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
-            {iaca_stop_marker}
-              ret i64 %"ret"
-            }}
-            ''').format(
-            pointer_arguments='',
-            loop_body='',
-            iaca_start_marker=iaca_start_marker,
-            iaca_stop_marker=iaca_stop_marker)
-
-        return ir
-
-if __name__ == '__main__':
-    bench.setup_llvm()
-    sb = StreamsBenchmark()
-    print(sb.build_and_execute())
-
--- a/c_api/build.sh
+++ b/c_api/build.sh
@@ -1,3 +0,0 @@
-#!/bin/sh
-clang -g `llvm-config --cflags` test.c -c
-clang++ test.o `llvm-config --cxxflags --ldflags --libs --system-libs all` -o test
--- a/c_api/test
+++ b/c_api/test
--- a/c_api/test.c
+++ b/c_api/test.c
@@ -1,72 +0,0 @@
-/**
- * LLVM equivalent of:
- *
- * int sum(int a, int b) {
- *     return a + b;
- * }
- */
-
-#include <llvm-c/Core.h>
-#include <llvm-c/ExecutionEngine.h>
-#include <llvm-c/Target.h>
-#include <llvm-c/Analysis.h>
-#include <llvm-c/BitWriter.h>
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int main(int argc, char** argv) {
-    LLVMModuleRef mod = LLVMModuleCreateWithName("my_module");
-
-    LLVMTypeRef param_types[] = { LLVMInt32Type(), LLVMInt32Type() };
-    LLVMTypeRef ret_type = LLVMFunctionType(LLVMInt32Type(), param_types, 2, 0);
-    LLVMValueRef sum = LLVMAddFunction(mod, "sum", ret_type);
-
-    LLVMBasicBlockRef entry = LLVMAppendBasicBlock(sum, "entry");
-
-    LLVMBuilderRef builder = LLVMCreateBuilder();
-    LLVMPositionBuilderAtEnd(builder, entry);
-    LLVMValueRef tmp = LLVMBuildAdd(builder, LLVMGetParam(sum, 0), LLVMGetParam(sum, 1), "tmp");
-    LLVMBuildRet(builder, tmp);
-
-    char *error = NULL;
-    LLVMVerifyModule(mod, LLVMAbortProcessAction, &error);
-    LLVMDisposeMessage(error);
-
-    LLVMExecutionEngineRef engine;
-    error = NULL;
-    LLVMLinkInMCJIT();
-    LLVMInitializeNativeTarget();
-    if (LLVMCreateExecutionEngineForModule(&engine, mod, &error) != 0) {
-        fprintf(stderr, "failed to create execution engine\n");
-        abort();
-    }
-    if (error) {
-        fprintf(stderr, "error: %s\n", error);
-        LLVMDisposeMessage(error);
-        exit(EXIT_FAILURE);
-    }
-
-    if (argc < 3) {
-        fprintf(stderr, "usage: %s x y\n", argv[0]);
-        exit(EXIT_FAILURE);
-    }
-    long long x = strtoll(argv[1], NULL, 10);
-    long long y = strtoll(argv[2], NULL, 10);
-
-    LLVMGenericValueRef args[] = {
-        LLVMCreateGenericValueOfInt(LLVMInt32Type(), x, 0),
-        LLVMCreateGenericValueOfInt(LLVMInt32Type(), y, 0)
-    };
-    LLVMGenericValueRef res = LLVMRunFunction(engine, sum, 2, args);
-    printf("%d\n", (int)LLVMGenericValueToInt(res, 0));
-
-    // Write out bitcode to file
-    if (LLVMWriteBitcodeToFile(mod, "sum.bc") != 0) {
-        fprintf(stderr, "error writing bitcode to file, skipping\n");
-    }
-
-    LLVMDisposeBuilder(builder);
-    LLVMDisposeExecutionEngine(engine);
-}
--- a/c_api/test.o
+++ b/c_api/test.o
--- a/debug_avx_feature.py
+++ b/debug_avx_feature.py
@@ -1,37 +0,0 @@
-#!/usr/bin/env python3
-import llvmlite.binding as llvm
-
-llvm.initialize()
-# From
-# >>> cp = (ctypes.c_char_p * 1)()
-# >>> ffi.lib.LLVMPY_GetHostCPUFeatures(cp)
-# >>> print(cp[0])
-# llvm.set_option('', '-mattr=+sse2,+cx16,-tbm,-avx512ifma,-avx512dq,-fma4,+prfchw,+bmi2,+xsavec,+fsgsbase,+popcnt,+aes,+xsaves,-avx512er,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-xop,+rdseed,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vl,-avx512cd,+avx,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,+cmov,-avx512vbmi,+movbe,+xsaveopt,-sha,+adx,-avx512pf,+sse3')
-# llvm.set_option('', '-march=native')
-# llvm.set_option('', '-mcpu=native')
-# llvm.set_option('', '-version')
-# llvm.set_option('', '-help-list-hidden')
-llvm.initialize_native_target()
-llvm.initialize_native_asmprinter()
-llvm.initialize_native_asmparser()
-# llvm.set_option('', '-help-list-hidden')
-
-ir = '''
-
-target triple = "x86_64-apple-darwin17.5.0"
-
-define <4 x double> @testv(i32**, i32) {
-
-  %out = tail call <4 x double> asm "vaddpd $1, $2, $0", "=x,x,x,~{dirflag},~{fpsr},~{flags}"(<4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>, <4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>)
-  ret <4 x double> %out
-}
-'''
-
-module = llvm.parse_assembly(ir)
-module.verify()
-features = llvm.get_host_cpu_features().flatten()
-cpu = llvm.get_host_cpu_name()
-tm = llvm.Target.from_default_triple().create_target_machine(cpu=cpu, features=features)
-with llvm.create_mcjit_compiler(module, tm) as ee:
-    ee.finalize_object()
-    print(tm.emit_assembly(module))
--- a/dev_test/a.out
+++ b/dev_test/a.out
--- a/dev_test/cpu_features.txt
+++ b/dev_test/cpu_features.txt
@@ -1,55 +0,0 @@
-taschenbuch:pyasmjit codemonk$ clang -P - -march=native -### 2>&1|grep -E --color -o -- '"-target-feature" "[^"]+"'
-"-target-feature" "+sse2"
-"-target-feature" "+cx16"
-"-target-feature" "-tbm"
-"-target-feature" "-avx512ifma"
-"-target-feature" "-avx512dq"
-"-target-feature" "-fma4"
-"-target-feature" "+prfchw"
-"-target-feature" "+bmi2"
-"-target-feature" "+xsavec"
-"-target-feature" "+fsgsbase"
-"-target-feature" "+popcnt"
-"-target-feature" "+aes"
-"-target-feature" "+xsaves"
-"-target-feature" "-avx512er"
-"-target-feature" "-avx512vpopcntdq"
-"-target-feature" "-clwb"
-"-target-feature" "-avx512f"
-"-target-feature" "-clzero"
-"-target-feature" "-pku"
-"-target-feature" "+mmx"
-"-target-feature" "-lwp"
-"-target-feature" "-xop"
-"-target-feature" "+rdseed"
-"-target-feature" "-sse4a"
-"-target-feature" "-avx512bw"
-"-target-feature" "+clflushopt"
-"-target-feature" "+xsave"
-"-target-feature" "-avx512vl"
-"-target-feature" "-avx512cd"
-"-target-feature" "+avx"
-"-target-feature" "+rtm"
-"-target-feature" "+fma"
-"-target-feature" "+bmi"
-"-target-feature" "+rdrnd"
-"-target-feature" "-mwaitx"
-"-target-feature" "+sse4.1"
-"-target-feature" "+sse4.2"
-"-target-feature" "+avx2"
-"-target-feature" "+sse"
-"-target-feature" "+lzcnt"
-"-target-feature" "+pclmul"
-"-target-feature" "-prefetchwt1"
-"-target-feature" "+f16c"
-"-target-feature" "+ssse3"
-"-target-feature" "+sgx"
-"-target-feature" "+cmov"
-"-target-feature" "-avx512vbmi"
-"-target-feature" "+movbe"
-"-target-feature" "+xsaveopt"
-"-target-feature" "-sha"
-"-target-feature" "+adx"
-"-target-feature" "-avx512pf"
-"-target-feature" "+sse3"
-taschenbuch:pyasmjit codemonk$
--- a/dev_test/fail.ll
+++ b/dev_test/fail.ll
@@ -1,22 +0,0 @@
-define i64 @"test"(i64 %"N")
-{
-entry:
-  %"loop_cond" = icmp slt i64 0, %"N"
-  br i1 %"loop_cond", label %"loop", label %"end"
-
-loop:
-  %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-  %"in.0" = phi i32 [3, %"entry"], [%"out.0", %"loop"]
-
-
-  %"reg.0" = call i32 asm  "add $2, $0", "=r,0,i" (i32 %"in.0", i32 1)
-  %"out.0" = call i32 asm  "add $2, $0", "=r,0,i" (i32 %"reg.0", i32 1)
-  %"loop_counter.1" = add i64 %"loop_counter", 1
-  %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-  br i1 %"loop_cond.1", label %"loop", label %"end"
-
-end:
-  %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
-
-  ret i64 %"ret"
-}
--- a/dev_test/fail.o
+++ b/dev_test/fail.o
--- a/dev_test/fail.s
+++ b/dev_test/fail.s
@@ -1,35 +0,0 @@
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.globl	_test                   ## -- Begin function test
-	.p2align	4, 0x90
-_test:                                  ## @test
-	.cfi_startproc
-## %bb.0:                               ## %entry
-	testq	%rdi, %rdi
-	jle	LBB0_1
-## %bb.2:                               ## %loop.preheader
-	movl	$3, %ecx
-	movq	$-1, %rdx
-	.p2align	4, 0x90
-LBB0_3:                                 ## %loop
-                                        ## =>This Inner Loop Header: Depth=1
-	## InlineAsm Start
-	addl	$1, %ecx
-	## InlineAsm End
-	leaq	1(%rdx), %rax
-	addq	$2, %rdx
-	cmpq	%rdi, %rdx
-	movq	%rax, %rdx
-	## InlineAsm Start
-	addl	$1, %ecx
-	## InlineAsm End
-	jl	LBB0_3
-## %bb.4:                               ## %end
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-                                        ## -- End function
-
-.subsections_via_symbols
--- a/dev_test/fail_code.ll
+++ b/dev_test/fail_code.ll
@@ -1,67 +0,0 @@
-define i64 @"test"(i64 %"N")
-{
-entry:
-  %"loop_cond" = icmp slt i64 0, %"N"
-  br i1 %"loop_cond", label %"loop", label %"end"
-
-loop:
-  %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-  %in.0 = phi i64 [1, %"entry"], [%out.0, %"loop"]
-  %in.1 = phi i64 [1, %"entry"], [%out.1, %"loop"]
-  %in.2 = phi i64 [1, %"entry"], [%out.2, %"loop"]
-  %in.3 = phi i64 [1, %"entry"], [%out.3, %"loop"]
-  %in.4 = phi i64 [1, %"entry"], [%out.4, %"loop"]
-  %in.5 = phi i64 [1, %"entry"], [%out.5, %"loop"]
-  %in.6 = phi i64 [1, %"entry"], [%out.6, %"loop"]
-  %in.7 = phi i64 [1, %"entry"], [%out.7, %"loop"]
-
-  %"reg.0" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.0, i64 1)
-  %"reg.1" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.0", i64 1)
-  %"reg.2" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.1", i64 1)
-  %"reg.3" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.2", i64 1)
-  %"reg.4" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.3", i64 1)
-  %"reg.5" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.4", i64 1)
-  %"reg.6" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.5", i64 1)
-  %out.0 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.6", i64 1)
-  %"reg.7" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.1, i64 1)
-  %"reg.8" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.7", i64 1)
-  %"reg.9" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.8", i64 1)
-  %"reg.10" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.9", i64 1)
-  %"reg.11" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.10", i64 1)
-  %"reg.12" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.11", i64 1)
-  %"reg.13" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.12", i64 1)
-  %out.1 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.13", i64 1)
-  %"reg.14" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.2, i64 1)
-  %"reg.15" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.14", i64 1)
-  %"reg.16" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.15", i64 1)
-  %"reg.17" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.16", i64 1)
-  %"reg.18" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.17", i64 1)
-  %"reg.19" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.18", i64 1)
-  %"reg.20" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.19", i64 1)
-  %out.2 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.20", i64 1)
-  %"reg.21" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.3, i64 1)
-  %"reg.22" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.21", i64 1)
-  %"reg.23" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.22", i64 1)
-  %"reg.24" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.23", i64 1)
-  %"reg.25" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.24", i64 1)
-  %"reg.26" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.25", i64 1)
-  %"reg.27" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.26", i64 1)
-  %out.3 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.27", i64 1)
-  %"reg.28" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.4, i64 1)
-  %"reg.29" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.28", i64 1)
-  %"reg.30" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.29", i64 1)
-  %"reg.31" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.30", i64 1)
-  %"reg.32" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.31", i64 1)
-  %"reg.33" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.32", i64 1)
-  %out.4 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.33", i64 1)
-  %out.5 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.5, i64 1)
-  %out.6 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.6, i64 1)
-  %out.7 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.7, i64 1)
-  %"loop_counter.1" = add i64 %"loop_counter", 1
-  %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-  br i1 %"loop_cond.1", label %"loop", label %"end"
-
-end:
-  %"ret" = phi i64 [-1, %"entry"], [%"loop_counter", %"loop"]
-  ret i64 %"ret"
-}
--- a/dev_test/fail_code.o
+++ b/dev_test/fail_code.o
--- a/dev_test/fail_main.c
+++ b/dev_test/fail_main.c
@@ -1,6 +0,0 @@
-#include <stdio.h>
-
-int main() {
-    printf("%d\n", test(100));
-    return 0;
-}
--- a/dev_test/fail_test.py
+++ b/dev_test/fail_test.py
@@ -1,95 +0,0 @@
-#!/usr/bin/env python3
-import llvmlite.binding as llvm
-import ctypes
-
-llvm.initialize()
-llvm.initialize_native_target()
-llvm.initialize_native_asmprinter()
-llvm.initialize_native_asmparser()
-
-code = '''define i64 @"test"(i64 %"N")
-{
-entry:
-  %"loop_cond" = icmp slt i64 0, %"N"
-  br i1 %"loop_cond", label %"loop", label %"end"
-
-loop:
-  %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-  %in.0 = phi i64 [1, %"entry"], [%out.0, %"loop"]
-  %in.1 = phi i64 [1, %"entry"], [%out.1, %"loop"]
-  %in.2 = phi i64 [1, %"entry"], [%out.2, %"loop"]
-  %in.3 = phi i64 [1, %"entry"], [%out.3, %"loop"]
-  %in.4 = phi i64 [1, %"entry"], [%out.4, %"loop"]
-  %in.5 = phi i64 [1, %"entry"], [%out.5, %"loop"]
-  %in.6 = phi i64 [1, %"entry"], [%out.6, %"loop"]
-  %in.7 = phi i64 [1, %"entry"], [%out.7, %"loop"]
-
-  %"reg.0" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.0, i64 1)
-  %"reg.1" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.0", i64 1)
-  %"reg.2" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.1", i64 1)
-  %"reg.3" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.2", i64 1)
-  %"reg.4" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.3", i64 1)
-  %"reg.5" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.4", i64 1)
-  %"reg.6" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.5", i64 1)
-  %out.0 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.6", i64 1)
-  %"reg.7" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.1, i64 1)
-  %"reg.8" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.7", i64 1)
-  %"reg.9" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.8", i64 1)
-  %"reg.10" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.9", i64 1)
-  %"reg.11" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.10", i64 1)
-  %"reg.12" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.11", i64 1)
-  %"reg.13" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.12", i64 1)
-  %out.1 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.13", i64 1)
-  %"reg.14" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.2, i64 1)
-  %"reg.15" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.14", i64 1)
-  %"reg.16" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.15", i64 1)
-  %"reg.17" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.16", i64 1)
-  %"reg.18" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.17", i64 1)
-  %"reg.19" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.18", i64 1)
-  %"reg.20" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.19", i64 1)
-  %out.2 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.20", i64 1)
-  %"reg.21" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.3, i64 1)
-  %"reg.22" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.21", i64 1)
-  %"reg.23" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.22", i64 1)
-  %"reg.24" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.23", i64 1)
-  %"reg.25" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.24", i64 1)
-  %"reg.26" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.25", i64 1)
-  %"reg.27" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.26", i64 1)
-  %out.3 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.27", i64 1)
-  %"reg.28" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.4, i64 1)
-  %"reg.29" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.28", i64 1)
-  %"reg.30" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.29", i64 1)
-  %"reg.31" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.30", i64 1)
-  %"reg.32" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.31", i64 1)
-  %"reg.33" = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.32", i64 1)
-  %out.4 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %"reg.33", i64 1)
-  %out.5 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.5, i64 1)
-  %out.6 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.6, i64 1)
-  %out.7 = call i64 asm  "add $2, $0", "=r,0,i" (i64 %in.7, i64 1)
-  %"loop_counter.1" = add i64 %"loop_counter", 1
-  %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-  br i1 %"loop_cond.1", label %"loop", label %"end"
-
-end:
-  %"ret" = phi i64 [-1, %"entry"], [%"loop_counter", %"loop"]
-  ret i64 %"ret"
-}'''
-
-llvm_module = llvm.parse_assembly(code)
-llvm_module.verify()
-tm = llvm.Target.from_default_triple().create_target_machine(
-    features=llvm.get_host_cpu_features().flatten(),
-    cpu=llvm.get_host_cpu_name(),
-    opt=3)
-ee = llvm.create_mcjit_compiler(llvm_module, tm)
-ee.finalize_object()
-cfptr = ee.get_function_address('test')
-cfunc = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)(cfptr)
-N = 100
-ret = cfunc(N)
-
-print(ret)
-if ret == 0:
-    print("FAIL")
-elif ret == N-1:
-    print("Probably good.")
--- a/dev_test/main.bc
+++ b/dev_test/main.bc
--- a/dev_test/main.c
+++ b/dev_test/main.c
@@ -1,6 +0,0 @@
-#include <stdio.h>
-int test(int);
-int main() {
-    printf("%d\n", test(123123123));
-    return 0;
-}
--- a/dev_test/main.s
+++ b/dev_test/main.s
@@ -1,32 +0,0 @@
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.globl	_main                   ## -- Begin function main
-	.p2align	4, 0x90
-_main:                                  ## @main
-	.cfi_startproc
-## %bb.0:
-	pushq	%rbp
-	.cfi_def_cfa_offset 16
-	.cfi_offset %rbp, -16
-	movq	%rsp, %rbp
-	.cfi_def_cfa_register %rbp
-	subq	$16, %rsp
-	movl	$0, -4(%rbp)
-	movl	$123123123, %edi        ## imm = 0x756B5B3
-	callq	_test
-	leaq	L_.str(%rip), %rdi
-	movl	%eax, %esi
-	movb	$0, %al
-	callq	_printf
-	xorl	%eax, %eax
-	addq	$16, %rsp
-	popq	%rbp
-	retq
-	.cfi_endproc
-                                        ## -- End function
-	.section	__TEXT,__cstring,cstring_literals
-L_.str:                                 ## @.str
-	.asciz	"%d\n"
-
-
-.subsections_via_symbols
--- a/dev_test/reproduce.py
+++ b/dev_test/reproduce.py
@@ -1,53 +0,0 @@
-#!/usr/bin/env python3
-import ctypes
-
-import llvmlite.binding as llvm
-
-
-llvm.initialize()
-llvm.initialize_native_target()
-llvm.initialize_native_asmprinter()
-llvm.initialize_native_asmparser()
-
-code = """
-define i64 @"test"(i64 %"N")
-{
-entry:
-  %"loop_cond" = icmp slt i64 0, %"N"
-  br i1 %"loop_cond", label %"loop", label %"end"
-
-loop:
-  %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
-  %"in.0" = phi i32 [3, %"entry"], [%"out.0", %"loop"]
-
-
-  %"reg.0" = call i32 asm  "add $2, $0", "=r,0,i" (i32 %"in.0", i32 1)
-  %"out.0" = call i32 asm  "add $2, $0", "=r,0,i" (i32 %"reg.0", i32 1)
-  %"loop_counter.1" = add i64 %"loop_counter", 1
-  %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
-  br i1 %"loop_cond.1", label %"loop", label %"end"
-
-end:
-  %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
-
-  ret i64 %"ret"
-}
-"""
-
-features = llvm.get_host_cpu_features().flatten()
-# znver1 on naples and skylake-avx512 on skylake-sp
-for cpu in ["skylake-avx512", "znver1"]:
-    tm =  llvm.Target.from_default_triple().create_target_machine(
-        cpu=cpu, opt=2)
-    tm.set_asm_verbosity(0)
-
-    module = llvm.parse_assembly(code)
-    asm = tm.emit_assembly(module)
-    print(asm)
-    with llvm.create_mcjit_compiler(module, tm) as ee:
-        ee.finalize_object()
-        cfptr = ee.get_function_address('test')
-        cfunc = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)(cfptr)
-        print('->', cfunc(100000))
-
-
--- a/dev_test/tblgen_output
+++ b/dev_test/tblgen_output
--- a/dev_test/tblgen_output_ARM
+++ b/dev_test/tblgen_output_ARM
--- a/dist/asmbench-0.1.0.tar.gz
+++ b/dist/asmbench-0.1.0.tar.gz
--- a/dist/asmbench-0.1.1.1.tar.gz
+++ b/dist/asmbench-0.1.1.1.tar.gz
--- a/dist/asmbench-0.1.1.2.tar.gz
+++ b/dist/asmbench-0.1.1.2.tar.gz
--- a/dist/asmbench-0.1.1.3.tar.gz
+++ b/dist/asmbench-0.1.1.3.tar.gz
--- a/dist/asmbench-0.1.1.tar.gz
+++ b/dist/asmbench-0.1.1.tar.gz
--- a/dist/asmbench-0.1.2.tar.gz
+++ b/dist/asmbench-0.1.2.tar.gz
--- a/dist/asmbench-0.1.3.tar.gz
+++ b/dist/asmbench-0.1.3.tar.gz
--- a/dist/asmbench-0.1.4.tar.gz
+++ b/dist/asmbench-0.1.4.tar.gz
--- a/dist/asmjit-0.1.1.tar.gz
+++ b/dist/asmjit-0.1.1.tar.gz
--- a/dist/asmjit-0.1.2.tar.gz
+++ b/dist/asmjit-0.1.2.tar.gz
--- a/dist/asmjit-0.1.tar.gz
+++ b/dist/asmjit-0.1.tar.gz
--- a/doc/asmbench-SC18SRC-poster
+++ b/doc/asmbench-SC18SRC-poster
--- a/dump.bin
+++ b/dump.bin
--- a/min.ll
+++ b/min.ll
@@ -1,6 +0,0 @@
-define <4 x double> @testv(i32**, i32) {
-
-  %out = tail call <4 x double> asm "vaddpd $1, $2, $0", "=x,x,x,~{dirflag},~{fpsr},~{flags}"(<4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>, <4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>)
-  ret <4 x double> %out
-}
-
--- a/min.o
+++ b/min.o
--- a/min.s
+++ b/min.s
@@ -1,21 +0,0 @@
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.section	__TEXT,__literal8,8byte_literals
-	.p2align	3               ## -- Begin function testv
-LCPI0_0:
-	.quad	4593527504729830064     ## 0x3fbf7ced916872b0
-	.section	__TEXT,__text,regular,pure_instructions
-	.globl	_testv
-	.p2align	4, 0x90
-_testv:                                 ## @testv
-	.cfi_startproc
-## BB#0:
-	vbroadcastsd	LCPI0_0(%rip), %ymm0 ## ymm0 = [4593527504729830064,4593527504729830064,4593527504729830064,4593527504729830064]
-	## InlineAsm Start
-	vaddpd	%ymm0, %ymm0, %ymm0
-	## InlineAsm End
-	retq
-	.cfi_endproc
-                                        ## -- End function
-
-.subsections_via_symbols
--- a/random_pf1.txt
+++ b/random_pf1.txt
@@ -1,450 +0,0 @@
-## Selected Instructions
-VPERMILPSri, MULPSrr, ANDPDrr, VPSIGNBrr, PSIGNBrr, PMOVZXWDrr, PMINUWrr, PADDSWrr, VPSHUFHWri, MOVUPDrr
-## Generated Assembly (1x parallel)
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.section	__TEXT,__literal4,4byte_literals
-	.p2align	2
-LCPI0_0:
-	.long	1065361408
-	.section	__TEXT,__text,regular,pure_instructions
-	.globl	_test
-	.p2align	4, 0x90
-_test:
-	.cfi_startproc
-	testq	%rdi, %rdi
-	jle	LBB0_1
-	movabsq	$LCPI0_0, %rax
-	vbroadcastss	(%rax), %xmm0
-	movq	$-1, %rcx
-	.p2align	4, 0x90
-LBB0_3:
-	## InlineAsm Start
-	vpermilps	$1, %xmm0, %xmm0
-	mulps	%xmm0, %xmm0
-	andpd	%xmm0, %xmm0
-	vpsignb	%xmm0, %xmm0, %xmm0
-	psignb	%xmm0, %xmm0
-	pmovzxwd	%xmm0, %xmm0
-	pminuw	%xmm0, %xmm0
-	paddsw	%xmm0, %xmm0
-	vpshufhw	$1, %xmm0, %xmm0
-	movupd	%xmm0, %xmm0
-	## InlineAsm End
-	leaq	1(%rcx), %rax
-	addq	$2, %rcx
-	cmpq	%rdi, %rcx
-	movq	%rax, %rcx
-	jl	LBB0_3
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-
-
-.subsections_via_symbols
-
-## Detailed Results
-{'arguments': (24655919,),
- 'frequency': 2600000000.0,
- 'iterations': 24655919,
- 'parallel_factor': 1,
- 'returned': [24655918, 24655918, 24655918, 24655918],
- 'runtimes': [0.13202582497615367,
-              0.13208268792368472,
-              0.13151856907643378,
-              0.13161470007617027]}
-minimal throughput: 13.87 cy
-## Selected Instructions
-VFMADD132PDYr, VPADDWYrr, VFMADD132PSYr, VPADDDYrr, VSUBPDYrr, VPACKUSDWYrr, VPMULHUWYrr, VMINPDYrr, VPUNPCKLWDYrr, VBLENDVPSYrr
-## Generated Assembly (1x parallel)
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.section	__TEXT,__literal4,4byte_literals
-	.p2align	2
-LCPI0_0:
-	.long	1065361408
-	.section	__TEXT,__text,regular,pure_instructions
-	.globl	_test
-	.p2align	4, 0x90
-_test:
-	.cfi_startproc
-	testq	%rdi, %rdi
-	jle	LBB0_1
-	movabsq	$LCPI0_0, %rax
-	vbroadcastss	(%rax), %ymm0
-	movq	$-1, %rcx
-	.p2align	4, 0x90
-LBB0_3:
-	## InlineAsm Start
-	vfmadd132pd	%ymm0, %ymm0, %ymm0
-	vpaddw	%ymm0, %ymm0, %ymm0
-	vfmadd132ps	%ymm0, %ymm0, %ymm0
-	vpaddd	%ymm0, %ymm0, %ymm0
-	vsubpd	%ymm0, %ymm0, %ymm0
-	vpackusdw	%ymm0, %ymm0, %ymm0
-	vpmulhuw	%ymm0, %ymm0, %ymm0
-	vminpd	%ymm0, %ymm0, %ymm0
-	vpunpcklwd	%ymm0, %ymm0, %ymm0
-	vblendvps	%ymm0, %ymm0, %ymm0, %ymm0
-	## InlineAsm End
-	leaq	1(%rcx), %rax
-	addq	$2, %rcx
-	cmpq	%rdi, %rcx
-	movq	%rax, %rcx
-	jl	LBB0_3
-	vzeroupper
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-
-
-.subsections_via_symbols
-
-## Detailed Results
-{'arguments': (10000000,),
- 'frequency': 2600000000.0,
- 'iterations': 10000000,
- 'parallel_factor': 1,
- 'returned': [9999999, 9999999, 9999999, 9999999],
- 'runtimes': [0.11892832000739872,
-              0.11891822703182697,
-              0.11902078497223556,
-              0.12094117503147572]}
-minimal throughput: 30.92 cy
-## Selected Instructions
-VCVTSI642SDrr, VFMADD213SDr, DIVSDrr, VCVTSI642SDrr, MAXSDrr, VFNMADD213SDr, VFMADD132SDr, VMAXSDrr, VFNMADD132SDr, SQRTSDr
-## Generated Assembly (1x parallel)
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.section	__TEXT,__literal8,8byte_literals
-	.p2align	3
-LCPI0_0:
-	.quad	4607186816846528512
-	.section	__TEXT,__text,regular,pure_instructions
-	.globl	_test
-	.p2align	4, 0x90
-_test:
-	.cfi_startproc
-	testq	%rdi, %rdi
-	jle	LBB0_1
-	movq	$-1, %rcx
-	movabsq	$LCPI0_0, %rax
-	vmovsd	(%rax), %xmm0
-	movl	$3, %edx
-	.p2align	4, 0x90
-LBB0_3:
-	## InlineAsm Start
-	vcvtsi2sdq	%rdx, %xmm0, %xmm0
-	vfmadd213sd	%xmm0, %xmm0, %xmm0
-	divsd	%xmm0, %xmm0
-	vcvtsi2sdq	%rdx, %xmm0, %xmm0
-	maxsd	%xmm0, %xmm0
-	vfnmadd213sd	%xmm0, %xmm0, %xmm0
-	vfmadd132sd	%xmm0, %xmm0, %xmm0
-	vmaxsd	%xmm0, %xmm0, %xmm0
-	vfnmadd132sd	%xmm0, %xmm0, %xmm0
-	sqrtsd	%xmm0, %xmm0
-	## InlineAsm End
-	leaq	1(%rcx), %rax
-	addq	$2, %rcx
-	cmpq	%rdi, %rcx
-	movq	%rax, %rcx
-	jl	LBB0_3
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-
-
-.subsections_via_symbols
-
-## Detailed Results
-{'arguments': (5841530,),
- 'frequency': 2600000000.0,
- 'iterations': 5841530,
- 'parallel_factor': 1,
- 'returned': [5841529, 5841529, 5841529, 5841529],
- 'runtimes': [0.13433505699504167,
-              0.13318849296774715,
-              0.13303690601605922,
-              0.13309408095665276]}
-minimal throughput: 59.21 cy
-## Selected Instructions
-RCPSSr, VCVTSI2SSrr, MULSSrr, VCVTSD2SSrr, VROUNDSSr, VRCPSSr, VCVTSI2SSrr, VSQRTSSr, VFNMADD231SSr, VSQRTSSr
-## Generated Assembly (1x parallel)
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.section	__TEXT,__literal4,4byte_literals
-	.p2align	2
-LCPI0_0:
-	.long	1065361408
-	.section	__TEXT,__literal8,8byte_literals
-	.p2align	3
-LCPI0_1:
-	.quad	4607186816846528512
-	.section	__TEXT,__text,regular,pure_instructions
-	.globl	_test
-	.p2align	4, 0x90
-_test:
-	.cfi_startproc
-	testq	%rdi, %rdi
-	jle	LBB0_1
-	movq	$-1, %rcx
-	movabsq	$LCPI0_0, %rax
-	vmovss	(%rax), %xmm1
-	movl	$3, %edx
-	movabsq	$LCPI0_1, %rax
-	vmovsd	(%rax), %xmm0
-	.p2align	4, 0x90
-LBB0_3:
-	## InlineAsm Start
-	rcpss	%xmm1, %xmm1
-	vcvtsi2ssl	%edx, %xmm1, %xmm1
-	mulss	%xmm1, %xmm1
-	vcvtsd2ss	%xmm0, %xmm1, %xmm1
-	vroundss	$1, %xmm1, %xmm1, %xmm1
-	vrcpss	%xmm1, %xmm1, %xmm1
-	vcvtsi2ssl	%edx, %xmm1, %xmm1
-	vsqrtss	%xmm1, %xmm1, %xmm1
-	vfnmadd231ss	%xmm1, %xmm1, %xmm1
-	vsqrtss	%xmm1, %xmm1, %xmm1
-	## InlineAsm End
-	leaq	1(%rcx), %rax
-	addq	$2, %rcx
-	cmpq	%rdi, %rcx
-	movq	%rax, %rcx
-	jl	LBB0_3
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-
-
-.subsections_via_symbols
-
-## Detailed Results
-{'arguments': (6011291,),
- 'frequency': 2600000000.0,
- 'iterations': 6011291,
- 'parallel_factor': 1,
- 'returned': [6011290, 6011290, 6011290, 6011290],
- 'runtimes': [0.13239118899218738,
-              0.13244657206814736,
-              0.1326694720191881,
-              0.13262002903502434]}
-minimal throughput: 57.26 cy
-## Selected Instructions
-ROR16ri, CMOVS16rr, SBB16ri, ADC16ri8, XOR16ri8, BTR16rr, XOR16ri8, SAR16r1, DEC16r, SUB16ri
-## Generated Assembly (1x parallel)
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.globl	_test
-	.p2align	4, 0x90
-_test:
-	.cfi_startproc
-	testq	%rdi, %rdi
-	jle	LBB0_1
-	movw	$3, %cx
-	movq	$-1, %rdx
-	.p2align	4, 0x90
-LBB0_3:
-	## InlineAsm Start
-	rorw	%cx
-	cmovsw	%cx, %cx
-	sbbw	$1, %cx
-	adcw	$1, %cx
-	xorw	$1, %cx
-	btrw	%cx, %cx
-	xorw	$1, %cx
-	sarw	%cx
-	decw	%cx
-	subw	$1, %cx
-	## InlineAsm End
-	leaq	1(%rdx), %rax
-	addq	$2, %rdx
-	cmpq	%rdi, %rdx
-	movq	%rax, %rdx
-	jl	LBB0_3
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-
-
-.subsections_via_symbols
-
-## Detailed Results
-{'arguments': (31283731,),
- 'frequency': 2600000000.0,
- 'iterations': 31283731,
- 'parallel_factor': 1,
- 'returned': [31283730, 31283730, 31283730, 31283730],
- 'runtimes': [0.13291946100071073,
-              0.13294463406782597,
-              0.1332225619116798,
-              0.13287500606384128]}
-minimal throughput: 11.04 cy
-## Selected Instructions
-SHLX32rr, CMOVO32rr, MOV32rr, CMOVS32rr, CRC32r32r8, SHR32r1, ADD32rr, CRC32r32r8, RCR32ri, SHR32r1
-## Generated Assembly (1x parallel)
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.globl	_test
-	.p2align	4, 0x90
-_test:
-	.cfi_startproc
-	testq	%rdi, %rdi
-	jle	LBB0_1
-	movl	$3, %esi
-	movq	$-1, %rdx
-	movb	$3, %cl
-	.p2align	4, 0x90
-LBB0_3:
-	## InlineAsm Start
-	shlxl	%esi, %esi, %eax
-	cmovol	%eax, %eax
-	movl	%eax, %esi
-	cmovsl	%esi, %esi
-	crc32b	%cl, %esi
-	shrl	%esi
-	addl	%esi, %esi
-	crc32b	%cl, %esi
-	rcrl	%esi
-	shrl	%esi
-	## InlineAsm End
-	leaq	1(%rdx), %rax
-	addq	$2, %rdx
-	cmpq	%rdi, %rdx
-	movq	%rax, %rdx
-	jl	LBB0_3
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-
-
-.subsections_via_symbols
-
-## Detailed Results
-{'arguments': (24008543,),
- 'frequency': 2600000000.0,
- 'iterations': 24008543,
- 'parallel_factor': 1,
- 'returned': [24008542, 24008542, 24008542, 24008542],
- 'runtimes': [0.13333229208365083,
-              0.13314284407533705,
-              0.13381975598167628,
-              0.13447994901798666]}
-minimal throughput: 14.42 cy
-## Selected Instructions
-SHRX64rr, SBB64ri32, AND64ri8, MOV64rc, INC64r, SUB64ri32, POPCNT64rr, OR64ri8, BTS64rr, ROL64ri
-## Generated Assembly (1x parallel)
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.globl	_test
-	.p2align	4, 0x90
-_test:
-	.cfi_startproc
-	testq	%rdi, %rdi
-	jle	LBB0_1
-	movq	$-1, %rcx
-	movl	$3, %edx
-	.p2align	4, 0x90
-LBB0_3:
-	## InlineAsm Start
-	shrxq	%rdx, %rdx, %rax
-	sbbq	$1, %rax
-	andq	$1, %rax
-	movq	%rax, %rax
-	incq	%rax
-	subq	$1, %rax
-	popcntq	%rax, %rdx
-	orq	$1, %rdx
-	btsq	%rdx, %rdx
-	rolq	%rdx
-	## InlineAsm End
-	leaq	1(%rcx), %rax
-	addq	$2, %rcx
-	cmpq	%rdi, %rcx
-	movq	%rax, %rcx
-	jl	LBB0_3
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-
-
-.subsections_via_symbols
-
-## Detailed Results
-{'arguments': (27539225,),
- 'frequency': 2600000000.0,
- 'iterations': 27539225,
- 'parallel_factor': 1,
- 'returned': [27539224, 27539224, 27539224, 27539224],
- 'runtimes': [0.1335972750093788,
-              0.13322542910464108,
-              0.13357082300353795,
-              0.13376462296582758]}
-minimal throughput: 12.58 cy
-## Selected Instructions
-SAR8r1, SHR8ri, INC8r, AND8rr, RCR8ri, ROL8ri, SUB8ri, SBB8rr, NEG8r, NOT8r
-## Generated Assembly (1x parallel)
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.globl	_test
-	.p2align	4, 0x90
-_test:
-	.cfi_startproc
-	testq	%rdi, %rdi
-	jle	LBB0_1
-	movb	$3, %cl
-	movq	$-1, %rdx
-	.p2align	4, 0x90
-LBB0_3:
-	## InlineAsm Start
-	sarb	%cl
-	shrb	%cl
-	incb	%cl
-	andb	%cl, %cl
-	rcrb	%cl
-	rolb	%cl
-	subb	$1, %cl
-	sbbb	%cl, %cl
-	negb	%cl
-	notb	%cl
-	## InlineAsm End
-	leaq	1(%rdx), %rax
-	addq	$2, %rdx
-	cmpq	%rdi, %rdx
-	movq	%rax, %rdx
-	jl	LBB0_3
-	retq
-LBB0_1:
-	xorl	%eax, %eax
-	retq
-	.cfi_endproc
-
-
-.subsections_via_symbols
-
-## Detailed Results
-{'arguments': (30431254,),
- 'frequency': 2600000000.0,
- 'iterations': 30431254,
- 'parallel_factor': 1,
- 'returned': [30431253, 30431253, 30431253, 30431253],
- 'runtimes': [0.13894746906589717,
-              0.1348069809610024,
-              0.13318019802682102,
-              0.13318415405228734]}
-minimal throughput: 11.38 cy
--- a/random_pf10.txt
+++ b/random_pf10.txt
--- a/tablegen.cprof
+++ b/tablegen.cprof
--- a/BIN
+++ b/BIN
--- a/test.o
+++ b/test.o
--- a/test.s
+++ b/test.s
@@ -1,136 +0,0 @@
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 13
-	.globl	_foo                    ## -- Begin function foo
-	.p2align	4, 0x90
-_foo:                                   ## @foo
-	.cfi_startproc
-## BB#0:
-	pushq	%rbp
-Lcfi0:
-	.cfi_def_cfa_offset 16
-Lcfi1:
-	.cfi_offset %rbp, -16
-	movq	%rsp, %rbp
-Lcfi2:
-	.cfi_def_cfa_register %rbp
-	xorl	%eax, %eax
-	testl	%edi, %edi
-	jle	LBB0_2
-	.p2align	4, 0x90
-LBB0_1:                                 ## =>This Inner Loop Header: Depth=1
-	## InlineAsm Start
-	addl	$23, %eax
-
-	## InlineAsm End
-	## InlineAsm Start
-	subl	$13, %eax
-
-	## InlineAsm End
-	## InlineAsm Start
-	subl	$10, %eax
-
-	## InlineAsm End
-	incl	%eax
-	cmpl	%edi, %eax
-	jl	LBB0_1
-LBB0_2:
-	popq	%rbp
-	retq
-	.cfi_endproc
-                                        ## -- End function
-	.section	__TEXT,__literal8,8byte_literals
-	.p2align	3               ## -- Begin function benchmark
-LCPI1_0:
-	.quad	4696837146684686336     ## double 1.0E+6
-	.section	__TEXT,__text,regular,pure_instructions
-	.globl	_benchmark
-	.p2align	4, 0x90
-_benchmark:                             ## @benchmark
-	.cfi_startproc
-## BB#0:
-	pushq	%rbp
-Lcfi3:
-	.cfi_def_cfa_offset 16
-Lcfi4:
-	.cfi_offset %rbp, -16
-	movq	%rsp, %rbp
-Lcfi5:
-	.cfi_def_cfa_register %rbp
-	pushq	%r14
-	pushq	%rbx
-	subq	$48, %rsp
-Lcfi6:
-	.cfi_offset %rbx, -32
-Lcfi7:
-	.cfi_offset %r14, -24
-	movq	%rsi, %r14
-	movss	%xmm0, -20(%rbp)        ## 4-byte Spill
-	movl	%edi, %ebx
-	leaq	-56(%rbp), %rdi
-	xorl	%esi, %esi
-	callq	_gettimeofday
-	movl	%ebx, %edi
-	callq	*%r14
-	leaq	-40(%rbp), %rdi
-	xorl	%esi, %esi
-	callq	_gettimeofday
-	movq	-40(%rbp), %rax
-	subq	-56(%rbp), %rax
-	cvtsi2sdq	%rax, %xmm1
-	movl	-32(%rbp), %eax
-	subl	-48(%rbp), %eax
-	xorps	%xmm0, %xmm0
-	cvtsi2sdl	%eax, %xmm0
-	mulsd	LCPI1_0(%rip), %xmm0
-	addsd	%xmm1, %xmm0
-	movss	-20(%rbp), %xmm1        ## 4-byte Reload
-                                        ## xmm1 = mem[0],zero,zero,zero
-	cvtss2sd	%xmm1, %xmm1
-	divsd	%xmm1, %xmm0
-	leaq	L_.str(%rip), %rdi
-	movb	$1, %al
-	callq	_printf
-	addq	$48, %rsp
-	popq	%rbx
-	popq	%r14
-	popq	%rbp
-	retq
-	.cfi_endproc
-                                        ## -- End function
-	.section	__TEXT,__literal4,4byte_literals
-	.p2align	2               ## -- Begin function main
-LCPI2_0:
-	.long	1326386456              ## float 2.4E+9
-	.section	__TEXT,__text,regular,pure_instructions
-	.globl	_main
-	.p2align	4, 0x90
-_main:                                  ## @main
-	.cfi_startproc
-## BB#0:
-	pushq	%rbp
-Lcfi8:
-	.cfi_def_cfa_offset 16
-Lcfi9:
-	.cfi_offset %rbp, -16
-	movq	%rsp, %rbp
-Lcfi10:
-	.cfi_def_cfa_register %rbp
-	movq	8(%rsi), %rdi
-	callq	_atoi
-	leaq	_foo(%rip), %rsi
-	movss	LCPI2_0(%rip), %xmm0    ## xmm0 = mem[0],zero,zero,zero
-	movl	%eax, %edi
-	callq	_benchmark
-	xorl	%eax, %eax
-	popq	%rbp
-	retq
-	.cfi_endproc
-                                        ## -- End function
-	.section	__TEXT,__cstring,cstring_literals
-L_.str:                                 ## @.str
-	.asciz	"%.3f (clock cycles)\n"
-
-	.comm	_latency,8,3            ## @latency
-	.comm	_ninst,8,3              ## @ninst
-
-.subsections_via_symbols
--- a/triad-instructions.numbers
+++ b/triad-instructions.numbers
--- a/triad-instructions.txt
+++ b/triad-instructions.txt
@@ -1,39 +0,0 @@
-ADD32ri
-ADD64ri32
-CMP32rm
-CMP32rr
-CMP64ri32
-CMP64rr
-INC64r
-MOVSX64rm32
-SUB32ri
-VADDPDYrm
-VADDSDrm
-VADDSDrr
-VADDSSrr
-VCVTSI642SSrr
-VCVTSS2SIrr_Int
-VFMADD213PDYr
-VFMADD213PDr
-VFMADD213PSYr
-VFMADD213PSr
-VFMADD213SDr
-VFMADD213SSr
-VINSERTF128rr
-VMULPDYrr
-VMULSDrm_Int
-VMULSDrr_Int
-VMULSSrr_Int
-VSUBPDYrm
-VSUBSDrm_Int
-VSUBSDrr_Int
-VSUBSSrr_Int
-
-MOV64mr (store)
-MOV32rm
-MOV64rm
-VMOVSD??? mem_xmm
-VMOVSD??? xmm_mem
-
-LEA32r <-- which ones?
-LEA64r <-- which ones?
--- a/triad-instructions_.txt
+++ b/triad-instructions_.txt
@@ -1,51 +0,0 @@
-add-r32_imd
-add-r64_imd
-inc-r64
-mov-mem_r64
-mov-r32_imd
-movslq-r64_r32
-sub-r32_imd
-vaddpd-avx
-vaddsd-xmm_xmm_xmm
-vaddss-xmm_xmm_xmm
-vcvtsi2ss-xmm_xmm_r32
-
-vcvtss2si-r32_xmm
-vfmadd213pd-avx
-vfmadd213pd-sse
-vfmadd213ps-avx
-vfmadd213ps-sse
-vfmadd213sd
-vfmadd213ss
-vinsertf128-ymm_ymm_imd
-
-vmulpd-ymm_ymm_ymm
-vmulsd-xmm_xmm_xmm
-vmulss-xmm_xmm_xmm
-vsubsd-xmm_xmm_xmm
-vsubss-xmm_xmm_xmm
-
-# LEAs:
-
-lea-r32_mem
-lea-r64_mem
-lea-r64_mem2
-
-# /w mem operand:
-
-mov-r32_mem
-mov-r64_mem
-vmovsd-mem_xmm
-vmovsd-xmm_mem
-vaddpd-ymm_ymm_mem
-vaddsd-xmm_xmm_mem
-vmulsd-xmm_xmm_mem
-vsubpd-ymm_ymm_mem
-vsubsd-xmm_xmm_mem
-
-# impossible to serialize:
-
-cmp-r32_r32
-cmp-r64_imd
-cmp-r64_r64
-cmp-r32_mem