Revert "fixed up to work with latest kerncraft"

This reverts commit 2ccfb0c9ea.
This commit is contained in:
Julian Hammer
2020-05-27 14:09:13 +02:00
parent 2ccfb0c9ea
commit 9c511f9ddf
80 changed files with 4 additions and 2327784 deletions

12
.idea/asmbench.iml generated
View File

@@ -1,12 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Nosetests" />
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
</component>
</module>

4
.idea/encodings.xml generated
View File

@@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

7
.idea/misc.xml generated
View File

@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.4 (/opt/local/bin/python)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated
View File

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/asmbench.iml" filepath="$PROJECT_DIR$/.idea/asmbench.iml" />
</modules>
</component>
</project>

469
.idea/workspace.xml generated
View File

@@ -1,469 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="ce9d0a71-6676-44f6-88f0-52583274be24" name="Default" comment="">
<change beforePath="$PROJECT_DIR$/.idea/vcs.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/vcs.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/doc/asmbench-SC18SRC-poster/sc18-src-poster.ai" beforeDir="false" afterPath="$PROJECT_DIR$/doc/asmbench-SC18SRC-poster/sc18-src-poster.ai" afterDir="false" />
<change beforePath="$PROJECT_DIR$/tablegen.py" beforeDir="false" afterPath="$PROJECT_DIR$/tablegen.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/asmbench/oldjit.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="12135">
<caret line="820" column="33" selection-start-line="820" selection-start-column="33" selection-end-line="820" selection-end-column="33" />
<folding>
<element signature="e#23#36#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/asmbench/sc18src.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="120">
<caret line="17" column="21" selection-start-line="17" selection-start-column="21" selection-end-line="17" selection-end-column="21" />
<folding>
<element signature="e#23#41#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tablegen.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1350">
<caret line="100" column="28" selection-start-line="100" selection-start-column="28" selection-end-line="100" selection-end-column="28" />
<folding>
<element signature="e#24#34#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/asmbench/streams.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1230">
<caret line="82" lean-forward="true" selection-start-line="82" selection-end-line="82" />
<folding>
<element signature="e#24#42#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/asmbench/op.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="3900">
<caret line="260" column="35" selection-start-line="260" selection-start-column="35" selection-end-line="260" selection-end-column="35" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/asmbench/bench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="15" selection-start-line="3" selection-end-line="4" />
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Setup Script" />
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>build_ir</find>
<find>combined_instructions</find>
<find>random</find>
<find>serial</find>
<find>IntegerLoopBenchmark</find>
<find>latenchy</find>
<find>iaca_markers</find>
<find>setup_llvm</find>
<find>get_iaca_analysis</find>
<find>get_target_machine</find>
<find>foo</find>
<find>instructions_ret_type</find>
<find>iaca</find>
<find>get_registers</find>
<find>sop_t</find>
<find>AddressGenerationBenchmark</find>
<find>lea</find>
<find>,)</find>
<find>prepare_arguments</find>
<find>VSUBSSrr</find>
<find>build_and_execute</find>
<find>jit.</find>
<find>fn</find>
<find>asmjit</find>
<find>ValueError</find>
<find>split_llvm_vector_type</find>
<find>get_default_init_values</find>
<find>llvm</find>
<find>self.init_val</find>
<find>i64</find>
</findStrings>
<replaceStrings>
<replace>generate_register_nameing</replace>
<replace>naming</replace>
<replace>iaca_marker</replace>
<replace>jit.AddressGenerationBenchmark</replace>
<replace>)</replace>
<replace>oldjit.</replace>
<replace>asmbench</replace>
</replaceStrings>
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/debug_avx_feature.py" />
<option value="$PROJECT_DIR$/op.py" />
<option value="$PROJECT_DIR$/bench.py" />
<option value="$PROJECT_DIR$/asmjit/__init__.py" />
<option value="$PROJECT_DIR$/jit.py" />
<option value="$PROJECT_DIR$/tablegen.py" />
<option value="$PROJECT_DIR$/dev_test/reproduce.py" />
<option value="$PROJECT_DIR$/asmjit/__main__.py" />
<option value="$PROJECT_DIR$/asmjit/op.py" />
<option value="$PROJECT_DIR$/README.md" />
<option value="$PROJECT_DIR$/asmjit/bench.py" />
<option value="$PROJECT_DIR$/run_SC18_SRC.py" />
<option value="$PROJECT_DIR$/asmjit/sc18src.py" />
<option value="$PROJECT_DIR$/README.md" />
<option value="$PROJECT_DIR$/doc/sc18src_artifact_appendix.md" />
<option value="$PROJECT_DIR$/README.rst" />
<option value="$PROJECT_DIR$/MANIFEST.in" />
<option value="$PROJECT_DIR$/setup.py" />
<option value="$PROJECT_DIR$/setup.py" />
<option value="$PROJECT_DIR$/asmbench/bench.py" />
<option value="$PROJECT_DIR$/asmbench/op.py" />
<option value="$APPLICATION_CONFIG_DIR$/scratches/scratch.py" />
<option value="$PROJECT_DIR$/asmbench/streams.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds">
<option name="x" value="971" />
<option name="y" value="-1669" />
<option name="width" value="1241" />
<option name="height" value="1669" />
</component>
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="asmbench" type="b2602c69:ProjectViewProjectNode" />
<item name="asmbench" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="asmbench" type="b2602c69:ProjectViewProjectNode" />
<item name="asmbench" type="462c0819:PsiDirectoryNode" />
<item name="asmbench" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
</panes>
</component>
<component name="PropertiesComponent">
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="com.intellij.ide.scratch.LRUPopupBuilder$1/New Scratch File" value="Python" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
<property name="run.code.analysis.last.selected.profile" value="pProject Default" />
<property name="settings.editor.selected.configurable" value="editor.preferences.completion" />
</component>
<component name="PyConsoleOptionsProvider">
<option name="myPythonConsoleState">
<console-settings is-module-sdk="true">
<option name="myUseModuleSdk" value="true" />
</console-settings>
</option>
</component>
<component name="RecentsManager">
<key name="MoveFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/asmjit" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="ce9d0a71-6676-44f6-88f0-52583274be24" name="Default" comment="" />
<created>1528185911695</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1528185911695</updated>
<workItem from="1549014553562" duration="10478000" />
<workItem from="1549470823118" duration="191000" />
<workItem from="1549577395449" duration="719000" />
<workItem from="1549629861489" duration="622000" />
<workItem from="1549636051326" duration="400000" />
<workItem from="1550675127118" duration="4866000" />
<workItem from="1553613650758" duration="756000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="18032000" />
</component>
<component name="ToolWindowManager">
<frame x="971" y="-1669" width="1241" height="1669" extended-state="0" />
<editor active="true" />
<layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25771475" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" weight="0.32980832" />
<window_info anchor="bottom" id="Run" order="2" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
<window_info anchor="bottom" id="Database Changes" order="8" show_stripe_button="false" />
<window_info anchor="bottom" id="Terminal" order="9" />
<window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
<window_info anchor="bottom" id="Version Control" order="11" />
<window_info anchor="bottom" id="Messages" order="12" />
<window_info anchor="bottom" id="Python Console" order="13" />
<window_info active="true" anchor="bottom" id="Inspection Results" order="14" visible="true" weight="0.32980832" />
<window_info anchor="right" id="Commander" order="0" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
<window_info anchor="right" id="SciView" order="3" />
<window_info anchor="right" id="Database" order="4" />
</layout>
<layout-to-restore>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.23436196" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" weight="0.32980832" />
<window_info anchor="bottom" id="Run" order="2" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
<window_info anchor="bottom" id="Version Control" order="8" />
<window_info anchor="bottom" id="Database Changes" order="9" show_stripe_button="false" />
<window_info anchor="bottom" id="Python Console" order="10" />
<window_info anchor="bottom" id="Terminal" order="11" />
<window_info anchor="bottom" id="Event Log" order="12" side_tool="true" />
<window_info anchor="bottom" id="Messages" order="13" />
<window_info active="true" anchor="bottom" id="Inspection Results" order="14" visible="true" weight="0.32980832" />
<window_info anchor="right" id="Commander" order="0" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
<window_info anchor="right" id="SciView" order="3" />
<window_info anchor="right" id="Database" order="4" />
</layout-to-restore>
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="1" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/asmbench/bench.py</url>
<line>1</line>
<option name="timeStamp" value="3" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/debug_avx_feature.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="30">
<caret line="2" selection-start-line="2" selection-end-line="2" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/dev_test/reproduce.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="705">
<caret line="46" column="33" selection-start-line="46" selection-start-column="33" selection-end-line="46" selection-end-column="33" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_skylapesp2.txt" />
<entry file="file://$PROJECT_DIR$/SC18_SRC_skylapesp2.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="135">
<caret line="9" column="43" lean-forward="true" selection-start-line="9" selection-start-column="43" selection-end-line="9" selection-end-column="43" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_naples1.txt" />
<entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_skylakesp2.txt" />
<entry file="file://$PROJECT_DIR$/README.md" />
<entry file="file://$PROJECT_DIR$/MANIFEST.in">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="15">
<caret line="1" column="18" selection-start-line="1" selection-start-column="18" selection-end-line="1" selection-end-column="18" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/doc/sc18src_artifact_appendix.md">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="15">
<caret line="1" lean-forward="true" selection-start-line="1" selection-end-line="1" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/README.rst">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="105">
<caret line="7" selection-start-line="7" selection-end-line="7" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/setup.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="420">
<caret line="28" lean-forward="true" selection-start-line="28" selection-end-line="28" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/build/lib/asmbench/bench.py" />
<entry file="file://$PROJECT_DIR$/build/lib/asmjit/bench.py" />
<entry file="file://$PROJECT_DIR$/asmbench/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$APPLICATION_CONFIG_DIR$/scratches/scratch.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/asmbench/__main__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="255">
<caret line="21" column="21" selection-start-line="21" selection-start-column="21" selection-end-line="21" selection-end-column="21" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/run_SC18_SRC.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-2084">
<caret line="2" column="16" selection-start-line="2" selection-start-column="16" selection-end-line="2" selection-end-column="16" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/asmbench/oldjit.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="12135">
<caret line="820" column="33" selection-start-line="820" selection-start-column="33" selection-end-line="820" selection-end-column="33" />
<folding>
<element signature="e#23#36#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/asmbench/sc18src.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="120">
<caret line="17" column="21" selection-start-line="17" selection-start-column="21" selection-end-line="17" selection-end-column="21" />
<folding>
<element signature="e#23#41#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tablegen.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1350">
<caret line="100" column="28" selection-start-line="100" selection-start-column="28" selection-end-line="100" selection-end-column="28" />
<folding>
<element signature="e#24#34#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/asmbench/op.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="3900">
<caret line="260" column="35" selection-start-line="260" selection-start-column="35" selection-end-line="260" selection-end-column="35" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/asmbench/bench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="15" selection-start-line="3" selection-end-line="4" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/asmbench/streams.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1230">
<caret line="82" lean-forward="true" selection-start-line="82" selection-end-line="82" />
<folding>
<element signature="e#24#42#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
<component name="masterDetails">
<states>
<state key="ScopeChooserConfigurable.UI">
<settings>
<splitter-proportions>
<option name="proportions">
<list>
<option value="0.2" />
</list>
</option>
</splitter-proportions>
</settings>
</state>
</states>
</component>
</project>

View File

View File

@@ -1,261 +0,0 @@
ADD32ri LAT 1.001 cy TP 0.293 cy
ADD64ri32 LAT 1.001 cy TP 0.295 cy
INC64r LAT 1.000 cy TP 0.314 cy
MOV64ri32 LAT 0.535 cy TP 0.354 cy
SUB32ri LAT 1.001 cy TP 0.330 cy
VADDPDYrr LAT 4.002 cy TP 0.523 cy
VADDSDrr LAT 4.002 cy TP 0.523 cy
VADDSSrr LAT 4.002 cy TP 0.523 cy
VCVTSI642SSrr LAT 2.001 cy TP 2.001 cy
VFMADD213PDYr LAT 4.002 cy TP 0.523 cy
VFMADD213PDr LAT 4.002 cy TP 0.523 cy
VFMADD213PSYr LAT 4.002 cy TP 0.523 cy
VFMADD213PSr LAT 4.002 cy TP 0.523 cy
VFMADD213SDr LAT 4.002 cy TP 0.523 cy
VFMADD213SSr LAT 4.002 cy TP 0.523 cy
VINSERTF128rr LAT 3.001 cy TP 1.000 cy
VMULPDYrr LAT 4.002 cy TP 0.523 cy
VMULSDrr LAT 4.002 cy TP 0.523 cy
VMULSSrr LAT 4.002 cy TP 0.523 cy
VSUBSDrr LAT 4.002 cy TP 0.523 cy
VSUBSSrr LAT 4.002 cy TP 0.523 cy
lea_b LAT 0.600 cy TP 0.550 cy
lea_b+off LAT 0.600 cy TP 0.550 cy
lea_idx*w LAT 0.600 cy TP 0.550 cy
lea_off+idx*w LAT 0.600 cy TP 0.550 cy
lea_b+idx*w LAT 1.000 cy TP 0.601 cy
lea_b+off+idx*w LAT 3.001 cy TP 1.000 cy
LD_linear LAT 2.006 cy TP 0.502 cy
LD_random LAT 2.006 cy TP 0.502 cy
ADD32ri ADD32ri LAT 1.086 cy TP 0.614 cy SPM 1.09
ADD32ri ADD64ri32 LAT 1.086 cy TP 0.614 cy SPM 1.09
ADD32ri INC64r LAT 1.086 cy TP 0.629 cy SPM 1.08
ADD32ri MOV64ri32 LAT 1.000 cy TP 0.603 cy SPM 0.85
ADD32ri SUB32ri LAT 1.086 cy TP 0.614 cy SPM 0.97
ADD32ri VADDPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VADDSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VADDSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VCVTSI642SSrr LAT 2.001 cy TP 2.001 cy SPM 0.00
ADD32ri VFMADD213PDYr LAT 4.002 cy TP 0.581 cy SPM 0.20
ADD32ri VFMADD213PDr LAT 4.002 cy TP 0.582 cy SPM 0.20
ADD32ri VFMADD213PSYr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VFMADD213PSr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VFMADD213SDr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VFMADD213SSr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VINSERTF128rr LAT 3.001 cy TP 1.000 cy SPM -0.00
ADD32ri VMULPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VMULSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VMULSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VSUBSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD32ri VSUBSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 ADD64ri32 LAT 1.086 cy TP 0.611 cy SPM 1.07
ADD64ri32 INC64r LAT 1.086 cy TP 0.605 cy SPM 0.99
ADD64ri32 MOV64ri32 LAT 1.000 cy TP 0.578 cy SPM 0.76
ADD64ri32 SUB32ri LAT 1.086 cy TP 0.611 cy SPM 0.95
ADD64ri32 VADDPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VADDSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VADDSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VCVTSI642SSrr LAT 2.001 cy TP 1.000 cy SPM -3.39
ADD64ri32 VFMADD213PDYr LAT 4.002 cy TP 0.581 cy SPM 0.20
ADD64ri32 VFMADD213PDr LAT 4.002 cy TP 0.581 cy SPM 0.20
ADD64ri32 VFMADD213PSYr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VFMADD213PSr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VFMADD213SDr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VFMADD213SSr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VINSERTF128rr LAT 3.002 cy TP 1.001 cy SPM 0.00
ADD64ri32 VMULPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VMULSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VMULSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VSUBSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
ADD64ri32 VSUBSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
INC64r INC64r LAT 1.086 cy TP 0.611 cy SPM 0.95
INC64r MOV64ri32 LAT 1.000 cy TP 0.588 cy SPM 0.74
INC64r SUB32ri LAT 1.086 cy TP 0.609 cy SPM 0.89
INC64r VADDPDYrr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VADDSDrr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VADDSSrr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VCVTSI642SSrr LAT 2.001 cy TP 1.000 cy SPM -3.19
INC64r VFMADD213PDYr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VFMADD213PDr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VFMADD213PSYr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VFMADD213PSr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VFMADD213SDr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VFMADD213SSr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VINSERTF128rr LAT 3.001 cy TP 1.000 cy SPM 0.00
INC64r VMULPDYrr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VMULSDrr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VMULSSrr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VSUBSDrr LAT 4.002 cy TP 0.564 cy SPM 0.13
INC64r VSUBSSrr LAT 4.002 cy TP 0.564 cy SPM 0.13
MOV64ri32 MOV64ri32 LAT 0.657 cy TP 0.578 cy SPM 0.63
MOV64ri32 SUB32ri LAT 1.000 cy TP 0.578 cy SPM 0.68
MOV64ri32 VADDPDYrr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VADDSDrr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VADDSSrr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VCVTSI642SSrr LAT 2.001 cy TP 1.001 cy SPM -2.83
MOV64ri32 VFMADD213PDYr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VFMADD213PDr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VFMADD213PSYr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VFMADD213PSr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VFMADD213SDr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VFMADD213SSr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VINSERTF128rr LAT 3.001 cy TP 1.001 cy SPM 0.00
MOV64ri32 VMULPDYrr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VMULSDrr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VMULSSrr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VSUBSDrr LAT 4.002 cy TP 0.557 cy SPM 0.10
MOV64ri32 VSUBSSrr LAT 4.002 cy TP 0.557 cy SPM 0.10
SUB32ri SUB32ri LAT 1.086 cy TP 0.611 cy SPM 0.85
SUB32ri VADDPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VADDSDrr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VADDSSrr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VCVTSI642SSrr LAT 2.001 cy TP 1.000 cy SPM -3.03
SUB32ri VFMADD213PDYr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VFMADD213PDr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VFMADD213PSYr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VFMADD213PSr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VFMADD213SDr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VFMADD213SSr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VINSERTF128rr LAT 3.001 cy TP 1.000 cy SPM -0.00
SUB32ri VMULPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VMULSDrr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VMULSSrr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VSUBSDrr LAT 4.002 cy TP 0.592 cy SPM 0.21
SUB32ri VSUBSSrr LAT 4.002 cy TP 0.961 cy SPM 1.33
VADDPDYrr VADDPDYrr LAT 4.002 cy TP 1.036 cy SPM 0.98
VADDPDYrr VADDSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VADDPDYrr VADDSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VADDPDYrr VCVTSI642SSrr LAT 4.002 cy TP 2.001 cy SPM -0.00
VADDPDYrr VFMADD213PDYr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDPDYrr VFMADD213PDr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDPDYrr VFMADD213PSYr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDPDYrr VFMADD213PSr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDPDYrr VFMADD213SDr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDPDYrr VFMADD213SSr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDPDYrr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
VADDPDYrr VMULPDYrr LAT 4.002 cy TP 1.036 cy SPM 0.98
VADDPDYrr VMULSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VADDPDYrr VMULSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VADDPDYrr VSUBSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VADDPDYrr VSUBSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VADDSDrr VADDSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VADDSDrr VADDSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VADDSDrr VCVTSI642SSrr LAT 4.002 cy TP 2.001 cy SPM 0.00
VADDSDrr VFMADD213PDYr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSDrr VFMADD213PDr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSDrr VFMADD213PSYr LAT 4.002 cy TP 1.030 cy SPM 0.97
VADDSDrr VFMADD213PSr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSDrr VFMADD213SDr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSDrr VFMADD213SSr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSDrr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
VADDSDrr VMULPDYrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VADDSDrr VMULSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VADDSDrr VMULSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VADDSDrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VADDSDrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VADDSSrr VADDSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VADDSSrr VCVTSI642SSrr LAT 4.002 cy TP 2.001 cy SPM 0.00
VADDSSrr VFMADD213PDYr LAT 4.002 cy TP 1.030 cy SPM 0.97
VADDSSrr VFMADD213PDr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSSrr VFMADD213PSYr LAT 4.002 cy TP 1.030 cy SPM 0.97
VADDSSrr VFMADD213PSr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSSrr VFMADD213SDr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSSrr VFMADD213SSr LAT 4.002 cy TP 1.029 cy SPM 0.97
VADDSSrr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
VADDSSrr VMULPDYrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VADDSSrr VMULSDrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VADDSSrr VMULSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VADDSSrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VADDSSrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VCVTSI642SSrr VCVTSI642SSrr LAT 4.002 cy TP 4.002 cy SPM 1.00
VCVTSI642SSrr VFMADD213PDYr LAT 4.002 cy TP 2.001 cy SPM -0.00
VCVTSI642SSrr VFMADD213PDr LAT 4.002 cy TP 2.001 cy SPM -0.00
VCVTSI642SSrr VFMADD213PSYr LAT 4.002 cy TP 2.001 cy SPM -0.00
VCVTSI642SSrr VFMADD213PSr LAT 4.002 cy TP 2.001 cy SPM 0.00
VCVTSI642SSrr VFMADD213SDr LAT 4.002 cy TP 2.001 cy SPM -0.00
VCVTSI642SSrr VFMADD213SSr LAT 4.002 cy TP 2.147 cy SPM 0.28
VCVTSI642SSrr VINSERTF128rr LAT 3.002 cy TP 3.001 cy SPM 1.00
VCVTSI642SSrr VMULPDYrr LAT 4.002 cy TP 2.001 cy SPM -0.00
VCVTSI642SSrr VMULSDrr LAT 4.002 cy TP 2.001 cy SPM 0.00
VCVTSI642SSrr VMULSSrr LAT 4.002 cy TP 2.001 cy SPM -0.00
VCVTSI642SSrr VSUBSDrr LAT 4.002 cy TP 2.001 cy SPM 0.00
VCVTSI642SSrr VSUBSSrr LAT 4.002 cy TP 2.001 cy SPM -0.00
VFMADD213PDYr VFMADD213PDYr LAT 4.002 cy TP 1.047 cy SPM 1.00
VFMADD213PDYr VFMADD213PDr LAT 4.002 cy TP 1.045 cy SPM 1.00
VFMADD213PDYr VFMADD213PSYr LAT 4.002 cy TP 1.047 cy SPM 1.00
VFMADD213PDYr VFMADD213PSr LAT 4.002 cy TP 1.045 cy SPM 1.00
VFMADD213PDYr VFMADD213SDr LAT 4.002 cy TP 1.045 cy SPM 1.00
VFMADD213PDYr VFMADD213SSr LAT 4.002 cy TP 1.045 cy SPM 1.00
VFMADD213PDYr VINSERTF128rr LAT 4.002 cy TP 1.001 cy SPM 0.00
VFMADD213PDYr VMULPDYrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PDYr VMULSDrr LAT 4.002 cy TP 1.029 cy SPM 0.97
VFMADD213PDYr VMULSSrr LAT 4.002 cy TP 1.029 cy SPM 0.97
VFMADD213PDYr VSUBSDrr LAT 4.002 cy TP 1.029 cy SPM 0.97
VFMADD213PDYr VSUBSSrr LAT 4.002 cy TP 1.029 cy SPM 0.97
VFMADD213PDr VFMADD213PDr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213PDr VFMADD213PSYr LAT 4.002 cy TP 1.045 cy SPM 1.00
VFMADD213PDr VFMADD213PSr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213PDr VFMADD213SDr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213PDr VFMADD213SSr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213PDr VINSERTF128rr LAT 4.002 cy TP 0.675 cy SPM -0.62
VFMADD213PDr VMULPDYrr LAT 4.002 cy TP 1.026 cy SPM 0.96
VFMADD213PDr VMULSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PDr VMULSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PDr VSUBSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PDr VSUBSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PSYr VFMADD213PSYr LAT 4.002 cy TP 1.047 cy SPM 1.00
VFMADD213PSYr VFMADD213PSr LAT 4.002 cy TP 1.045 cy SPM 1.00
VFMADD213PSYr VFMADD213SDr LAT 4.002 cy TP 1.045 cy SPM 1.00
VFMADD213PSYr VFMADD213SSr LAT 4.002 cy TP 1.045 cy SPM 1.00
VFMADD213PSYr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
VFMADD213PSYr VMULPDYrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PSYr VMULSDrr LAT 4.002 cy TP 1.029 cy SPM 0.97
VFMADD213PSYr VMULSSrr LAT 4.002 cy TP 1.029 cy SPM 0.97
VFMADD213PSYr VSUBSDrr LAT 4.002 cy TP 1.029 cy SPM 0.97
VFMADD213PSYr VSUBSSrr LAT 4.002 cy TP 1.029 cy SPM 0.97
VFMADD213PSr VFMADD213PSr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213PSr VFMADD213SDr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213PSr VFMADD213SSr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213PSr VINSERTF128rr LAT 4.002 cy TP 0.675 cy SPM -0.62
VFMADD213PSr VMULPDYrr LAT 4.002 cy TP 1.026 cy SPM 0.96
VFMADD213PSr VMULSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PSr VMULSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PSr VSUBSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213PSr VSUBSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213SDr VFMADD213SDr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213SDr VFMADD213SSr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213SDr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
VFMADD213SDr VMULPDYrr LAT 4.002 cy TP 1.026 cy SPM 0.96
VFMADD213SDr VMULSDrr LAT 4.002 cy TP 1.156 cy SPM 1.21
VFMADD213SDr VMULSSrr LAT 4.002 cy TP 1.156 cy SPM 1.21
VFMADD213SDr VSUBSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213SDr VSUBSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213SSr VFMADD213SSr LAT 4.002 cy TP 1.046 cy SPM 1.00
VFMADD213SSr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
VFMADD213SSr VMULPDYrr LAT 4.002 cy TP 1.026 cy SPM 0.96
VFMADD213SSr VMULSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213SSr VMULSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213SSr VSUBSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VFMADD213SSr VSUBSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
VINSERTF128rr VINSERTF128rr LAT 3.001 cy TP 2.001 cy SPM 1.00
VINSERTF128rr VMULPDYrr LAT 4.002 cy TP 1.000 cy SPM -0.00
VINSERTF128rr VMULSDrr LAT 4.002 cy TP 1.000 cy SPM -0.00
VINSERTF128rr VMULSSrr LAT 4.002 cy TP 1.000 cy SPM -0.00
VINSERTF128rr VSUBSDrr LAT 4.002 cy TP 1.000 cy SPM -0.00
VINSERTF128rr VSUBSSrr LAT 4.002 cy TP 1.000 cy SPM -0.00
VMULPDYrr VMULPDYrr LAT 4.002 cy TP 1.036 cy SPM 0.98
VMULPDYrr VMULSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VMULPDYrr VMULSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VMULPDYrr VSUBSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VMULPDYrr VSUBSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
VMULSDrr VMULSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VMULSDrr VMULSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VMULSDrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VMULSDrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VMULSSrr VMULSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VMULSSrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VMULSSrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VSUBSDrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
VSUBSDrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
VSUBSSrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
[likwid-pin] Main PID -> core 0 - OK

Binary file not shown.

BIN
a.out

Binary file not shown.

View File

@@ -1,20 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>English</string>
<key>CFBundleIdentifier</key>
<string>com.apple.xcode.dsym.a.out</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundlePackageType</key>
<string>dSYM</string>
<key>CFBundleSignature</key>
<string>????</string>
<key>CFBundleShortVersionString</key>
<string>1.0</string>
<key>CFBundleVersion</key>
<string>1</string>
</dict>
</plist>

View File

@@ -1,29 +0,0 @@
Metadata-Version: 2.1
Name: asmbench
Version: 0.1.4
Summary: A Benchmark Toolkit for Assembly Instructions Using the LLVM JIT
Home-page: https://github.com/RRZE-HPC/asmbench
Author: Julian Hammer
Author-email: julian.hammer@fau.de
License: AGPLv3
Description: asmbench
========
A benchmark toolkit for assembly instructions using the LLVM JIT.
Usage
=====
To benchmark latency and throughput of a 64bit integer add use the following command:
``python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}'``
To benchmark two instructions interleaved use this:
``python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}' 'sub {src:i64:r}, {srcdst:i64:r}'``
To find out more add `-h` for help and `-v` for verbose mode.
Platform: UNKNOWN
Provides-Extra: iaca
Provides-Extra: sc18src

View File

@@ -1,17 +0,0 @@
LICENSE
MANIFEST.in
README.rst
setup.py
asmbench/__init__.py
asmbench/__main__.py
asmbench/bench.py
asmbench/oldjit.py
asmbench/op.py
asmbench/sc18src.py
asmbench/streams.py
asmbench.egg-info/PKG-INFO
asmbench.egg-info/SOURCES.txt
asmbench.egg-info/dependency_links.txt
asmbench.egg-info/entry_points.txt
asmbench.egg-info/requires.txt
asmbench.egg-info/top_level.txt

View File

@@ -1 +0,0 @@

View File

@@ -1,3 +0,0 @@
[console_scripts]
asmbench = asmbench.__main__:main

View File

@@ -1,9 +0,0 @@
llvmlite>=0.23.2
psutil
[iaca]
kerncraft
[sc18src]
numpy
matplotlib

View File

@@ -1 +0,0 @@
asmbench

View File

@@ -12,9 +12,9 @@ import sys
import llvmlite.binding as llvm
import psutil
try:
from kerncraft import incode_model
from kerncraft import iaca
except ImportError:
incode_model = None
iaca = None
from . import op
@@ -87,13 +87,13 @@ class Benchmark:
def get_iaca_analysis(self, arch):
"""Compile and return IACA analysis."""
if incode_model is None:
if iaca is None:
raise ValueError("kerncraft not installed. IACA analysis is not supported.")
tm = self.get_target_machine()
tmpf = tempfile.NamedTemporaryFile("wb")
tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
tmpf.flush()
return incode_model.iaca_analyse_instrumented_binary(tmpf.name, arch)
return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)
def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
# Compile the module to machine code using MCJIT

View File

@@ -1,82 +0,0 @@
#!/usr/bin/env python3
import collections
import itertools
import socket
import textwrap
import numpy
import matplotlib.pyplot as plt
import matplotlib as mpl
from asmbench import op, bench
from asmbench import oldjit
type_size = {
'i32': 4,
'i64': 8,
'f32': 4,
'float': 4,
'f64': 8,
'double': 8,
}
class StreamsBenchmark(bench.Benchmark):
    """Counting-loop benchmark that records a memory-stream configuration.

    The stream parameters are only stored on the instance; the IR produced by
    :meth:`build_ir` currently has an empty loop body and no pointer arguments.
    """

    def __init__(self,
                 read_streams=0, read_write_streams=0, write_streams=0,
                 stream_byte_length=0,
                 element_type='i64'):
        """Store the stream configuration after initializing the base benchmark."""
        super().__init__()
        self.read_streams = read_streams
        self.read_write_streams = read_write_streams
        self.write_streams = write_streams
        self.stream_byte_length = stream_byte_length
        self.element_type = element_type

    def build_ir(self, iaca_marker=False):
        """Return the LLVM IR of the loop as a string.

        :param iaca_marker: if true, wrap the loop body in the inline-assembly
            start/stop marker sequences.
        """
        start_marker = stop_marker = ''
        if iaca_marker:
            # Both markers share the same trailing byte sequence and differ only
            # in the constant moved into ebx.
            marker_bytes = 'call void asm ".byte 100,103,144", ""()'
            start_marker = 'call void asm "movl $$111,%ebx", ""()\n' + marker_bytes
            stop_marker = 'call void asm "movl $$222,%ebx", ""()\n' + marker_bytes

        template = textwrap.dedent('''\
            define i64 @"test"(i64 %"N"{pointer_arguments})
            {{
            entry:
            %"loop_cond" = icmp slt i64 0, %"N"
            br i1 %"loop_cond", label %"loop", label %"end"
            loop:
            %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {iaca_start_marker}
            {loop_body}
            %"loop_counter.1" = add i64 %"loop_counter", 1
            %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
            br i1 %"loop_cond.1", label %"loop", label %"end"
            end:
            %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
            {iaca_stop_marker}
            ret i64 %"ret"
            }}
            ''')
        return template.format(
            pointer_arguments='',
            loop_body='',
            iaca_start_marker=start_marker,
            iaca_stop_marker=stop_marker)
# Script entry point: initialize LLVM, then build, run and print the result of a
# StreamsBenchmark created with its default (all-zero) stream configuration.
if __name__ == '__main__':
bench.setup_llvm()
sb = StreamsBenchmark()
print(sb.build_and_execute())

View File

@@ -1 +0,0 @@
__version__ = '0.1.4'

View File

@@ -1,48 +0,0 @@
#!/usr/bin/env python3
import argparse
import psutil
import llvmlite.binding as llvm
from . import op, bench
def main():
    """Command line entry point: benchmark latency and throughput of instructions.

    Parses the command line, initializes LLVM, runs
    :func:`bench.bench_instructions` and prints the measured latency and
    throughput in cycles.
    """
    parser = argparse.ArgumentParser(description='Assembly Instruction Benchmark Toolkit')
    # parser.add_argument('mode', metavar='MODE', type=str, choices=['latency', 'throughput'])
    parser.add_argument('instructions', metavar='INSTR', type=op.Instruction.from_string, nargs='+',
                        help='instruction declaration, e.g., "add {src:i32:r}, {srcdst:i32:r}"')
    parser.add_argument('--serialize', action='store_true',
                        help='Serialize instructions.')
    parser.add_argument('--latency-serial', '-l', type=int, default=8,
                        help='length of serial chain for each instruction in latency benchmark')
    parser.add_argument('--parallel', '-p', type=int, default=10,
                        help='number of parallel instances of serial chains in throughput '
                             'benchmark')
    parser.add_argument('--throughput-serial', '-t', type=int, default=8,
                        help='length of serial instances of serial chains in throughput benchmark')
    parser.add_argument('--iaca', type=str, default=None,
                        help='Compare throughput measurement with IACA analysis, pass '
                             'micro-architecture abbreviation. (e.g., SNB, IVB, HSW, SKL, SKX)')
    parser.add_argument("--verbose", "-v", action="count", default=0,
                        help="increase output verbosity")
    # The frequency becomes mandatory when psutil cannot determine it.
    parser.add_argument('-f', '--frequency', type=float, required=psutil.cpu_freq() is None,
                        help='Frequency in GHz. Must be provided if psutil.cpu_freq() does not '
                             'report anything.')
    args = parser.parse_args()

    if args.frequency:
        # Convert GHz to Hz.
        args.frequency *= 1e9

    bench.setup_llvm()
    lat, tp = bench.bench_instructions(args.instructions,
                                       serial_factor=args.latency_serial,
                                       parallel_factor=args.parallel,
                                       throughput_serial_factor=args.throughput_serial,
                                       serialize=args.serialize,
                                       verbosity=args.verbose,
                                       iaca_comparison=args.iaca,
                                       frequency=args.frequency)
    print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))
if __name__ == "__main__":
main()

View File

@@ -1,399 +0,0 @@
#!/usr/bin/env python3
import ctypes
import time
import textwrap
import itertools
import re
from pprint import pprint
import tempfile
import subprocess
import sys
import llvmlite.binding as llvm
import psutil
try:
from kerncraft import iaca
except ImportError:
iaca = None
from . import op
def setup_llvm():
    """Run the llvmlite initialization routines needed before building benchmarks."""
    # Looked up and invoked one after another, in this exact order.
    for initializer_name in ('initialize',
                             'initialize_native_target',
                             'initialize_native_asmprinter',
                             'initialize_native_asmparser'):
        getattr(llvm, initializer_name)()
def uniquify(l):
    """Return the items of *l* as a list without duplicates, keeping first-seen order."""
    already_seen = set()
    unique_items = []
    for item in l:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items
class Benchmark:
    """Base class for JIT-compiled benchmarks.

    Subclasses provide the LLVM IR through :meth:`build_ir`. The IR must define
    a function named ``test``, which :meth:`build_and_execute` compiles with
    MCJIT, calls repeatedly and times.
    """

    def __init__(self, frequency=None):
        """
        :param frequency: clock frequency stored on the instance and reported in
            results; if falsy, ``psutil.cpu_freq().max * 1e6`` is used.
        """
        self.frequency = frequency or psutil.cpu_freq().max * 1e6

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all non-underscore attributes."""
        return '{}({})'.format(
            self.__class__.__name__,
            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
                       if not k.startswith('_')]))

    @staticmethod
    def prepare_arguments(previous_args=None, time_factor=1.0):
        """Build argument tuple, to be passed to low level function.

        Without *previous_args* a default iteration count is returned. Otherwise
        the previous count is scaled by *time_factor*; if that cannot be
        converted to an int (OverflowError), the previous count is multiplied
        by ten instead.
        """
        if previous_args is None:
            return 10000000,
        else:
            try:
                return int(previous_args[0] * time_factor),
            except OverflowError:
                return previous_args[0]*10,

    @staticmethod
    def get_iterations(args) -> int:
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def build_ir(self, iaca_marker=False):
        """Return the LLVM IR as a string; must be implemented by subclasses.

        Accepts *iaca_marker* because :meth:`get_llvm_module` always passes it.
        """
        raise NotImplementedError()

    def get_llvm_module(self, iaca_marker=False):
        """Build and return LLVM module from LLVM IR code."""
        ir = self.build_ir(iaca_marker=iaca_marker)
        return llvm.parse_assembly(ir)

    def get_target_machine(self):
        """Instantiate and return target machine."""
        features = llvm.get_host_cpu_features().flatten()
        cpu = ''  # llvm.get_host_cpu_name()  # Work around until ryzen problems are fixed
        return llvm.Target.from_default_triple().create_target_machine(
            cpu=cpu, features=features, opt=3)

    def get_assembly(self, iaca_marker=False):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
        asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
        # Remove double comments
        asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
        return asm

    def get_function_ctype(self):
        """Return the ctypes prototype used to call the compiled ``test`` function."""
        return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)

    def get_iaca_analysis(self, arch):
        """Compile and return IACA analysis.

        :raises ValueError: if kerncraft could not be imported.
        """
        if iaca is None:
            raise ValueError("kerncraft not installed. IACA analysis is not supported.")
        tm = self.get_target_machine()
        # The context manager closes (and thereby deletes) the temporary object
        # file once the analysis is done, instead of relying on garbage collection.
        with tempfile.NamedTemporaryFile("wb") as tmpf:
            tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
            tmpf.flush()
            return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        """Compile the benchmark, run it *repeat* times and return the measurements.

        During calibration the arguments are rescaled until one call takes
        between *min_elapsed* and *max_elapsed* seconds; once a call falls into
        that window the arguments stay fixed for all remaining runs.

        :return: dict with keys ``iterations``, ``arguments``, ``runtimes``,
            ``frequency`` and ``returned``.
        :raises RuntimeError: if more than ten attempts are needed for a single
            repetition, or if the compiled function returns an unexpected value.
        """
        # Compile the module to machine code using MCJIT
        tm = self.get_target_machine()
        runtimes = []
        return_values = []
        args = self.prepare_arguments()
        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
            ee.finalize_object()

            # Obtain a pointer to the compiled 'test' function - it's the address
            # of its JITed code in memory.
            cfptr = ee.get_function_address('test')

            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self.get_function_ctype()(cfptr)

            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            fixed_args = False
            for i in range(repeat):
                tries = 0
                while True:
                    if tries > 10:
                        raise RuntimeError("Unable to measure non-zero runtime.")
                    tries += 1
                    start = time.perf_counter()
                    ret = cfunc(*args)
                    end = time.perf_counter()
                    elapsed = end - start
                    if ret != args[0]-1:
                        raise RuntimeError(
                            "Return value {} is invalid, should have been {}.".format(
                                ret, args[0]-1))
                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
                        # Aim for a runtime one third into the allowed window.
                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
                        factor = target_elapsed / elapsed
                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
                        continue
                    else:
                        # After we have the right argument choice, we keep it.
                        fixed_args = True
                        break
                return_values.append(ret)
                runtimes.append(elapsed)

        return {'iterations': self.get_iterations(args),
                'arguments': args,
                'runtimes': runtimes,
                'frequency': self.frequency,
                'returned': return_values}
# Benchmark built around a `root_synth` object, which supplies the source and
# destination registers and the default init values used below.
class LoopBenchmark(Benchmark):
def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True, **kwargs):
# Remaining keyword arguments are forwarded to Benchmark.__init__.
super().__init__(**kwargs)
self.root_synth = root_synth
# Fall back to the defaults provided by root_synth when no (or empty) init values are given.
self.init_values = init_values or root_synth.get_default_init_values()
self.loop_carried_dependencies = loop_carried_dependencies
# One init value is required per source register.
if len(root_synth.get_source_registers()) != len(self.init_values):
raise ValueError("Number of init values and source registers do not match.")
def get_source_names(self):
# IR value names '%in.0', '%in.1', ... - one per source register.
return ['%in.{}'.format(i) for i in range(len(self.root_synth.get_source_registers()))]
def get_destination_names(self):
# IR value names '%out.0', '%out.1', ... - one per destination register.
return ['%out.{}'.format(i) for i in
range(len(self.root_synth.get_destination_registers()))]
def get_phi_code(self):
# Returns the phi instructions (as a string) that feed destination values of one
# loop iteration into the source values of the next; returns '' when loop carried
# dependencies are disabled.
if not self.loop_carried_dependencies:
return ''
# Compile loop carried dependencies
lcd = []
# Change in naming (src <-> dst) is on purpose!
srcs = self.root_synth.get_destination_registers()
dsts = self.root_synth.get_source_registers()
# cycle iterator is used to not only reuse a single destination, but go through all of them
srcs_it = itertools.cycle(enumerate(srcs))
matched = False
last_match_idx = len(srcs) - 1
for dst_idx, dst in enumerate(dsts):
# NOTE: srcs_it is shared between iterations of the outer loop, so the search
# resumes after the most recently consumed entry instead of restarting at srcs[0].
for src_idx, src in srcs_it:
# Registers are paired when their llvm_type is equal.
if src.llvm_type == dst.llvm_type:
lcd.append([dst,
self.get_source_names()[dst_idx],
self.init_values[dst_idx],
src,
self.get_destination_names()[src_idx]])
matched = True
last_match_idx = src_idx
break
# since srcs_it is an infinite iterator, we need to abort after a complete cycle
if src_idx == last_match_idx:
break
if not matched:
raise ValueError("Unable to match source to any destination.")
code = ''
# One phi per matched pair: init value when coming from "entry", the paired
# destination value when coming from "loop".
for dst_reg, dst_name, init_value, src_reg, src_name in lcd:
assert dst_reg.llvm_type == src_reg.llvm_type, \
"Source and destination types do not match"
code += ('{dst_name} = phi {llvm_type} [{init_value}, %"entry"], '
'[{src_name}, %"loop"]\n').format(
llvm_type=dst_reg.llvm_type,
dst_name=dst_name,
init_value=init_value,
src_name=src_name)
# Add extra phi for constant values. Assuming LLVM will optimize them "away"
for dst_idx, dst in enumerate(dsts):
# Source registers without a matched destination receive the init value on both edges.
if dst not in [d for d, dn, i, s, sn in lcd]:
code += ('{dst_reg} = phi {llvm_type} [{init_value}, %"entry"], '
'[{init_value}, %"loop"]\n').format(
llvm_type=dst.llvm_type,
dst_reg=self.get_source_names()[dst_idx],
init_value=self.init_values[dst_idx])
return code
def build_ir(self):
# Abstract here; IntegerLoopBenchmark provides an implementation.
raise NotImplementedError()
class IntegerLoopBenchmark(LoopBenchmark):
    """Loop benchmark that drives the loop with a 64-bit integer counter."""

    def build_ir(self, iaca_marker=False):
        """Return the LLVM IR for function ``test`` as a string.

        :param iaca_marker: if true, emit the inline-assembly start marker in
            front of the loop body and the stop marker in the ``end`` block.
        """
        start_marker = stop_marker = ''
        if iaca_marker:
            # Both markers end with the same byte sequence and differ only in
            # the constant moved into ebx.
            marker_bytes = 'call void asm ".byte 100,103,144", ""()'
            start_marker = 'call void asm "movl $$111,%ebx", ""()\n' + marker_bytes
            stop_marker = 'call void asm "movl $$222,%ebx", ""()\n' + marker_bytes

        # The body is generated before the phi code, as in the keyword
        # evaluation order of a single format() call.
        body_code = textwrap.indent(
            self.root_synth.build_ir(self.get_destination_names(), self.get_source_names()),
            ' ')
        phi_code = textwrap.indent(self.get_phi_code(), ' ')

        template = textwrap.dedent('''\
            define i64 @"test"(i64 %"N")
            {{
            entry:
            %"loop_cond" = icmp slt i64 0, %"N"
            br i1 %"loop_cond", label %"loop", label %"end"
            loop:
            %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {phi}
            {iaca_start_marker}
            {loop_body}
            %"loop_counter.1" = add i64 %"loop_counter", 1
            %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
            br i1 %"loop_cond.1", label %"loop", label %"end"
            end:
            %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
            {iaca_stop_marker}
            ret i64 %"ret"
            }}
            ''')
        return template.format(
            loop_body=body_code,
            phi=phi_code,
            iaca_start_marker=start_marker,
            iaca_stop_marker=stop_marker)
def _print_benchmark_details(b, verbosity):
    """Print LLVM IR, assembly and IACA analysis of benchmark *b* as *verbosity* allows."""
    if verbosity >= 3:
        print('### LLVM IR')
        print(b.build_ir())
    if verbosity >= 2:
        print('### Assembly')
        print(b.get_assembly())
    if verbosity >= 3:
        print('### IACA Analysis')
        try:
            print(b.get_iaca_analysis('SKL')['output'])
        except ValueError as e:
            print("Unable to perform IACA analysis (skipping): ", e)
        except FileNotFoundError as e:
            print("IACA binary not found by kerncraft. Run iaca_get to install.", e)


def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
                       serialize=False, verbosity=0, iaca_comparison=None,
                       repeat=4, min_elapsed=0.1, max_elapsed=0.2, frequency=None):
    """Measure latency and throughput of *instructions*.

    :param instructions: list of instruction objects to benchmark
    :param serial_factor: length of the serial chain in the latency benchmark
    :param parallel_factor: number of parallel chains in the throughput benchmark
    :param throughput_serial_factor: length of each chain in the throughput
        benchmark; forced to 1 if the instructions cannot be serialized
    :param serialize: if true, all instructions form one combined serial chain
        instead of one chain per instruction
    :param verbosity: 0 is quiet, higher values print results, assembly and IR
    :param iaca_comparison: architecture name passed to the IACA analysis, or None
    :param repeat, min_elapsed, max_elapsed: passed to ``build_and_execute``
    :param frequency: passed to the benchmark constructor

    :return: tuple ``(latency, throughput)`` in cycles. The latency is NaN if it
        could not be measured because the instructions are not serializable.
    """
    not_serializable = False
    # Defined up front so the final return works even if the latency
    # measurement below is aborted by NotSerializableError.
    lat = float('nan')
    try:
        # Latency Benchmark
        if verbosity > 0:
            print('## Latency Benchmark')
        p_instrs = []
        if not serialize:
            for i in instructions:
                p_instrs.append(op.Serialized([i] * serial_factor))
        else:
            p_instrs = [op.Serialized(instructions * serial_factor)]
        p = op.Parallelized(p_instrs)
        b = IntegerLoopBenchmark(p, frequency=frequency)
        _print_benchmark_details(b, verbosity)
        result = b.build_and_execute(
            repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
        # min() gets the list itself (not unpacked arguments), so a single
        # measurement (repeat=1) works as well.
        lat = min([(t / serial_factor) * result['frequency'] / result['iterations']
                   for t in result['runtimes']])
        result['latency'] = lat
        if verbosity > 0:
            print('### Detailed Results')
            pprint(result)
            print()
    except op.NotSerializableError as e:
        print("Latency measurement not possible:", e)
        not_serializable = True

    if not_serializable:
        throughput_serial_factor = 1
        print("WARNING: throughput_serial_factor has been set to 1.")

    # Throughput Benchmark
    if verbosity > 0:
        print('## Throughput Benchmark')
    p_instrs = []
    if not serialize:
        for i in instructions:
            p_instrs.append(op.Serialized([i] * throughput_serial_factor))
    else:
        p_instrs = [op.Serialized(instructions * throughput_serial_factor)]
    p = op.Parallelized(p_instrs * parallel_factor, interleave=True)
    b = IntegerLoopBenchmark(p, frequency=frequency)
    _print_benchmark_details(b, verbosity)
    result = b.build_and_execute(
        repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
    tp = min(
        [(t / throughput_serial_factor / parallel_factor) * result['frequency']
         / result['iterations']
         for t in result['runtimes']])
    result['throughput'] = tp
    if iaca_comparison is not None:
        iaca_analysis = b.get_iaca_analysis(iaca_comparison)
        result['iaca throughput'] = iaca_analysis['throughput']/(
            parallel_factor * throughput_serial_factor)
    if verbosity > 0:
        print('### Detailed Results')
        pprint(result)
        print()
    if verbosity > 1 and iaca_comparison is not None:
        print('### IACA Results')
        print(iaca_analysis['output'])
        print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
            throughput_serial_factor, parallel_factor))

    # Result compilation
    return lat, tp
if __name__ == '__main__':
    # Demo: assemble a small tree of add/sub instructions and print the
    # generated LLVM IR and assembly.
    setup_llvm()

    def _i64_instruction(asm, second_source):
        """Build an i64 instruction with register destination and sources ('0', second_source)."""
        return op.Instruction(
            instruction=asm,
            destination_operand=op.Register('i64', 'r'),
            source_operands=[op.Register('i64', '0'), second_source])

    i1 = _i64_instruction('add $2, $0', op.Immediate('i64', '1'))
    i2 = _i64_instruction('sub $2, $0', op.Immediate('i64', '1'))
    s = op.Serialized([i1, i2])
    i3 = _i64_instruction('add $2, $0', op.Register('i64', 'r'))
    i4 = _i64_instruction('sub $2, $0', op.Immediate('i64', '23'))
    i5 = _i64_instruction('add $2, $0', op.Immediate('i64', '23'))
    i6 = _i64_instruction('add $2, $0', op.Register('i64', 'r'))

    s1 = op.Serialized([i1, i2])
    s2 = op.Serialized([s1, i3])
    s3 = op.Serialized([i4, i5])
    p1 = op.Parallelized([i6, s2, s3])

    # Every source register starts with the value '1'.
    init_values = ['1'] * len(p1.get_source_registers())
    b = IntegerLoopBenchmark(p1, init_values)
    print(b.build_ir())
    print(b.get_assembly())

View File

@@ -1,897 +0,0 @@
#!/usr/bin/env python3
import ctypes
import sys
import time
import textwrap
import itertools
import random
import collections
import pprint
import math
import argparse
import llvmlite.binding as llvm
import psutil
# TODOs
# * API to create test scenarios
# * DSL?
# * Test cases:
# * Instructions:
# * [x] arithmetics \w reg and/or imm.
# * scalar
# * packed
# * [x] lea
# * [x] LOAD / mov \w mem
# * [TODO] STORE / mov to mem
# * [x] Single Latency
# * [x] Single Throughput
# * [TODO] Combined Throughput
# * [TODO] Random Throughput
# * [TODO] Automated TP, Lat, #pipeline analysis
# * [TODO] IACA marked binary output generation
# * [TODO] Fuzzing algorithm
# * [TODO] CLI
# * C based timing routine? As an extension?
# * make sanity checks during runtime, check for fixed frequency and pinning
def floor_harmonic_fraction(n, error=0.1):
    """
    Finds closest floored integer or inverse integer and returns error.

    :param n: positive number to approximate
    :param error: currently unused; kept for interface compatibility

    :return: (numerator, denominator, relative error) where either numerator or
        denominator is exactly one.
    :raises ValueError: if *n* is not positive (the search for an inverse
        integer would never terminate).
    """
    if not n > 0:
        raise ValueError("n must be positive, got {!r}.".format(n))
    floor_n = math.floor(n)
    if floor_n > 0:
        return floor_n, 1, 1 - floor_n / n
    else:
        # 0 < n < 1: find the largest 1/i that does not exceed n.
        i = 2
        while (1 / i) > n:
            i += 1
        return 1, i, 1 - (1 / i) / n
class Benchmark:
    """Base class of the old JIT benchmarks.

    An instance holds a loop body (``_loop_body``) that :meth:`get_ir` embeds
    into a counting loop. The default body is a chain of integer additions.
    """

    def __init__(self, parallel=1, serial=5, frequency=None):
        """
        :param parallel: parallelism factor, used when normalizing runtimes
        :param serial: serialization factor, used when normalizing runtimes
        :param frequency: clock frequency stored on the instance; if falsy,
            ``psutil.cpu_freq().current * 1e6`` is used
        """
        self._function_ctype = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
        self.parallel = parallel
        self.serial = serial
        self.frequency = frequency or psutil.cpu_freq().current * 1e6

        # Do interesting work
        self._loop_body = textwrap.dedent('''\
            %"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]
            %"checksum.1" = call i64 asm sideeffect "
            add $1, $0",
            "=r,i,r" (i64 1, i64 %"checksum")''')

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all non-underscore attributes."""
        return '{}({})'.format(
            self.__class__.__name__,
            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
                       if not k.startswith('_')]))

    def get_ir(self):
        """Return the LLVM IR of function ``test``: a counting loop around ``_loop_body``."""
        # FP add loop - may have issues
        # return textwrap.dedent('''\
        # define i64 @"test"(i64 %"N")
        # {{
        # entry:
        # %"N.fp" = sitofp i64 %"N" to double
        # %"loop_cond" = fcmp olt double 0.0, %"N.fp"
        # br i1 %"loop_cond", label %"loop", label %"end"
        #
        # loop:
        # %"loop_counter" = phi double [0.0, %"entry"], [%"loop_counter.1", %"loop"]
        # {loop_body}
        # %"loop_counter.1" = fadd double %"loop_counter", 1.0
        # %"loop_cond.1" = fcmp olt double %"loop_counter.1", %"N.fp"
        # br i1 %"loop_cond.1", label %"loop", label %"end"
        #
        # end:
        # %"ret.fp" = phi double [0.0, %"entry"], [%"loop_counter", %"loop"]
        # %"ret" = fptosi double %"ret.fp" to i64
        # ret i64 %"ret"
        # }}
        # ''').format(
        # loop_body=textwrap.indent(self._loop_body, ' '))
        return textwrap.dedent('''\
            define i64 @"test"(i64 %"N")
            {{
            entry:
            %"loop_cond" = icmp slt i64 0, %"N"
            br i1 %"loop_cond", label %"loop", label %"end"
            loop:
            %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {loop_body}
            %"loop_counter.1" = add i64 %"loop_counter", 1
            %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
            br i1 %"loop_cond.1", label %"loop", label %"end"
            end:
            %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
            ret i64 %"ret"
            }}
            ''').format(
            loop_body=textwrap.indent(self._loop_body, ' '))

    def prepare_arguments(self, previous_args=None, time_factor=1.0):
        """Build argument tuple, to be passed to low level function."""
        if previous_args is None:
            return 100,
        else:
            return int(previous_args[0] * time_factor),

    def get_iterations(self, args):
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def get_llvm_module(self):
        """Build and return LLVM module from LLVM IR code (cached after the first call)."""
        if not hasattr(self, '_llvm_module'):
            self._llvm_module = llvm.parse_assembly(self.get_ir())
            self._llvm_module.verify()
        return self._llvm_module

    def get_target_machine(self):
        """Instantiate and return target machine (cached after the first call)."""
        # The cache check must look at '_tm' itself; checking '_llvm_module'
        # left '_tm' undefined whenever the module had been built first.
        if not hasattr(self, '_tm'):
            features = llvm.get_host_cpu_features().flatten()
            cpu = llvm.get_host_cpu_name()
            self._tm = llvm.Target.from_default_triple().create_target_machine(
                cpu=cpu, features=features, opt=1)
        return self._tm

    def get_assembly(self):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
        return tm.emit_assembly(self.get_llvm_module())

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        """Compile the benchmark, run it *repeat* times and return the measurements.

        During calibration the arguments are rescaled until one call takes
        between *min_elapsed* and *max_elapsed* seconds; afterwards they stay
        fixed.

        :return: dict with keys ``iterations``, ``arguments``, ``runtimes`` and
            ``frequency``.
        """
        # Compile the module to machine code using MCJIT
        tm = self.get_target_machine()
        runtimes = []
        args = self.prepare_arguments()
        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
            ee.finalize_object()

            # Obtain a pointer to the compiled 'test' function - it's the address
            # of its JITed code in memory.
            cfptr = ee.get_function_address('test')

            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self._function_ctype(cfptr)

            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            fixed_args = False
            for i in range(repeat):
                while True:
                    start = time.perf_counter()
                    res = cfunc(*args)
                    end = time.perf_counter()
                    elapsed = end - start
                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
                        # Aim for a runtime one third into the allowed window.
                        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
                        factor = target_elapsed / elapsed
                        args = self.prepare_arguments(previous_args=args, time_factor=factor)
                        continue
                    else:
                        # After we have the right argument choice, we keep it.
                        fixed_args = True
                        break
                runtimes.append(elapsed)

        return {'iterations': self.get_iterations(args),
                'arguments': args,
                'runtimes': runtimes,
                'frequency': self.frequency}

    @classmethod
    def get_latency(cls, max_serial=6, print_table=False, **kwargs):
        """Run the benchmark for serial factors 1..max_serial-1 with parallel=1.

        :return: ``floor_harmonic_fraction`` result of the run with the fewest
            cycles per instruction.
        """
        if print_table:
            print(' s |' + ''.join([' {:^5}'.format(i) for i in range(1, max_serial)]))
            print(' | ', end='')
        serial_runs = []
        for s in range(1, max_serial):
            m = cls(serial=s, parallel=1, **kwargs)
            r = m.build_and_execute(repeat=1)
            cy_per_it = min(r['runtimes']) * r['frequency'] / (
                r['iterations'] * m.parallel * m.serial)
            if print_table:
                print('{:.3f} '.format(cy_per_it), end='')
                sys.stdout.flush()

            serial_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))

        # Select by cycle count only; plain tuple comparison would fall through
        # to comparing the benchmark objects on ties and fail.
        best_run = min(serial_runs, key=lambda run: run[0])
        if print_table:
            print()
            print('LAT: {lat[0]}/{lat[1]}cy (min. error {lat[2]:.1%})'.format(
                lat=best_run[1]))

        return best_run[1]

    @classmethod
    def get_throughput(cls, max_serial=6, max_parallel=17, print_table=False, **kwargs):
        """Run the benchmark for all combinations of serial and parallel factors.

        Serial factors range over 1..max_serial-1, parallel factors over
        2..max_parallel-1.

        :return: ``floor_harmonic_fraction`` result of the run with the fewest
            cycles per instruction.
        """
        if print_table:
            print('s\\p |' + ''.join([' {:^5}'.format(i) for i in range(2, max_parallel)]))
        parallel_runs = []
        for s in range(1, max_serial):
            if print_table:
                print('{:>3} | '.format(s), end='')
            for p in range(2, max_parallel):
                m = cls(serial=s, parallel=p, **kwargs)
                r = m.build_and_execute(repeat=1)
                cy_per_it = min(r['runtimes']) * r['frequency'] / (
                    r['iterations'] * m.parallel * m.serial)
                if print_table:
                    print('{:.3f} '.format(cy_per_it), end='')
                    sys.stdout.flush()
                parallel_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
            if print_table:
                print()

        # See get_latency: compare by cycle count only.
        best_run = min(parallel_runs, key=lambda run: run[0])
        if print_table:
            print('TP: {tp[0]}/{tp[1]}cy (min. error {tp[2]:.1%});'.format(
                tp=best_run[1]))

        return best_run[1]
class InstructionBenchmark(Benchmark):
    """Benchmark of a single instruction with register and immediate operands."""

    def __init__(self, instruction='addq $1, $0',
                 dst_operands=(),
                 dstsrc_operands=(('r', 'i64', '0'),),
                 src_operands=(('i', 'i64', '1'),),
                 parallel=10,
                 serial=4,
                 **kwargs):
        """
        Build LLVM IR for arithmetic instruction benchmark without memory references.

        Currently only one destination (dst) or combined destination and source (dstsrc) operand
        is allowed. The instruction's operands ($N) refer to the order of operands found in
        dst + dstsrc + src.

        Each operand is a tuple of (constraint code, llvm type string, initial value).

        :raises NotImplementedError: for unsupported operand combinations
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial, **kwargs)
        self.instruction = instruction
        self.dst_operands = dst_operands
        self.dstsrc_operands = dstsrc_operands
        self.src_operands = src_operands
        self._loop_body = ''
        # NotImplementedError is the exception class; the NotImplemented
        # constant is not callable and would raise a TypeError here.
        if len(dst_operands) + len(dstsrc_operands) != 1:
            raise NotImplementedError("Must have exactly one dst or dstsrc operand.")
        if not all([op[0] in 'irx'
                    for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)]):
            raise NotImplementedError(
                "This class only supports register and immediate operands.")

        # Part 1: PHI functions and initializations
        for i, dstsrc_op in enumerate(dstsrc_operands):
            # constraint code, llvm type string, initial value
            if dstsrc_op[0] in 'rx':
                # register operand
                for p in range(self.parallel):
                    self._loop_body += (
                        '%"dstsrc{index}_{p}" = phi {type} '
                        '[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n').format(
                        index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p)
            else:
                raise NotImplementedError(
                    "Operand type in {!r} is not yet supported.".format(dstsrc_op))

        # Part 2: Inline ASM call
        # Build constraint string from operands
        constraints = ','.join(
            ['=' + dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] +
            [sop[0] for sop in itertools.chain(src_operands)] +
            ['{}'.format(i + len(dst_operands)) for i in range(len(dstsrc_operands))])

        for i, dstsrc_op in enumerate(dstsrc_operands):
            # Build instruction from instruction and operands
            # TODO support multiple dstsrc operands
            # TODO support dst and dstsrc operands at the same time
            for p in range(self.parallel):
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2])
                            for sop in src_operands]
                for j, dop in enumerate(dstsrc_operands):
                    operands.append('{type} %dstsrc{index}_{p}'.format(
                        type=dop[1], index=j, p=p))
                args = ', '.join(operands)

                # The instruction is repeated `serial` times inside one asm call.
                self._loop_body += (
                    '%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n').format(
                    index=i,
                    dst_type=dstsrc_op[1],
                    instruction='\n'.join([instruction] * self.serial),
                    constraints=constraints,
                    args=args,
                    p=p)

        for i, dst_op in enumerate(dst_operands):
            # Build instruction from instruction and operands
            # TODO support multiple dst operands
            # TODO support dst and dstsrc operands at the same time
            if self.serial != 1:
                raise NotImplementedError("Serial > 1 and dst operand is not supported.")
            for p in range(self.parallel):
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2])
                            for sop in src_operands]
                args = ', '.join(operands)

                self._loop_body += (
                    '%"dst{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n').format(
                    index=i,
                    dst_type=dst_op[1],
                    instruction=instruction,
                    constraints=constraints,
                    args=args,
                    p=p)
class AddressGenerationBenchmark(Benchmark):
    """Benchmark of ``lea`` with different address generation modes."""

    def __init__(self,
                 offset=('i', 'i64', '0x42'),
                 base=('r', 'i64', '0'),
                 index=('r', 'i64', '0'),
                 width=('i', None, '4'),
                 destination='base',
                 parallel=10,
                 serial=4,
                 **kwargs):
        """
        Benchmark for address generation modes.

        Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r' (register)
        or 'i' (immediate) and initial_value a string.
        E.g., ('r', 'i64', '0') or ('i', None, '4')

        +--------------------------------+-----------------------------+
        | Mode                           | AT&T                        |
        +--------------------------------+-----------------------------+
        | Offset                         | leal 0x0100, %eax           | <- no latency support
        | Base                           | leal (%esi), %eax           |
        | Offset + Base                  | leal -8(%ebp), %eax         |
        | Offset + Index*Width           | leal 0x100(,%ebx,4), %eax   |
        | Offset + Base + Index*Width    | leal 0x8(%edx,%ebx,4), %eax |
        +--------------------------------+-----------------------------+

        OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width
        offset: immediate integer (+/-)
        base: register
        index: register
        width: immediate 1,2,4 or 8

        :param destination: 'base' or 'index', the register that receives the
            result and carries the dependency into the next instruction
        :raises ValueError: for invalid or unsupported operand combinations
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial, **kwargs)
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width
        self.destination = destination
        self.parallel = parallel
        # Sanity checks:
        if bool(index) ^ bool(width):
            raise ValueError("Index and width both need to be set, or be None.")
        elif index and width:
            if width[0] != 'i' or int(width[2]) not in [1, 2, 4, 8]:
                raise ValueError("Width may only be immediate 1,2,4 or 8.")
            if index[0] != 'r':
                raise ValueError("Index must be a register.")

        if offset and offset[0] != 'i':
            raise ValueError("Offset must be an immediate.")
        if base and base[0] != 'r':
            raise ValueError("Base must be a register.")

        if not index and not width and not offset and not base:
            raise ValueError("Must provide at least an offset or base.")

        if destination == 'base' and not base:
            raise ValueError("Destination may only be set to 'base' if base is set.")
        elif destination == 'index' and not index:
            raise ValueError("Destination may only be set to 'index' if index is set.")
        elif destination not in ['base', 'index']:
            raise ValueError("Destination must be set to 'base' or 'index'.")

        if not base and not index:
            raise ValueError("Either base or index must be set for latency test to work.")

        if serial != 1 and not (base or index):
            raise ValueError("Serial > 1 only works with index and/or base in use.")

        self._loop_body = ''

        # Assemble the operand string of the lea instruction.
        ops = ''
        if offset:
            ops += offset[2]
        if base:
            ops += '($0'
            if width and index:
                ops += ',$1,{}'.format(width[2])
            ops += ')'
            if destination == 'base':
                ops += ', $0'
            else:  # destination == 'index'
                ops += ', $1'
        else:
            if width and index:
                ops += '(,$0,{}), $0'.format(width[2])
        ops += ' '

        if destination == 'base':
            destination_reg = base
        else:  # destination == 'index'
            destination_reg = index

        # Part 1: PHI function for destination
        for p in range(parallel):
            self._loop_body += (
                '%"{name}_{p}.0" = '
                'phi {type} [{initial}, %"entry"], [%"{name}_{p}.{s}", %"loop"]\n').format(
                name=destination, type=destination_reg[1], initial=destination_reg[2], p=p,
                s=self.serial)

        # Part 2: one asm call per step of each chain; step s consumes value .s
        # and produces value .(s+1).
        for p in range(parallel):
            for s in range(self.serial):
                constraints = '=r,r'
                if base and index:
                    constraints += ',r'
                    if destination == 'base':
                        args = ('{base_type} %"{base_name}_{p}.{s_in}", '
                                '{index_type} {index_value}').format(
                            base_type=base[1], base_name=destination,
                            index_type=index[1], index_value=index[2], p=p, s_in=s)
                    else:  # destination == 'index':
                        args = ('{base_type} {base_value}, '
                                '{index_type} %"{index_name}_{p}.{s_in}"').format(
                            base_type=base[1], base_value=base[2],
                            index_type=index[1], index_name=destination, p=p, s_in=s)
                else:
                    args = '{type} %"{name}_{p}.{s_in}"'.format(
                        type=destination_reg[1], name=destination, p=p, s_in=s)

                self._loop_body += (
                    '%"{name}_{p}.{s_out}" = call {type} asm sideeffect'
                    ' "lea {ops}", "{constraints}" ({args})\n').format(
                    name=destination,
                    type=destination_reg[1],
                    ops=ops,
                    constraints=constraints,
                    args=args,
                    p=p,
                    s_out=s + 1)
class LoadBenchmark(Benchmark):
    """Load benchmark that chases pointers through a ring stored in a ctypes array."""

    def __init__(self, chain_length=2048, structure='linear', parallel=6, serial=4, **kwargs):
        """
        Benchmark for L1 load using pointer chasing.

        *chain_length* is the number of pointers to place in memory.
        *structure* may be 'linear' (1-offsets) or 'random'.
        *parallel* is the number of pointer chains followed in the generated inner loop.
        *serial* is the number of dependent loads emitted per chain and inner-loop
        iteration; *chain_length* must be divisible by it.

        Raises ValueError if *chain_length* is not divisible by *serial* or if
        *structure* is neither 'linear' nor 'random'.
        """
        # The base class is always told serial=1; the real serial factor is kept
        # privately in self._serial and only used by get_ir().
        Benchmark.__init__(self, parallel=parallel, serial=1, **kwargs)
        self._serial = serial
        self._loop_body = ''
        element_type = ctypes.POINTER(ctypes.c_int)
        # Signature of the compiled function: int test(int** ptrf, int repeats)
        self._function_ctype = ctypes.CFUNCTYPE(
            ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int)
        self.chain_length = chain_length
        self.parallel = parallel
        self.structure = structure
        self._pointer_field = (element_type * chain_length)()
        if chain_length % serial != 0:
            raise ValueError(
                "chain_length ({}) needs to be divisible by serial factor ({}).".format(
                    chain_length, serial))
        # Initialize pointer field
        # Field must represent a ring of pointers
        if structure == 'linear':
            # Element i points to element i+1; the last one wraps around to element 0.
            for i in range(chain_length):
                self._pointer_field[i] = ctypes.cast(
                    ctypes.pointer(self._pointer_field[(i + 1) % chain_length]), element_type)
        elif structure == 'random':
            # Same ring, but the visiting order is a random permutation of the indices.
            shuffled_indices = list(range(chain_length))
            random.shuffle(shuffled_indices)
            for i in range(chain_length):
                self._pointer_field[shuffled_indices[i]] = ctypes.cast(
                    ctypes.pointer(self._pointer_field[shuffled_indices[(i + 1) % chain_length]]),
                    element_type)
        else:
            raise ValueError("Given structure is not supported. Supported are: "
                             "linear and random.")

    def prepare_arguments(self, previous_args=None, time_factor=1.0):
        """
        Build argument tuple, to be passed to low level function.

        Without *previous_args* the tuple is (pointer field, 100 repeats); otherwise the
        previous repeat count is scaled by *time_factor* and truncated to an int.
        """
        if previous_args is None:
            return self._pointer_field, 100
        else:
            return previous_args[0], int(previous_args[1] * time_factor)

    def get_iterations(self, args):
        """Return number of iterations performed, based on lower level function arguments."""
        # args[1] is the repeat count produced by prepare_arguments().
        return self.chain_length * args[1]

    def get_ir(self):
        """
        Return LLVM IR equivalent of (in case of parallel == 1 and serial == 1):

            int test(int** ptrf, int repeat) {
                int** p0 = (int**)ptrf[0];
                int i = 0;
                while(i < repeat) {
                    int** p = (int**)*p0;
                    while(p != p0) {
                        p = (int**)*p;
                    }
                    i++;
                }
                return i;
            }
        """
        ret = textwrap.dedent('''
define i32 @test(i32** %"ptrf_0", i32 %"repeats") {
entry:
''')
        # Load pointer to ptrf[p] and p0
        for p in range(self.parallel):
            if p > 0:
                # Chain p starts at element p of the pointer field.
                ret += ' %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(p=p)
            ret += (
                ' %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n'
                ' %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p)
        ret += textwrap.dedent('''
%"cmp.entry" = icmp sgt i32 %"repeats", 0
br i1 %"cmp.entry", label %"loop0", label %"end"
loop0:
br label %"loop1"
loop1:
%"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ]
br label %"loop2"
loop2:\n''')
        # One phi per chain: start pointer on entry from loop1, last loaded pointer
        # (index self._serial) when looping back from loop2.
        for p in range(self.parallel):
            ret += (' %"p_{p}.0" = phi i32** '
                    '[ %"p0_{p}", %"loop1" ], [ %"p_{p}.{s_max}", %"loop2" ]\n').format(
                p=p, s_max=self._serial)
        # load p, compare to p0 and or-combine results
        for p in range(self.parallel):
            # self._serial dependent loads per chain: p_{p}.{s+1} is loaded through p_{p}.{s}
            for s in range(self._serial):
                ret += (' %"pp_{p}.{s}" = bitcast i32** %"p_{p}.{s_prev}" to i32***\n'
                        ' %"p_{p}.{s}" = load i32**, i32*** %"pp_{p}.{s}", align 8\n').format(
                    p=p, s=s + 1, s_prev=s)
            # Compare is needed for all registers, for llvm not to remove unused
            # instructions:
            ret += ' %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.{s_max}", %"p0_{p}"\n'.format(
                p=p, s_max=self._serial)
        # TODO tree reduce cmp to make use of all cmp_* values
        # It is sufficient to use only one compare, all others will be eliminated
        ret += ' br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n'
        ret += textwrap.dedent('''
loop3:
%"i.1" = add i32 %"i", 1
%"cmp.loop3" = icmp eq i32 %"i.1", %"repeats"
br i1 %"cmp.loop3", label %"end", label %"loop1"
end:
%"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ]
ret i32 %"ret"
}''')
        return ret
if __name__ == '__main__':
    # Command line driver: builds a fixed set of example benchmarks and prints the
    # measured cycles per instruction for each of them.
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='count', default=0)
    # The frequency is only mandatory when psutil cannot report one.
    parser.add_argument('-f', '--frequency', type=float, required=psutil.cpu_freq() is None,
                        help='CPU frequency in GHz; required if psutil.cpu_freq() does not '
                             'report anything.')
    args = parser.parse_args()
    if args.frequency:
        # GHz -> Hz
        args.frequency *= 1e9

    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()
    llvm.initialize_native_asmparser()

    modules = collections.OrderedDict()

    # --- scalar add -----------------------------------------------------------------
    single_add = 'addq $1, $0'
    quad_add = 'addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0'
    immediate_source = ('i', 'i64', '1')
    register_source = ('r', 'i64', '1')
    # (key, instruction, source operand, parallel, serial)
    add_variants = [
        ('add i64 r64 LAT', single_add, immediate_source, 1, 5),
        ('add r64 r64 LAT', single_add, register_source, 1, 5),
        ('4xadd i64 r64 LAT', quad_add, immediate_source, 1, 5),
        ('add i64 r64 TP', single_add, immediate_source, 10, 5),
        ('add r64 r64 TP', single_add, register_source, 10, 5),
        ('4xadd i64 r64 TP', quad_add, immediate_source, 10, 1),
    ]
    for name, instruction, source, parallel, serial in add_variants:
        modules[name] = InstructionBenchmark(
            instruction=instruction,
            dst_operands=(),
            dstsrc_operands=(('r', 'i64', '0'),),
            src_operands=(source,),
            parallel=parallel,
            serial=serial,
            frequency=args.frequency)

    # --- lea / address generation ---------------------------------------------------
    # (name, offset, base, index, width, destination)
    lea_variants = [
        ('lea base', None, ('r', 'i64', '666'), None, None, 'base'),
        ('lea base+offset', ('i', None, '23'), ('r', 'i64', '666'), None, None, 'base'),
        ('lea index*width', None, None, ('r', 'i64', '1'), ('i', None, '4'), 'index'),
        ('lea offset+index*width',
         ('i', 'i64', '-0x8'), None, ('r', 'i64', '51'), ('i', None, '4'), 'index'),
        ('lea base+index*width',
         None, ('r', 'i64', '23'), ('r', 'i64', '12'), ('i', None, '4'), 'base'),
        ('lea base+offset+index*width',
         ('i', None, '42'), ('r', 'i64', '23'), ('r', 'i64', '12'), ('i', None, '4'), 'base'),
    ]
    # All latency variants are registered first, then all throughput variants.
    for suffix, parallel, serial in (('LAT', 1, 5), ('TP', 10, 1)):
        for name, offset, base, index, width, destination in lea_variants:
            modules['{} {}'.format(name, suffix)] = AddressGenerationBenchmark(
                offset=offset,
                base=base,
                index=index,
                width=width,
                destination=destination,
                parallel=parallel,
                serial=serial,
                frequency=args.frequency)

    # --- loads ----------------------------------------------------------------------
    for suffix, parallel, serial in (('LAT', 1, 8), ('TP', 16, 1)):
        for structure in ('linear', 'random'):
            modules['LD {} {}'.format(structure, suffix)] = LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=structure,
                parallel=parallel,
                serial=serial,
                frequency=args.frequency)

    # --- packed double arithmetic ---------------------------------------------------
    vector_dstsrc = ('x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4)))
    vector_src = ('x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4)))
    # (key, instruction, parallel, serial)
    vector_variants = [
        ('vaddpd x<4 x double> x<4 x double> x<4 x double> LAT', 'vaddpd $1, $0, $0', 1, 5),
        ('vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) LAT',
         'vmulpd $1, $0, $0', 1, 5),
        # This is actually a TP benchmark with parallel=1, because there are no inter-loop
        # dependencies:
        ('vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) TP',
         'vmulpd $1, $2, $0', 10, 1),
    ]
    for name, instruction, parallel, serial in vector_variants:
        modules[name] = InstructionBenchmark(
            instruction=instruction,
            dst_operands=(),
            dstsrc_operands=(vector_dstsrc,),
            src_operands=(vector_src,),
            parallel=parallel,
            serial=serial,
            frequency=args.frequency)

    # NOTE(review): only the load benchmarks are actually executed; everything else built
    # above is dropped here. This looks like a debugging leftover -- confirm before removing.
    modules = collections.OrderedDict([(k, v) for k, v in modules.items() if k.startswith('LD ')])

    for key, module in modules.items():
        if args.verbose > 0:
            print("=== Benchmark")
            print(repr(module))
            print("=== LLVM")
            print(module.get_ir())
            print("=== Assembly")
            print(module.get_assembly())
        r = module.build_and_execute(repeat=3)
        if args.verbose > 0:
            print("=== Result")
            pprint.pprint(r)

        # Best runtime converted to cycles, divided by the number of executed instructions.
        cy_per_it = min(r['runtimes']) * r['frequency'] / (
            r['iterations'] * module.parallel * module.serial)
        print('{key:<32} {cy_per_it:.3f} cy/It with {runtime_sum:.4f}s'.format(
            key=key,
            cy_per_it=cy_per_it,
            runtime_sum=sum(r['runtimes'])))
# InstructionBenchmark.get_latency(
# instruction='vmulpd $1, $0, $0',
# dst_operands=(),
# dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
# src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
# ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
# print_table=True)
# InstructionBenchmark.get_throughput(
# instruction='vmulpd $1, $0, $0',
# dst_operands=(),
# dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
# src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
# ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
# print_table=True)
#
# InstructionBenchmark.get_latency(
# instruction='nop',
# dst_operands=(),
# dstsrc_operands=(('r','i8', '0'),),
# src_operands=(),
# print_table=True)
# InstructionBenchmark.get_throughput(
# instruction='nop',
# dst_operands=(),
# dstsrc_operands=(('r','i8', '0'),),
# src_operands=(),
# print_table=True)

View File

@@ -1,514 +0,0 @@
#!/usr/bin/env python3
import re
from itertools import zip_longest
# TODO use abc to force implementation of interface requirements
# Default initial value (as LLVM IR constant text) for every supported LLVM type name.
# All integer widths start at 3.
init_value_by_llvm_type = dict.fromkeys(('i1', 'i8', 'i16', 'i32', 'i64'), '3')
# LLVM only accepts decimal floating point constants that are exactly representable,
# see http://llvm.org/docs/LangRef.html#simple-constants for details.
# 1 + 2**-10 has a short, exact binary representation.
init_value_by_llvm_type.update(
    dict.fromkeys(('float', 'double', 'fp128'), str(1 + 1 / 2 ** 10)))
# Vector types repeat the scalar value of their element type in every lane.
init_value_by_llvm_type.update({
    '<{} x {}>'.format(lanes, scalar_type):
        '<{}>'.format(', '.join(['{} {}'.format(scalar_type, scalar_value)] * lanes))
    for scalar_type, scalar_value in list(init_value_by_llvm_type.items())
    for lanes in (2, 4, 8, 16, 32, 64)})
class NotSerializableError(Exception):
    """Raised when consecutive synths cannot be chained through any matching register."""
class Operand:
    """Common base of all operand kinds; stores the operand's LLVM type name."""

    def __init__(self, llvm_type):
        self.llvm_type = llvm_type

    def get_constraint_char(self):
        """Return the inline-asm constraint for this operand (provided by subclasses)."""
        raise NotImplementedError()

    def __repr__(self):
        # Show every public attribute as keyword argument, in assignment order.
        public_attributes = [
            '{}={!r}'.format(name, value)
            for name, value in vars(self).items()
            if not name.startswith('_')]
        return '{}({})'.format(type(self).__name__, ', '.join(public_attributes))

    @staticmethod
    def from_string(s):
        """
        Parse *s* with the first operand class that accepts it.

        Register, Immediate and MemoryReference are tried in that order; a ValueError is
        raised if none of them can parse *s*.
        """
        for parse in (Register.from_string, Immediate.from_string, MemoryReference.from_string):
            try:
                return parse(s)
            except ValueError:
                pass
        raise ValueError("No matching operand type found for '{}'.".format(s))
class Immediate(Operand):
    """Immediate (constant) operand with an LLVM type and a literal value."""

    def __init__(self, llvm_type, value):
        Operand.__init__(self, llvm_type)
        self.value = value

    def get_constraint_char(self):
        """Immediates always use the 'i' inline-asm constraint."""
        return 'i'

    @classmethod
    def from_string(cls, s):
        """
        Create Immediate object from string.

        :param s: must have the form: "llvm_type:value", where value is a hexadecimal
                  integer (0x...) or a decimal number with optional sign, fraction and
                  exponent.
        :raises ValueError: if *s* has no ':' separator or the value is malformed.
        """
        llvm_type, value = s.split(':', 1)
        value_regex = r'(0x[0-9a-fA-F]+|-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?)'
        # The whole value has to be a number: re.match would only anchor at the start and
        # let trailing garbage such as "1abc" through. Sign and exponent are part of the
        # pattern so that every well-formed number is accepted under full matching.
        if not re.fullmatch(value_regex, value):
            raise ValueError("Invalid immediate value, must match {!r}".format(value_regex))
        return cls(llvm_type, value)
class MemoryReference(Operand):
    """
    Memory operand addressing offset + base + index*width.

    OFFSET(BASE, INDEX, WIDTH) in AT&T assembly

    Possible operand values:
        offset: immediate integer (+/-)
        base: register
        index: register
        width: immediate 1,2,4 or 8
    """

    def __init__(self, llvm_type, offset=None, base=None, index=None, width=None):
        """
        :raises ValueError: if index and width are not given together, if a component has
                            the wrong operand kind, or if no component is given at all.
        """
        super().__init__(llvm_type)
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width

        # Sanity checks:
        if bool(index) ^ bool(width):
            raise ValueError("Index and width both need to be set, or None.")
        elif index and width:
            if not (isinstance(width, Immediate) and int(width.value) in [1, 2, 4, 8]):
                raise ValueError("Width may only be immediate 1,2,4 or 8.")
            if not isinstance(index, Register):
                raise ValueError("Index must be a register.")

        if offset and not isinstance(offset, Immediate):
            raise ValueError("Offset must be an immediate.")
        if base and not isinstance(base, Register):
            # This check is about the base, so the message must name it.
            raise ValueError("Base must be a register.")

        if not index and not width and not offset and not base:
            raise ValueError("Must provide at least an offset or base.")

    def get_constraint_char(self):
        """Memory references always use the 'm' inline-asm constraint."""
        return 'm'

    def get_registers(self):
        """Yield the registers used for addressing: base first, then index (if set)."""
        if self.base:
            yield self.base
        if self.index:
            yield self.index

    @classmethod
    def from_string(cls, s):
        """
        Create MemoryReference from string.

        :param s: must fulfill the regex: "\\*llvm_type:[obiw]+", where the feature letters
                  select the components to use: o(ffset), b(ase), i(ndex) and w(idth).
                  Offset and width are fixed to the immediate 8, base and index to 64-bit
                  general purpose registers.
        :raises ValueError: if *s* does not have this form or the selected combination of
                            components is rejected by the constructor.
        """
        # fullmatch, so that trailing characters after the feature letters are rejected
        m = re.fullmatch(r"\*([^:]+):([obiw]+)", s)
        if not m:
            raise ValueError("Invalid format, must match '*llvm_type:[obiw]+'.")

        llvm_type, features = m.groups()
        offset = None
        if 'o' in features:
            offset = Immediate('i32', 8)
        base = None
        if 'b' in features:
            base = Register('i64', 'r')
        index = None
        if 'i' in features:
            index = Register('i64', 'r')
        width = None
        if 'w' in features:
            width = Immediate('i32', 8)
        return cls(llvm_type, offset=offset, base=base, index=index, width=width)
class Register(Operand):
    """Register operand: an LLVM type plus the inline-asm constraint selecting the register."""

    def __init__(self, llvm_type, constraint_char='r'):
        super().__init__(llvm_type)
        self.constraint_char = constraint_char

    def get_constraint_char(self):
        """Return the constraint given at construction."""
        return self.constraint_char

    @classmethod
    def from_string(cls, s):
        """
        Create Register object from string.

        :param s: must have the form: "llvm_type:constraint_char"
        :raises ValueError: if *s* has no ':' separator or the constraint is not exactly
                            one of the characters 'r' and 'x'.
        """
        llvm_type, constraint_char = s.split(':', 1)
        valid_cc = 'rx'
        # `in` on a string is a substring test, which would also accept '' and 'rx';
        # require exactly one character.
        if len(constraint_char) != 1 or constraint_char not in valid_cc:
            raise ValueError("Invalid constraint character, must be one of {!r}".format(valid_cc))
        return cls(llvm_type, constraint_char)
class Synthable:
    """Interface of everything that can be turned into LLVM IR by build_ir()."""

    def __init__(self):
        pass

    def build_ir(self, dst_reg_names, src_reg_names, used_registers):
        """Return the IR text; has to be provided by subclasses."""
        raise NotImplementedError()

    def get_source_registers(self):
        """Return the registers read; has to be provided by subclasses."""
        raise NotImplementedError()

    def get_destination_registers(self):
        """Return the registers written; has to be provided by subclasses."""
        raise NotImplementedError()

    @staticmethod
    def _get_unused_reg_name(used_registers):
        """Reserve and return the first '%"reg.<n>"' name not contained in *used_registers*."""
        counter = 0
        while True:
            candidate = '%"reg.{}"'.format(counter)
            if candidate not in used_registers:
                break
            counter += 1
        # Mark the name as taken for subsequent calls.
        used_registers.add(candidate)
        return candidate

    def get_default_init_values(self):
        """
        Return one default initial value per source register, looked up by LLVM type.

        Raises ValueError for a register whose type has no default value.
        """
        values = []
        for register in self.get_source_registers():
            llvm_type = register.llvm_type
            try:
                value = init_value_by_llvm_type[llvm_type]
            except KeyError:
                raise ValueError("Invalid or unsupported LLVM type {!r}.".format(llvm_type))
            values.append(value)
        return values

    def __repr__(self):
        # Show every public attribute as keyword argument, in assignment order.
        public_attributes = [
            '{}={!r}'.format(name, value)
            for name, value in vars(self).items()
            if not name.startswith('_')]
        return '{}({})'.format(type(self).__name__, ', '.join(public_attributes))
class Operation(Synthable):
    """Base class for operations (Instruction, Load and AddressGeneration derive from it)."""
class Instruction(Operation):
    """A single inline-assembly instruction with one destination register and its sources."""

    def __init__(self, instruction, destination_operand, source_operands):
        """
        :param instruction: assembly text using $<n> operand placeholders
        :param destination_operand: Register written by the instruction
        :param source_operands: list of Register, Immediate or MemoryReference operands
        """
        super().__init__()
        self.instruction = instruction
        self.destination_operand = destination_operand
        assert isinstance(destination_operand, Register), "Destination needs to be a register."
        self.source_operands = source_operands

    def get_source_registers(self):
        """
        Return the registers that need to be supplied as inputs.

        Register operands are reported once per LLVM type (the first one seen); the
        registers of every memory reference are all included.
        """
        sop_types = set()
        sr = []
        for sop in self.source_operands:
            if isinstance(sop, Register):
                if sop.llvm_type not in sop_types:
                    sop_types.add(sop.llvm_type)
                    sr.append(sop)
            elif isinstance(sop, MemoryReference):
                sr += list(sop.get_registers())
        return sr

    def get_destination_registers(self):
        """Return the destination register as one-element list (empty if it is no Register)."""
        if isinstance(self.destination_operand, Register):
            return [self.destination_operand]
        else:
            return []

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """
        Build IR string based on in and out operand names and types.

        Only dst_reg_names[0] is used as result name. src_reg_names is consumed in the
        order of get_source_registers() for register operands.
        """
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)
        # Build constraint string from operands
        constraints = ','.join(
            ['=' + self.destination_operand.get_constraint_char()] +
            [sop.get_constraint_char() for sop in self.source_operands])
        # Build argument string from operands and register names
        operands = []
        # Maps an LLVM type to the index in src_reg_names that was assigned to the first
        # register operand of that type.
        sop_types = {}
        # Index of the next unused entry of src_reg_names
        i = 0
        for sop in self.source_operands:
            if isinstance(sop, Immediate):
                operands.append('{type} {repr}'.format(
                    type=sop.llvm_type,
                    repr=sop.value))
            elif isinstance(sop, Register):
                if sop.llvm_type in sop_types:
                    # Further registers of an already seen type reuse the same source name
                    operands.append('{type} {repr}'.format(
                        type=sop.llvm_type,
                        repr=src_reg_names[sop_types[sop.llvm_type]]))
                else:
                    sop_types[sop.llvm_type] = i
                    operands.append('{type} {repr}'.format(
                        type=sop.llvm_type,
                        repr=src_reg_names[i]))
                    i += 1
            elif isinstance(sop, MemoryReference):
                # NOTE(review): consumes a single source name, while get_source_registers()
                # reports base and index separately -- confirm this is intended.
                operands.append('{type} {repr}'.format(
                    type=sop.llvm_type,
                    repr=src_reg_names[i]))
                i += 1
            else:
                raise NotImplementedError("Only register and immediate operands are supported.")
        args = ', '.join(operands)
        # Build instruction from instruction and operands
        return ('{dst_reg} = call {dst_type} asm '
                ' "{instruction}", "{constraints}" ({args})').format(
            dst_reg=dst_reg_names[0],
            dst_type=self.destination_operand.llvm_type,
            instruction=self.instruction,
            constraints=constraints,
            args=args)

    @classmethod
    def from_string(cls, s):
        """
        Create Instruction object from string.

        :param s: must have the form:
                  "asm_instruction_name ({(src|dst|srcdst):llvm_type:constraint_char})+"

        Every {...} placeholder is replaced by a $<index> reference. A srcdst operand
        becomes the destination and additionally a source Register whose constraint is the
        destination's index. Raises ValueError unless exactly one destination is given.
        """
        instruction = s
        # It is important that the match objects are in reverse order, to allow string replacements
        # based on original match group locations
        operands = list(reversed(list(re.finditer(r"\{((?:src|dst)+):([^\}]+)\}", s))))
        # Destination indices start at 0
        dst_index = 0
        # Source indices at "number of destination operands"
        src_index = ['dst' in o.group(1) for o in operands].count(True)
        dst_ops = []
        src_ops = []
        for m in operands:
            direction, operand_string = m.group(1, 2)
            operand = Operand.from_string(operand_string)
            if 'src' in direction and not 'dst' in direction:
                src_ops.append(operand)
                # replace with index string
                instruction = (instruction[:m.start()] + "${}".format(src_index)
                               + instruction[m.end():])
                src_index += 1
            if 'dst' in direction:
                dst_ops.append(operand)
                # replace with index string
                instruction = (instruction[:m.start()] + "${}".format(dst_index)
                               + instruction[m.end():])
                if 'src' in direction:
                    # The input side of a srcdst operand takes up a source index as well
                    src_ops.append(Register(operand_string.split(':', 1)[0], str(dst_index)))
                    src_index += 1
                dst_index += 1
        if len(dst_ops) != 1:
            raise ValueError("Instruction supports only single destinations.")
        return cls(instruction, dst_ops[0], src_ops)
class Load(Operation):
    """Description of a pointer-chasing load; only stores its parameters so far."""

    def __init__(self, chain_length, structure='linear'):
        """
        *chain_length* is the number of pointers to place in memory.
        *structure* may be 'linear' (1-offsets) or 'random'.
        """
        super().__init__()
        self.chain_length, self.structure = chain_length, structure
        # TODO
class AddressGeneration(Operation):
    """Address generation operation; not implemented yet, constructing one always fails."""

    def __init__(self, offset, base, index, width, destination='base'):
        super().__init__()
        self.offset, self.base, self.index = offset, base, index
        self.width, self.destination = width, destination
        # Unconditional: this operation can not be used yet.
        raise NotImplementedError()
class Serialized(Synthable):
    """Sequence of synths in which each one consumes the results of its predecessor."""

    def __init__(self, synths):
        """*synths* is the ordered list of Synthable objects to chain."""
        super().__init__()
        self.synths = synths
        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"

    def get_source_registers(self):
        """Return the source registers of the first synth (empty list without synths)."""
        if self.synths:
            return self.synths[0].get_source_registers()
        else:
            return []

    def get_destination_registers(self):
        """Return the destination registers of the last synth (empty list without synths)."""
        if self.synths:
            return self.synths[-1].get_destination_registers()
        else:
            return []

    @staticmethod
    def match(source_registers, destination_registers):
        """
        Find maximum number of matches from source (previous destinations) to
        destination (current source) registers.

        Return list of two-tuples of matches (src_idx, dst_idx) and the set of destination
        registers without any match. Every pair with equal llvm_type is reported.
        """
        matched_pairs = []
        unmatched_dests = set(destination_registers)
        for dst_idx, dst in enumerate(destination_registers):
            for src_idx, src in enumerate(source_registers):
                if src.llvm_type == dst.llvm_type:
                    matched_pairs.append((src_idx, dst_idx))
                    unmatched_dests.discard(dst)
        return matched_pairs, unmatched_dests

    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
        """
        Assign destination and source names to every synth of the chain.

        Return a list with one (dst_names, src_names) tuple per synth and the updated
        *used_registers*. Raises NotSerializableError if a synth has no source that matches
        a destination of its predecessor by LLVM type.
        """
        reg_naming_out = []
        dst_naming = []
        last_s = None
        for i, s in enumerate(self.synths):
            if i == 0:
                # first source is passed in from outside
                src_naming = src_reg_names
            else:
                # match with previous destinations
                src_naming = []
                match = False
                for src in s.get_source_registers():
                    # Find matching destination from previous synths
                    src_match = False
                    # dst_naming still holds the names given to the previous synth here.
                    # NOTE(review): one name is appended per matching destination, so a
                    # source can contribute several names -- confirm before changing.
                    for dst_idx, dst in enumerate(last_s.get_destination_registers()):
                        if dst.llvm_type == src.llvm_type:
                            match = src_match = True
                            src_naming.append(dst_naming[dst_idx])
                    # If source could not be matched, use constant value instead
                    if not src_match:
                        src_naming.append(init_value_by_llvm_type[src.llvm_type])
                if not match:
                    raise NotSerializableError("Unable to find match.")
            if i == len(self.synths) - 1:
                # last destination is passed in from outside
                dst_naming = dst_reg_names
            else:
                # Intermediate results get freshly reserved register names
                # noinspection PyUnusedLocal
                dst_naming = [self._get_unused_reg_name(used_registers)
                              for j in s.get_destination_registers()]
            reg_naming_out.append((dst_naming, src_naming))
            last_s = s
        return reg_naming_out, used_registers

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """Return the IR of all synths, in order, joined by newlines."""
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)
        reg_names, used_registers = self.generate_register_naming(
            dst_reg_names, src_reg_names, used_registers)
        code = []
        for s, r in zip(self.synths, reg_names):
            # r is the (dst_names, src_names) tuple of synth s
            code.append(s.build_ir(*r, used_registers))
        return '\n'.join(code)
class Parallelized(Synthable):
    """Group of independent synths, each working on its own share of the register names."""

    def __init__(self, synths, interleave=False):
        """
        *synths* is the list of Synthable objects to place side by side.
        *interleave*, if true, makes build_ir() emit the synths' IR lines round-robin
        instead of one synth after the other.
        """
        super().__init__()
        self.synths = synths
        self.interleave = interleave
        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"

    def get_source_registers(self):
        """Return the source registers of all synths, concatenated in synth order."""
        sources = []
        for s in self.synths:
            sources += s.get_source_registers()
        return sources

    def get_destination_registers(self):
        """Return the destination registers of all synths, concatenated in synth order."""
        destinations = []
        for s in self.synths:
            destinations += s.get_destination_registers()
        return destinations

    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
        """
        Split the given names among the synths.

        Each synth receives as many leading names as it has destination and source
        registers. Return a list of (dst_names, src_names) tuples and *used_registers*.
        """
        # Split reg_naming among all synths
        reg_naming_out = []
        for s in self.synths:
            n_dsts = len(s.get_destination_registers())
            n_srcs = len(s.get_source_registers())
            reg_naming_out.append((dst_reg_names[:n_dsts], src_reg_names[:n_srcs]))
            dst_reg_names, src_reg_names = (dst_reg_names[n_dsts:], src_reg_names[n_srcs:])
        return reg_naming_out, used_registers

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """Return the IR of all synths joined by newlines, interleaved line-wise if requested."""
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)
        reg_names, used_registers = self.generate_register_naming(
            dst_reg_names, src_reg_names, used_registers)
        code = []
        for s, r in zip(self.synths, reg_names):
            code.append(s.build_ir(*r, used_registers))

        # Interleave parallelized sequences
        if self.interleave:
            # zip_longest pads shorter sequences with None; drop the padding with an
            # explicit identity test. (None.__ne__ returns NotImplemented for strings and
            # only worked through its truth value, which Python no longer permits.)
            code = ['\n'.join(line for line in lines if line is not None)
                    for lines in zip_longest(*[c.split('\n') for c in code])]
        return '\n'.join(code)
if __name__ == '__main__':
    # Manual smoke test: build a few instructions, combine them and print the resulting IR.
    i1 = Instruction('add $2, $0', Register('i64', 'r'),
                     [Register('i64', 'r'), Immediate('i64', '1')])
    i2 = Instruction('sub $2, $0', Register('i64', 'r'),
                     [Register('i64', 'r'), Immediate('i64', '1')])
    i3 = Instruction('mul $1, $0', Register('i64', 'r'),
                     [Register('i64', 'r'), Register('i64', 'r')])
    i4 = Instruction('div $2, $0', Register('i64', 'r'),
                     [Register('i64', 'r'), Immediate('i64', '23')])
    i5 = Instruction('mul $2, $0', Register('i64', 'r'),
                     [Register('i64', 'r'), Immediate('i64', '23')])
    i6 = Instruction('inc $0', Register('i64', 'r'), [Register('i64', 'r')])

    # Plain and nested serial chains
    s1 = Serialized([i1, i2])
    s2 = Serialized([s1, i3])
    for chain in (s1, s2):
        print(chain.build_ir(['%out'], ['%in']), '\n')

    # Three independent streams side by side
    s3 = Serialized([i4, i5])
    p1 = Parallelized([i6, s2, s3])
    print(p1.build_ir(['%out.0', '%out.1', '%out.2'], ['%in.0', '%in.1', '%in.2']), '\n')

    # One long chain over all instructions
    s4 = Serialized([i1, i2, i3, i4, i5, i6])
    print(s4.build_ir(['%out'], ['%in']), '\n')

    print(Instruction.from_string("add {src:i64:r} {srcdst:i64:r}"))

View File

@@ -1,243 +0,0 @@
#!/usr/bin/env python3
import collections
import itertools
import socket
import numpy
import matplotlib.pyplot as plt
import matplotlib as mpl
from asmbench import op, bench
from asmbench import oldjit
def jit_based_benchs():
    """
    Run the oldjit based lea and load benchmarks.

    For every benchmark a latency and a throughput variant is built; afterwards each pair
    is executed and one line with both results (in cycles) is printed.
    """
    # Operand configurations of the lea benchmarks, in reporting order
    lea_configurations = [
        ('lea_b', dict(
            offset=None, base=('r', 'i64', '666'), index=None, width=None,
            destination='base')),
        ('lea_b+off', dict(
            offset=('i', None, '23'), base=('r', 'i64', '666'), index=None, width=None,
            destination='base')),
        ('lea_idx*w', dict(
            offset=None, base=None, index=('r', 'i64', '1'), width=('i', None, '4'),
            destination='index')),
        ('lea_off+idx*w', dict(
            offset=('i', 'i64', '-0x8'), base=None, index=('r', 'i64', '51'),
            width=('i', None, '4'), destination='index')),
        ('lea_b+idx*w', dict(
            offset=None, base=('r', 'i64', '23'), index=('r', 'i64', '12'),
            width=('i', None, '4'), destination='base')),
        ('lea_b+off+idx*w', dict(
            offset=('i', None, '42'), base=('r', 'i64', '23'), index=('r', 'i64', '12'),
            width=('i', None, '4'), destination='base')),
    ]

    # name -> (latency benchmark, throughput benchmark)
    modules = collections.OrderedDict()
    for name, operands in lea_configurations:
        modules[name] = (
            oldjit.AddressGenerationBenchmark(parallel=1, serial=5, **operands),
            oldjit.AddressGenerationBenchmark(parallel=10, serial=1, **operands))
    for structure in ('linear', 'random'):
        modules['LD_' + structure] = (
            oldjit.LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=structure,
                parallel=1,
                serial=2),
            oldjit.LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=structure,
                parallel=4,
                serial=2))

    def cycles_per_instruction(module):
        # Best of three runs, converted to cycles and divided by the executed instructions
        result = module.build_and_execute(repeat=3)
        return min(result['runtimes']) * result['frequency'] / (
            result['iterations'] * module.parallel * module.serial)

    for name, (lat_module, tp_module) in modules.items():
        cy_per_it_lat = cycles_per_instruction(lat_module)
        cy_per_it_tp = cycles_per_instruction(tp_module)
        print('{key:<16} LAT {cy_per_it_lat:.3f} cy  TP {cy_per_it_tp:.3f} cy'.format(
            key=name,
            cy_per_it_lat=cy_per_it_lat,
            cy_per_it_tp=cy_per_it_tp))
def plot_combined(single_measured, combined_measured):
    """
    Show a heat map of the third value of every entry in *combined_measured*.

    *single_measured* supplies the instruction names (its keys) used for both axes.
    *combined_measured* maps a pair of tuples, each starting with an instruction name, to a
    tuple whose element 2 is plotted. Combinations without a measurement stay white.
    """
    instructions = list(single_measured.keys())
    size = len(single_measured)
    # NaN marks cells without a measurement
    d = numpy.full((size, size), float('nan'))
    for pair, measurement in combined_measured.items():
        row, column = (instructions.index(entry[0]) for entry in pair)
        d[row, column] = measurement[2]

    cmap = mpl.cm.get_cmap('plasma', 5)
    cmap.set_bad('w')  # default value is 'k'

    fig = plt.figure(figsize=(10, 10))
    ax1 = fig.add_subplot(111)
    color_range = mpl.colors.Normalize(vmin=-.5, vmax=1.5)
    cax = ax1.imshow(d, interpolation="nearest", cmap=cmap, norm=color_range)

    tick_positions = range(size)
    ax1.set_xticks(tick_positions)
    ax1.set_xticklabels(instructions, rotation=90)
    ax1.set_yticks(tick_positions)
    ax1.set_yticklabels(instructions)
    ax1.set_title(socket.gethostname())
    ax1.grid()

    cb = fig.colorbar(cax, shrink=0.65)
    cb.set_ticks([-.5, 0, 1, 1.5])
    cb.set_ticklabels(['< -0.5', '0.0 (complete overlap)', '1.0 (no overlap)', '> 1.5'])
    cb.set_label('inverse parallel overlap')

    fig.tight_layout()
    plt.show()
if __name__ == '__main__':
    # Measures every instruction on its own, then every pair of instructions, and plots
    # how much the pairs overlap in execution.
    bench.setup_llvm()
    # (LLVM instruction name, instruction string, parsed instruction)
    instructions = [
        (llvm_name, i_str, op.Instruction.from_string(i_str))
        for llvm_name, i_str in [
            ('ADD32ri', 'add {src:i32:1}, {srcdst:i32:r}'),
            ('ADD64ri32', 'add {src:i32:1}, {srcdst:i64:r}'),
            ('INC64r', 'inc {srcdst:i64:r}'),
            # 32 bit variant, therefore an i32 register (as for ADD32ri)
            ('SUB32ri', 'sub {src:i32:1}, {srcdst:i32:r}'),
            ('MOV64ri32', 'mov {src:i32:1}, {srcdst:i64:r}'),
            ('VINSERTF128rr', 'vinsertf128 {src:i8:0}, {src:<2 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
            ('VCVTSI642SSrr', 'vcvtsi2ss {src:i64:r}, {src:float:x}, {dst:float:x}'),
            ('VADDPDYrr', 'vaddpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
            ('VADDSDrr', 'vaddsd {src:double:x}, {src:double:x}, {dst:double:x}'),
            ('VADDSSrr', 'vaddss {src:float:x}, {src:float:x}, {dst:float:x}'),
            ('VFMADD213PDYr', 'vfmadd213pd {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
            ('VFMADD213PDr', 'vfmadd213pd {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
            # Packed single precision: float vectors of the same total width as the
            # double vectors used for the PD variants
            ('VFMADD213PSYr', 'vfmadd213ps {src:<8 x float>:x}, {src:<8 x float>:x}, {srcdst:<8 x float>:x}'),
            ('VFMADD213PSr', 'vfmadd213ps {src:<4 x float>:x}, {src:<4 x float>:x}, {srcdst:<4 x float>:x}'),
            ('VFMADD213SDr', 'vfmadd213sd {src:double:x}, {src:double:x}, {srcdst:double:x}'),
            ('VFMADD213SSr', 'vfmadd213ss {src:float:x}, {src:float:x}, {srcdst:float:x}'),
            ('VMULPDYrr', 'vmulpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
            ('VMULSDrr', 'vmulsd {src:double:x}, {src:double:x}, {dst:double:x}'),
            ('VMULSSrr', 'vmulss {src:float:x}, {src:float:x}, {dst:float:x}'),
            ('VSUBSDrr', 'vsubsd {src:double:x}, {src:double:x}, {dst:double:x}'),
            ('VSUBSSrr', 'vsubss {src:float:x}, {src:float:x}, {dst:float:x}'),
            ('VDIVPDYrr', 'vdivpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
            ('VDIVSDrr', 'vdivsd {src:double:x}, {src:double:x}, {dst:double:x}'),
            ('VDIVSSrr', 'vdivss {src:float:x}, {src:float:x}, {dst:float:x}'),
        ]
    ]

    # LLVM instruction name -> (latency, throughput) of the instruction on its own
    instructions_measured = collections.OrderedDict()
    for llvm_name, i_str, i in instructions:
        lat, tp = bench.bench_instructions(
            [i],
            serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
            verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)
        print('{:<16} LAT {:.3f} cy  TP {:.3f} cy'.format(llvm_name, lat, tp))
        instructions_measured[llvm_name] = (lat, tp)

    jit_based_benchs()

    # ((name, string), (name, string)) -> (latency, throughput, same port metric)
    two_combinations_measured = collections.OrderedDict()
    for a, b in itertools.combinations_with_replacement(instructions, 2):
        lat, tp = bench.bench_instructions(
            [a[2], b[2]],
            serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
            verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)
        # Combined throughput beyond the slower instruction, relative to the faster one:
        # 0 means complete overlap, 1 means no overlap (cf. labels in plot_combined).
        same_port_metric = ((
            tp-max(instructions_measured[a[0]][1], instructions_measured[b[0]][1])) /
            min(instructions_measured[a[0]][1], instructions_measured[b[0]][1]))
        print('{:<16} {:<16} LAT {:.3f} cy  TP {:.3f} cy  SPM {:>5.2f}'.format(
            a[0], b[0], lat, tp, same_port_metric))
        two_combinations_measured[(a[0], a[1]), (b[0], b[1])] = (lat, tp, same_port_metric)

    plot_combined(instructions_measured, two_combinations_measured)

View File

@@ -1,82 +0,0 @@
#!/usr/bin/env python3
import collections
import itertools
import socket
import textwrap
import numpy
import matplotlib.pyplot as plt
import matplotlib as mpl
from asmbench import op, bench
from asmbench import oldjit
# Lookup table from element type name to a size value
# (i32/f32/float -> 4, i64/f64/double -> 8).
# Presumably the element size in bytes -- TODO confirm.
# NOTE(review): not referenced by any code visible in this file.
type_size = {
'i32': 4,
'i64': 8,
'f32': 4,
'float': 4,
'f64': 8,
'double': 8,
}
class StreamsBenchmark(bench.Benchmark):
    """Benchmark that records a memory-stream configuration and emits loop IR.

    The constructor only stores the requested stream counts, stream length and
    element type. ``build_ir`` currently renders the bare counting loop: it
    formats the template with an empty pointer-argument list and an empty
    loop body, so the stored stream settings are not yet reflected in the IR.
    """

    def __init__(self,
                 read_streams=0, read_write_streams=0, write_streams=0,
                 stream_byte_length=0,
                 element_type='i64'):
        """Store the stream configuration on the instance.

        :param read_streams: stored as ``self.read_streams``
        :param read_write_streams: stored as ``self.read_write_streams``
        :param write_streams: stored as ``self.write_streams``
        :param stream_byte_length: stored as ``self.stream_byte_length``
        :param element_type: stored as ``self.element_type`` (default ``'i64'``)
        """
        super().__init__()
        self.read_streams = read_streams
        self.read_write_streams = read_write_streams
        self.write_streams = write_streams
        self.stream_byte_length = stream_byte_length
        self.element_type = element_type

    def build_ir(self, iaca_marker=False):
        """Return the LLVM IR text of the benchmark function ``test``.

        :param iaca_marker: when true, inline-asm marker sequences are placed
            at the top of the loop body and in the ``end`` block; otherwise
            both placeholders are filled with the empty string.
        :return: the formatted IR as a string
        """
        # Default to "no markers"; only overwritten when markers are requested.
        start_marker = ''
        stop_marker = ''
        if iaca_marker:
            start_marker = textwrap.dedent('''\
call void asm "movl $$111,%ebx", ""()
call void asm ".byte 100,103,144", ""()''')
            stop_marker = textwrap.dedent('''\
call void asm "movl $$222,%ebx", ""()
call void asm ".byte 100,103,144", ""()''')

        # Doubled braces are literal braces for str.format(); single-brace
        # names are the placeholders filled in below.
        template = textwrap.dedent('''\
define i64 @"test"(i64 %"N"{pointer_arguments})
{{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"
loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
{iaca_start_marker}
{loop_body}
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
%"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
{iaca_stop_marker}
ret i64 %"ret"
}}
''')
        return template.format(
            pointer_arguments='',
            loop_body='',
            iaca_start_marker=start_marker,
            iaca_stop_marker=stop_marker)
if __name__ == '__main__':
    # Script entry point: set up LLVM through asmbench, then build and run a
    # default-configured StreamsBenchmark and print what build_and_execute()
    # returns.
    bench.setup_llvm()
    outcome = StreamsBenchmark().build_and_execute()
    print(outcome)

View File

@@ -1,3 +0,0 @@
#!/bin/sh
# Compile test.c against the LLVM C API and link it into the executable
# "test". Compiler and linker flags are taken from llvm-config.

# Stop at the first failing command so a failed compile is never followed by
# a link step that silently reuses a stale test.o.
set -e

# The command substitutions are intentionally unquoted: llvm-config prints
# several flags that must be split into separate arguments.
clang -g $(llvm-config --cflags) test.c -c
clang++ test.o $(llvm-config --cxxflags --ldflags --libs --system-libs all) -o test

Binary file not shown.

View File

@@ -1,72 +0,0 @@
/**
* LLVM equivalent of:
*
* int sum(int a, int b) {
* return a + b;
* }
*/
#include <llvm-c/Core.h>
#include <llvm-c/ExecutionEngine.h>
#include <llvm-c/Target.h>
#include <llvm-c/Analysis.h>
#include <llvm-c/BitWriter.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
/**
 * Build `int sum(int, int)` as LLVM IR, JIT-compile it with MCJIT, call it
 * with the two integers given on the command line and print the result.
 * The module is additionally written to "sum.bc" as bitcode.
 *
 * argv[1], argv[2]: base-10 integers, truncated to int before the call.
 *
 * Returns EXIT_SUCCESS on success. Exits with EXIT_FAILURE on a usage error,
 * when engine creation reports an error message, or when the compiled
 * function cannot be resolved; aborts when the execution engine cannot be
 * created.
 */
int main(int argc, char** argv) {
    /* Validate the command line first so a usage error needs no LLVM setup. */
    if (argc < 3) {
        fprintf(stderr, "usage: %s x y\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    LLVMModuleRef mod = LLVMModuleCreateWithName("my_module");

    /* Declare: i32 sum(i32, i32) */
    LLVMTypeRef param_types[] = { LLVMInt32Type(), LLVMInt32Type() };
    LLVMTypeRef ret_type = LLVMFunctionType(LLVMInt32Type(), param_types, 2, 0);
    LLVMValueRef sum = LLVMAddFunction(mod, "sum", ret_type);

    /* Body: a single basic block returning param0 + param1. */
    LLVMBasicBlockRef entry = LLVMAppendBasicBlock(sum, "entry");
    LLVMBuilderRef builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, entry);
    LLVMValueRef tmp = LLVMBuildAdd(builder, LLVMGetParam(sum, 0), LLVMGetParam(sum, 1), "tmp");
    LLVMBuildRet(builder, tmp);

    /* LLVMAbortProcessAction makes the verifier abort on a broken module. */
    char *error = NULL;
    LLVMVerifyModule(mod, LLVMAbortProcessAction, &error);
    LLVMDisposeMessage(error);

    LLVMExecutionEngineRef engine;
    error = NULL;
    LLVMLinkInMCJIT();
    LLVMInitializeNativeTarget();
    /* MCJIT generates machine code through the target's asm printer, so it
     * has to be registered in addition to the target itself. */
    LLVMInitializeNativeAsmPrinter();
    if (LLVMCreateExecutionEngineForModule(&engine, mod, &error) != 0) {
        fprintf(stderr, "failed to create execution engine\n");
        if (error) {
            /* Surface the reason instead of dropping it before aborting. */
            fprintf(stderr, "error: %s\n", error);
            LLVMDisposeMessage(error);
        }
        abort();
    }
    if (error) {
        fprintf(stderr, "error: %s\n", error);
        LLVMDisposeMessage(error);
        exit(EXIT_FAILURE);
    }

    long long x = strtoll(argv[1], NULL, 10);
    long long y = strtoll(argv[2], NULL, 10);

    /* MCJIT's generic runFunction path does not handle an (i32, i32)
     * signature, so fetch the compiled code's address and call it through a
     * correctly typed function pointer instead of LLVMRunFunction. */
    uint64_t sum_addr = LLVMGetFunctionAddress(engine, "sum");
    if (sum_addr == 0) {
        fprintf(stderr, "error: could not resolve JIT-compiled function 'sum'\n");
        LLVMDisposeBuilder(builder);
        LLVMDisposeExecutionEngine(engine);
        exit(EXIT_FAILURE);
    }
    int (*sum_fn)(int, int) = (int (*)(int, int))(uintptr_t)sum_addr;
    printf("%d\n", sum_fn((int)x, (int)y));

    // Write out bitcode to file
    if (LLVMWriteBitcodeToFile(mod, "sum.bc") != 0) {
        fprintf(stderr, "error writing bitcode to file, skipping\n");
    }

    LLVMDisposeBuilder(builder);
    /* Only the engine is disposed here, as in the original code; the module
     * was handed to it at creation. */
    LLVMDisposeExecutionEngine(engine);
    return EXIT_SUCCESS;
}

Binary file not shown.

View File

@@ -1,37 +0,0 @@
#!/usr/bin/env python3
import llvmlite.binding as llvm
llvm.initialize()
# From
# >>> cp = (ctypes.c_char_p * 1)()
# >>> ffi.lib.LLVMPY_GetHostCPUFeatures(cp)
# >>> print(cp[0])
# llvm.set_option('', '-mattr=+sse2,+cx16,-tbm,-avx512ifma,-avx512dq,-fma4,+prfchw,+bmi2,+xsavec,+fsgsbase,+popcnt,+aes,+xsaves,-avx512er,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-xop,+rdseed,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vl,-avx512cd,+avx,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,+cmov,-avx512vbmi,+movbe,+xsaveopt,-sha,+adx,-avx512pf,+sse3')
# llvm.set_option('', '-march=native')
# llvm.set_option('', '-mcpu=native')
# llvm.set_option('', '-version')
# llvm.set_option('', '-help-list-hidden')
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
llvm.initialize_native_asmparser()
# llvm.set_option('', '-help-list-hidden')
ir = '''
target triple = "x86_64-apple-darwin17.5.0"
define <4 x double> @testv(i32**, i32) {
%out = tail call <4 x double> asm "vaddpd $1, $2, $0", "=x,x,x,~{dirflag},~{fpsr},~{flags}"(<4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>, <4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>)
ret <4 x double> %out
}
'''
# Parse the textual IR held in `ir` into a module and run the verifier on it.
# NOTE(review): `ir` hard-codes its own `target triple`, while the target
# machine below is created from the default triple -- confirm they agree on
# hosts other than the one this was written on.
module = llvm.parse_assembly(ir)
module.verify()
# Ask llvmlite for the host's feature string and CPU name and build the
# target machine from them.
features = llvm.get_host_cpu_features().flatten()
cpu = llvm.get_host_cpu_name()
tm = llvm.Target.from_default_triple().create_target_machine(cpu=cpu, features=features)
# Create an MCJIT engine for the module, finalize it, and print the assembly
# text that the target machine emits for the module.
with llvm.create_mcjit_compiler(module, tm) as ee:
ee.finalize_object()
print(tm.emit_assembly(module))

Binary file not shown.

View File

@@ -1,55 +0,0 @@
taschenbuch:pyasmjit codemonk$ clang -P - -march=native -### 2>&1|grep -E --color -o -- '"-target-feature" "[^"]+"'
"-target-feature" "+sse2"
"-target-feature" "+cx16"
"-target-feature" "-tbm"
"-target-feature" "-avx512ifma"
"-target-feature" "-avx512dq"
"-target-feature" "-fma4"
"-target-feature" "+prfchw"
"-target-feature" "+bmi2"
"-target-feature" "+xsavec"
"-target-feature" "+fsgsbase"
"-target-feature" "+popcnt"
"-target-feature" "+aes"
"-target-feature" "+xsaves"
"-target-feature" "-avx512er"
"-target-feature" "-avx512vpopcntdq"
"-target-feature" "-clwb"
"-target-feature" "-avx512f"
"-target-feature" "-clzero"
"-target-feature" "-pku"
"-target-feature" "+mmx"
"-target-feature" "-lwp"
"-target-feature" "-xop"
"-target-feature" "+rdseed"
"-target-feature" "-sse4a"
"-target-feature" "-avx512bw"
"-target-feature" "+clflushopt"
"-target-feature" "+xsave"
"-target-feature" "-avx512vl"
"-target-feature" "-avx512cd"
"-target-feature" "+avx"
"-target-feature" "+rtm"
"-target-feature" "+fma"
"-target-feature" "+bmi"
"-target-feature" "+rdrnd"
"-target-feature" "-mwaitx"
"-target-feature" "+sse4.1"
"-target-feature" "+sse4.2"
"-target-feature" "+avx2"
"-target-feature" "+sse"
"-target-feature" "+lzcnt"
"-target-feature" "+pclmul"
"-target-feature" "-prefetchwt1"
"-target-feature" "+f16c"
"-target-feature" "+ssse3"
"-target-feature" "+sgx"
"-target-feature" "+cmov"
"-target-feature" "-avx512vbmi"
"-target-feature" "+movbe"
"-target-feature" "+xsaveopt"
"-target-feature" "-sha"
"-target-feature" "+adx"
"-target-feature" "-avx512pf"
"-target-feature" "+sse3"
taschenbuch:pyasmjit codemonk$

View File

@@ -1,22 +0,0 @@
; i64 test(i64 N)
; Counting loop that runs while loop_counter.1 < N. Each iteration executes two
; inline-asm "add" instructions chained through a loop-carried i32 value
; (seeded with 3), so every add depends on the one before it.
; Returns 0 when N <= 0 (loop skipped); otherwise returns the counter value of
; the final iteration, i.e. N-1.
define i64 @"test"(i64 %"N")
{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"
loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
%"in.0" = phi i32 [3, %"entry"], [%"out.0", %"loop"]
; Constraints "=r,0,i": register output, first input tied to that output
; register, second input an immediate (here the constant 1).
%"reg.0" = call i32 asm "add $2, $0", "=r,0,i" (i32 %"in.0", i32 1)
%"out.0" = call i32 asm "add $2, $0", "=r,0,i" (i32 %"reg.0", i32 1)
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
; 0 when arriving straight from entry, else the pre-increment loop counter.
%"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
ret i64 %"ret"
}

Binary file not shown.

View File

@@ -1,35 +0,0 @@
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test ## -- Begin function test
.p2align 4, 0x90
_test: ## @test
.cfi_startproc
## %bb.0: ## %entry
testq %rdi, %rdi
jle LBB0_1
## %bb.2: ## %loop.preheader
movl $3, %ecx
movq $-1, %rdx
.p2align 4, 0x90
LBB0_3: ## %loop
## =>This Inner Loop Header: Depth=1
## InlineAsm Start
addl $1, %ecx
## InlineAsm End
leaq 1(%rdx), %rax
addq $2, %rdx
cmpq %rdi, %rdx
movq %rax, %rdx
## InlineAsm Start
addl $1, %ecx
## InlineAsm End
jl LBB0_3
## %bb.4: ## %end
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
## -- End function
.subsections_via_symbols

View File

@@ -1,67 +0,0 @@
define i64 @"test"(i64 %"N")
{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"
loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
%in.0 = phi i64 [1, %"entry"], [%out.0, %"loop"]
%in.1 = phi i64 [1, %"entry"], [%out.1, %"loop"]
%in.2 = phi i64 [1, %"entry"], [%out.2, %"loop"]
%in.3 = phi i64 [1, %"entry"], [%out.3, %"loop"]
%in.4 = phi i64 [1, %"entry"], [%out.4, %"loop"]
%in.5 = phi i64 [1, %"entry"], [%out.5, %"loop"]
%in.6 = phi i64 [1, %"entry"], [%out.6, %"loop"]
%in.7 = phi i64 [1, %"entry"], [%out.7, %"loop"]
%"reg.0" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.0, i64 1)
%"reg.1" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.0", i64 1)
%"reg.2" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.1", i64 1)
%"reg.3" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.2", i64 1)
%"reg.4" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.3", i64 1)
%"reg.5" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.4", i64 1)
%"reg.6" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.5", i64 1)
%out.0 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.6", i64 1)
%"reg.7" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.1, i64 1)
%"reg.8" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.7", i64 1)
%"reg.9" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.8", i64 1)
%"reg.10" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.9", i64 1)
%"reg.11" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.10", i64 1)
%"reg.12" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.11", i64 1)
%"reg.13" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.12", i64 1)
%out.1 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.13", i64 1)
%"reg.14" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.2, i64 1)
%"reg.15" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.14", i64 1)
%"reg.16" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.15", i64 1)
%"reg.17" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.16", i64 1)
%"reg.18" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.17", i64 1)
%"reg.19" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.18", i64 1)
%"reg.20" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.19", i64 1)
%out.2 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.20", i64 1)
%"reg.21" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.3, i64 1)
%"reg.22" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.21", i64 1)
%"reg.23" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.22", i64 1)
%"reg.24" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.23", i64 1)
%"reg.25" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.24", i64 1)
%"reg.26" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.25", i64 1)
%"reg.27" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.26", i64 1)
%out.3 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.27", i64 1)
%"reg.28" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.4, i64 1)
%"reg.29" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.28", i64 1)
%"reg.30" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.29", i64 1)
%"reg.31" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.30", i64 1)
%"reg.32" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.31", i64 1)
%"reg.33" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.32", i64 1)
%out.4 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.33", i64 1)
%out.5 = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.5, i64 1)
%out.6 = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.6, i64 1)
%out.7 = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.7, i64 1)
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
%"ret" = phi i64 [-1, %"entry"], [%"loop_counter", %"loop"]
ret i64 %"ret"
}

Binary file not shown.

View File

@@ -1,6 +0,0 @@
#include <stdio.h>

/*
 * Kernel provided by a separately built object file.
 * The original called test() with no declaration in scope (an implicit
 * function declaration, which C99 and later reject).
 * NOTE(review): declared with 64-bit types to match the
 * `define i64 @"test"(i64 %"N")` kernels this driver appears to be linked
 * against -- confirm against the object actually linked.
 */
long long test(long long n);

/* Call test(100) and print its return value. */
int main(void) {
    printf("%lld\n", test(100));
    return 0;
}

View File

@@ -1,95 +0,0 @@
#!/usr/bin/env python3
import llvmlite.binding as llvm
import ctypes
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
llvm.initialize_native_asmparser()
code = '''define i64 @"test"(i64 %"N")
{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"
loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
%in.0 = phi i64 [1, %"entry"], [%out.0, %"loop"]
%in.1 = phi i64 [1, %"entry"], [%out.1, %"loop"]
%in.2 = phi i64 [1, %"entry"], [%out.2, %"loop"]
%in.3 = phi i64 [1, %"entry"], [%out.3, %"loop"]
%in.4 = phi i64 [1, %"entry"], [%out.4, %"loop"]
%in.5 = phi i64 [1, %"entry"], [%out.5, %"loop"]
%in.6 = phi i64 [1, %"entry"], [%out.6, %"loop"]
%in.7 = phi i64 [1, %"entry"], [%out.7, %"loop"]
%"reg.0" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.0, i64 1)
%"reg.1" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.0", i64 1)
%"reg.2" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.1", i64 1)
%"reg.3" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.2", i64 1)
%"reg.4" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.3", i64 1)
%"reg.5" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.4", i64 1)
%"reg.6" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.5", i64 1)
%out.0 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.6", i64 1)
%"reg.7" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.1, i64 1)
%"reg.8" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.7", i64 1)
%"reg.9" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.8", i64 1)
%"reg.10" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.9", i64 1)
%"reg.11" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.10", i64 1)
%"reg.12" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.11", i64 1)
%"reg.13" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.12", i64 1)
%out.1 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.13", i64 1)
%"reg.14" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.2, i64 1)
%"reg.15" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.14", i64 1)
%"reg.16" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.15", i64 1)
%"reg.17" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.16", i64 1)
%"reg.18" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.17", i64 1)
%"reg.19" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.18", i64 1)
%"reg.20" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.19", i64 1)
%out.2 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.20", i64 1)
%"reg.21" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.3, i64 1)
%"reg.22" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.21", i64 1)
%"reg.23" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.22", i64 1)
%"reg.24" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.23", i64 1)
%"reg.25" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.24", i64 1)
%"reg.26" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.25", i64 1)
%"reg.27" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.26", i64 1)
%out.3 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.27", i64 1)
%"reg.28" = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.4, i64 1)
%"reg.29" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.28", i64 1)
%"reg.30" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.29", i64 1)
%"reg.31" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.30", i64 1)
%"reg.32" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.31", i64 1)
%"reg.33" = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.32", i64 1)
%out.4 = call i64 asm "add $2, $0", "=r,0,i" (i64 %"reg.33", i64 1)
%out.5 = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.5, i64 1)
%out.6 = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.6, i64 1)
%out.7 = call i64 asm "add $2, $0", "=r,0,i" (i64 %in.7, i64 1)
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
%"ret" = phi i64 [-1, %"entry"], [%"loop_counter", %"loop"]
ret i64 %"ret"
}'''
# Parse and verify the IR text held in `code`.
llvm_module = llvm.parse_assembly(code)
llvm_module.verify()

# Target machine for the default triple, using the host's CPU name and
# feature string, at optimization level 3.
native_target = llvm.Target.from_default_triple()
tm = native_target.create_target_machine(
    features=llvm.get_host_cpu_features().flatten(),
    cpu=llvm.get_host_cpu_name(),
    opt=3)

# JIT-compile the module and wrap the address of `test` as int64 -> int64.
ee = llvm.create_mcjit_compiler(llvm_module, tm)
ee.finalize_object()
cfptr = ee.get_function_address('test')
kernel_type = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
cfunc = kernel_type(cfptr)

N = 100
ret = cfunc(N)
print(ret)

# Only the two recognised outcomes produce a verdict line; any other return
# value prints nothing beyond the raw value above.
verdict = "FAIL" if ret == 0 else ("Probably good." if ret == N-1 else None)
if verdict is not None:
    print(verdict)

Binary file not shown.

View File

@@ -1,6 +0,0 @@
#include <stdio.h>

/*
 * Kernel provided by a separately built object file.
 * NOTE(review): the original prototype was `int test(int)`. The
 * `define i64 @"test"(i64 %"N")` kernels this driver appears to be linked
 * against take and return a 64-bit integer, so the declaration now uses
 * 64-bit types -- confirm against the object actually linked.
 */
long long test(long long n);

/* Call test(123123123) and print its return value. */
int main(void) {
    printf("%lld\n", test(123123123));
    return 0;
}

View File

@@ -1,32 +0,0 @@
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _main ## -- Begin function main
.p2align 4, 0x90
_main: ## @main
.cfi_startproc
## %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
subq $16, %rsp
movl $0, -4(%rbp)
movl $123123123, %edi ## imm = 0x756B5B3
callq _test
leaq L_.str(%rip), %rdi
movl %eax, %esi
movb $0, %al
callq _printf
xorl %eax, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
## -- End function
.section __TEXT,__cstring,cstring_literals
L_.str: ## @.str
.asciz "%d\n"
.subsections_via_symbols

View File

@@ -1,53 +0,0 @@
#!/usr/bin/env python3
import ctypes
import llvmlite.binding as llvm
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
llvm.initialize_native_asmparser()
code = """
define i64 @"test"(i64 %"N")
{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"
loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
%"in.0" = phi i32 [3, %"entry"], [%"out.0", %"loop"]
%"reg.0" = call i32 asm "add $2, $0", "=r,0,i" (i32 %"in.0", i32 1)
%"out.0" = call i32 asm "add $2, $0", "=r,0,i" (i32 %"reg.0", i32 1)
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
%"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
ret i64 %"ret"
}
"""
# Host CPU feature string.
# NOTE(review): not passed to create_target_machine() below and not otherwise
# referenced in the visible code -- confirm whether it is meant to be used.
features = llvm.get_host_cpu_features().flatten()
# znver1 on naples and skylake-avx512 on skylake-sp
for cpu in ["skylake-avx512", "znver1"]:
# Target machine for the default triple, with the CPU model set to `cpu` and
# optimization level 2.
tm = llvm.Target.from_default_triple().create_target_machine(
cpu=cpu, opt=2)
# Set the assembly verbosity of the emitted text to 0.
tm.set_asm_verbosity(0)
# Parse the IR text held in `code` and print the assembly generated for it.
module = llvm.parse_assembly(code)
asm = tm.emit_assembly(module)
print(asm)
# JIT-compile the module, wrap `test` as an int64 -> int64 C function, call it
# with 100000 and print the returned value.
# NOTE(review): the compiled code runs on the host even when `cpu` names a
# different microarchitecture -- confirm that is intended.
with llvm.create_mcjit_compiler(module, tm) as ee:
ee.finalize_object()
cfptr = ee.get_function_address('test')
cfunc = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)(cfptr)
print('->', cfunc(100000))

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
dist/asmjit-0.1.tar.gz vendored

Binary file not shown.

Submodule doc/asmbench-SC18SRC-poster deleted from 89206c1415

BIN
dump.bin

Binary file not shown.

6
min.ll
View File

@@ -1,6 +0,0 @@
; Minimal inline-asm example: a single "vaddpd" adds two constant
; <4 x double> vectors (every lane 0.123) and the sum is returned.
; The two unnamed parameters (i32**, i32) are not referenced in the body.
; Constraints "=x,x,x": one "x"-class register output and two "x"-class
; register inputs; dirflag, fpsr and flags are listed as clobbers.
define <4 x double> @testv(i32**, i32) {
%out = tail call <4 x double> asm "vaddpd $1, $2, $0", "=x,x,x,~{dirflag},~{fpsr},~{flags}"(<4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>, <4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>)
ret <4 x double> %out
}

BIN
min.o

Binary file not shown.

21
min.s
View File

@@ -1,21 +0,0 @@
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal8,8byte_literals
.p2align 3 ## -- Begin function testv
LCPI0_0:
.quad 4593527504729830064 ## 0x3fbf7ced916872b0
.section __TEXT,__text,regular,pure_instructions
.globl _testv
.p2align 4, 0x90
_testv: ## @testv
.cfi_startproc
## BB#0:
vbroadcastsd LCPI0_0(%rip), %ymm0 ## ymm0 = [4593527504729830064,4593527504729830064,4593527504729830064,4593527504729830064]
## InlineAsm Start
vaddpd %ymm0, %ymm0, %ymm0
## InlineAsm End
retq
.cfi_endproc
## -- End function
.subsections_via_symbols

View File

@@ -1,450 +0,0 @@
## Selected Instructions
VPERMILPSri, MULPSrr, ANDPDrr, VPSIGNBrr, PSIGNBrr, PMOVZXWDrr, PMINUWrr, PADDSWrr, VPSHUFHWri, MOVUPDrr
## Generated Assembly (1x parallel)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal4,4byte_literals
.p2align 2
LCPI0_0:
.long 1065361408
.section __TEXT,__text,regular,pure_instructions
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
testq %rdi, %rdi
jle LBB0_1
movabsq $LCPI0_0, %rax
vbroadcastss (%rax), %xmm0
movq $-1, %rcx
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
vpermilps $1, %xmm0, %xmm0
mulps %xmm0, %xmm0
andpd %xmm0, %xmm0
vpsignb %xmm0, %xmm0, %xmm0
psignb %xmm0, %xmm0
pmovzxwd %xmm0, %xmm0
pminuw %xmm0, %xmm0
paddsw %xmm0, %xmm0
vpshufhw $1, %xmm0, %xmm0
movupd %xmm0, %xmm0
## InlineAsm End
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
jl LBB0_3
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (24655919,),
'frequency': 2600000000.0,
'iterations': 24655919,
'parallel_factor': 1,
'returned': [24655918, 24655918, 24655918, 24655918],
'runtimes': [0.13202582497615367,
0.13208268792368472,
0.13151856907643378,
0.13161470007617027]}
minimal throughput: 13.87 cy
## Selected Instructions
VFMADD132PDYr, VPADDWYrr, VFMADD132PSYr, VPADDDYrr, VSUBPDYrr, VPACKUSDWYrr, VPMULHUWYrr, VMINPDYrr, VPUNPCKLWDYrr, VBLENDVPSYrr
## Generated Assembly (1x parallel)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal4,4byte_literals
.p2align 2
LCPI0_0:
.long 1065361408
.section __TEXT,__text,regular,pure_instructions
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
testq %rdi, %rdi
jle LBB0_1
movabsq $LCPI0_0, %rax
vbroadcastss (%rax), %ymm0
movq $-1, %rcx
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
vfmadd132pd %ymm0, %ymm0, %ymm0
vpaddw %ymm0, %ymm0, %ymm0
vfmadd132ps %ymm0, %ymm0, %ymm0
vpaddd %ymm0, %ymm0, %ymm0
vsubpd %ymm0, %ymm0, %ymm0
vpackusdw %ymm0, %ymm0, %ymm0
vpmulhuw %ymm0, %ymm0, %ymm0
vminpd %ymm0, %ymm0, %ymm0
vpunpcklwd %ymm0, %ymm0, %ymm0
vblendvps %ymm0, %ymm0, %ymm0, %ymm0
## InlineAsm End
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
jl LBB0_3
vzeroupper
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (10000000,),
'frequency': 2600000000.0,
'iterations': 10000000,
'parallel_factor': 1,
'returned': [9999999, 9999999, 9999999, 9999999],
'runtimes': [0.11892832000739872,
0.11891822703182697,
0.11902078497223556,
0.12094117503147572]}
minimal throughput: 30.92 cy
## Selected Instructions
VCVTSI642SDrr, VFMADD213SDr, DIVSDrr, VCVTSI642SDrr, MAXSDrr, VFNMADD213SDr, VFMADD132SDr, VMAXSDrr, VFNMADD132SDr, SQRTSDr
## Generated Assembly (1x parallel)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal8,8byte_literals
.p2align 3
LCPI0_0:
.quad 4607186816846528512
.section __TEXT,__text,regular,pure_instructions
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
testq %rdi, %rdi
jle LBB0_1
movq $-1, %rcx
movabsq $LCPI0_0, %rax
vmovsd (%rax), %xmm0
movl $3, %edx
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
vcvtsi2sdq %rdx, %xmm0, %xmm0
vfmadd213sd %xmm0, %xmm0, %xmm0
divsd %xmm0, %xmm0
vcvtsi2sdq %rdx, %xmm0, %xmm0
maxsd %xmm0, %xmm0
vfnmadd213sd %xmm0, %xmm0, %xmm0
vfmadd132sd %xmm0, %xmm0, %xmm0
vmaxsd %xmm0, %xmm0, %xmm0
vfnmadd132sd %xmm0, %xmm0, %xmm0
sqrtsd %xmm0, %xmm0
## InlineAsm End
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
jl LBB0_3
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (5841530,),
'frequency': 2600000000.0,
'iterations': 5841530,
'parallel_factor': 1,
'returned': [5841529, 5841529, 5841529, 5841529],
'runtimes': [0.13433505699504167,
0.13318849296774715,
0.13303690601605922,
0.13309408095665276]}
minimal throughput: 59.21 cy
## Selected Instructions
RCPSSr, VCVTSI2SSrr, MULSSrr, VCVTSD2SSrr, VROUNDSSr, VRCPSSr, VCVTSI2SSrr, VSQRTSSr, VFNMADD231SSr, VSQRTSSr
## Generated Assembly (1x parallel)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal4,4byte_literals
.p2align 2
LCPI0_0:
.long 1065361408
.section __TEXT,__literal8,8byte_literals
.p2align 3
LCPI0_1:
.quad 4607186816846528512
.section __TEXT,__text,regular,pure_instructions
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
testq %rdi, %rdi
jle LBB0_1
movq $-1, %rcx
movabsq $LCPI0_0, %rax
vmovss (%rax), %xmm1
movl $3, %edx
movabsq $LCPI0_1, %rax
vmovsd (%rax), %xmm0
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
rcpss %xmm1, %xmm1
vcvtsi2ssl %edx, %xmm1, %xmm1
mulss %xmm1, %xmm1
vcvtsd2ss %xmm0, %xmm1, %xmm1
vroundss $1, %xmm1, %xmm1, %xmm1
vrcpss %xmm1, %xmm1, %xmm1
vcvtsi2ssl %edx, %xmm1, %xmm1
vsqrtss %xmm1, %xmm1, %xmm1
vfnmadd231ss %xmm1, %xmm1, %xmm1
vsqrtss %xmm1, %xmm1, %xmm1
## InlineAsm End
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
jl LBB0_3
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (6011291,),
'frequency': 2600000000.0,
'iterations': 6011291,
'parallel_factor': 1,
'returned': [6011290, 6011290, 6011290, 6011290],
'runtimes': [0.13239118899218738,
0.13244657206814736,
0.1326694720191881,
0.13262002903502434]}
minimal throughput: 57.26 cy
## Selected Instructions
ROR16ri, CMOVS16rr, SBB16ri, ADC16ri8, XOR16ri8, BTR16rr, XOR16ri8, SAR16r1, DEC16r, SUB16ri
## Generated Assembly (1x parallel)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
testq %rdi, %rdi
jle LBB0_1
movw $3, %cx
movq $-1, %rdx
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
rorw %cx
cmovsw %cx, %cx
sbbw $1, %cx
adcw $1, %cx
xorw $1, %cx
btrw %cx, %cx
xorw $1, %cx
sarw %cx
decw %cx
subw $1, %cx
## InlineAsm End
leaq 1(%rdx), %rax
addq $2, %rdx
cmpq %rdi, %rdx
movq %rax, %rdx
jl LBB0_3
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (31283731,),
'frequency': 2600000000.0,
'iterations': 31283731,
'parallel_factor': 1,
'returned': [31283730, 31283730, 31283730, 31283730],
'runtimes': [0.13291946100071073,
0.13294463406782597,
0.1332225619116798,
0.13287500606384128]}
minimal throughput: 11.04 cy
## Selected Instructions
SHLX32rr, CMOVO32rr, MOV32rr, CMOVS32rr, CRC32r32r8, SHR32r1, ADD32rr, CRC32r32r8, RCR32ri, SHR32r1
## Generated Assembly (1x parallel)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
testq %rdi, %rdi
jle LBB0_1
movl $3, %esi
movq $-1, %rdx
movb $3, %cl
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
shlxl %esi, %esi, %eax
cmovol %eax, %eax
movl %eax, %esi
cmovsl %esi, %esi
crc32b %cl, %esi
shrl %esi
addl %esi, %esi
crc32b %cl, %esi
rcrl %esi
shrl %esi
## InlineAsm End
leaq 1(%rdx), %rax
addq $2, %rdx
cmpq %rdi, %rdx
movq %rax, %rdx
jl LBB0_3
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (24008543,),
'frequency': 2600000000.0,
'iterations': 24008543,
'parallel_factor': 1,
'returned': [24008542, 24008542, 24008542, 24008542],
'runtimes': [0.13333229208365083,
0.13314284407533705,
0.13381975598167628,
0.13447994901798666]}
minimal throughput: 14.42 cy
## Selected Instructions
SHRX64rr, SBB64ri32, AND64ri8, MOV64rc, INC64r, SUB64ri32, POPCNT64rr, OR64ri8, BTS64rr, ROL64ri
## Generated Assembly (1x parallel)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
testq %rdi, %rdi
jle LBB0_1
movq $-1, %rcx
movl $3, %edx
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
shrxq %rdx, %rdx, %rax
sbbq $1, %rax
andq $1, %rax
movq %rax, %rax
incq %rax
subq $1, %rax
popcntq %rax, %rdx
orq $1, %rdx
btsq %rdx, %rdx
rolq %rdx
## InlineAsm End
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
jl LBB0_3
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (27539225,),
'frequency': 2600000000.0,
'iterations': 27539225,
'parallel_factor': 1,
'returned': [27539224, 27539224, 27539224, 27539224],
'runtimes': [0.1335972750093788,
0.13322542910464108,
0.13357082300353795,
0.13376462296582758]}
minimal throughput: 12.58 cy
## Selected Instructions
SAR8r1, SHR8ri, INC8r, AND8rr, RCR8ri, ROL8ri, SUB8ri, SBB8rr, NEG8r, NOT8r
## Generated Assembly (1x parallel)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
testq %rdi, %rdi
jle LBB0_1
movb $3, %cl
movq $-1, %rdx
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
sarb %cl
shrb %cl
incb %cl
andb %cl, %cl
rcrb %cl
rolb %cl
subb $1, %cl
sbbb %cl, %cl
negb %cl
notb %cl
## InlineAsm End
leaq 1(%rdx), %rax
addq $2, %rdx
cmpq %rdi, %rdx
movq %rax, %rdx
jl LBB0_3
retq
LBB0_1:
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (30431254,),
'frequency': 2600000000.0,
'iterations': 30431254,
'parallel_factor': 1,
'returned': [30431253, 30431253, 30431253, 30431253],
'runtimes': [0.13894746906589717,
0.1348069809610024,
0.13318019802682102,
0.13318415405228734]}
minimal throughput: 11.38 cy

File diff suppressed because it is too large Load Diff

Binary file not shown.

BIN
test

Binary file not shown.

BIN
test.o

Binary file not shown.

136
test.s
View File

@@ -1,136 +0,0 @@
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _foo ## -- Begin function foo
.p2align 4, 0x90
_foo: ## @foo
.cfi_startproc
## BB#0:
pushq %rbp
Lcfi0:
.cfi_def_cfa_offset 16
Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi2:
.cfi_def_cfa_register %rbp
xorl %eax, %eax
testl %edi, %edi
jle LBB0_2
.p2align 4, 0x90
LBB0_1: ## =>This Inner Loop Header: Depth=1
## InlineAsm Start
addl $23, %eax
## InlineAsm End
## InlineAsm Start
subl $13, %eax
## InlineAsm End
## InlineAsm Start
subl $10, %eax
## InlineAsm End
incl %eax
cmpl %edi, %eax
jl LBB0_1
LBB0_2:
popq %rbp
retq
.cfi_endproc
## -- End function
.section __TEXT,__literal8,8byte_literals
.p2align 3 ## -- Begin function benchmark
LCPI1_0:
.quad 4696837146684686336 ## double 1.0E+6
.section __TEXT,__text,regular,pure_instructions
.globl _benchmark
.p2align 4, 0x90
_benchmark: ## @benchmark
.cfi_startproc
## BB#0:
pushq %rbp
Lcfi3:
.cfi_def_cfa_offset 16
Lcfi4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi5:
.cfi_def_cfa_register %rbp
pushq %r14
pushq %rbx
subq $48, %rsp
Lcfi6:
.cfi_offset %rbx, -32
Lcfi7:
.cfi_offset %r14, -24
movq %rsi, %r14
movss %xmm0, -20(%rbp) ## 4-byte Spill
movl %edi, %ebx
leaq -56(%rbp), %rdi
xorl %esi, %esi
callq _gettimeofday
movl %ebx, %edi
callq *%r14
leaq -40(%rbp), %rdi
xorl %esi, %esi
callq _gettimeofday
movq -40(%rbp), %rax
subq -56(%rbp), %rax
cvtsi2sdq %rax, %xmm1
movl -32(%rbp), %eax
subl -48(%rbp), %eax
xorps %xmm0, %xmm0
cvtsi2sdl %eax, %xmm0
mulsd LCPI1_0(%rip), %xmm0
addsd %xmm1, %xmm0
movss -20(%rbp), %xmm1 ## 4-byte Reload
## xmm1 = mem[0],zero,zero,zero
cvtss2sd %xmm1, %xmm1
divsd %xmm1, %xmm0
leaq L_.str(%rip), %rdi
movb $1, %al
callq _printf
addq $48, %rsp
popq %rbx
popq %r14
popq %rbp
retq
.cfi_endproc
## -- End function
.section __TEXT,__literal4,4byte_literals
.p2align 2 ## -- Begin function main
LCPI2_0:
.long 1326386456 ## float 2.4E+9
.section __TEXT,__text,regular,pure_instructions
.globl _main
.p2align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
pushq %rbp
Lcfi8:
.cfi_def_cfa_offset 16
Lcfi9:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi10:
.cfi_def_cfa_register %rbp
movq 8(%rsi), %rdi
callq _atoi
leaq _foo(%rip), %rsi
movss LCPI2_0(%rip), %xmm0 ## xmm0 = mem[0],zero,zero,zero
movl %eax, %edi
callq _benchmark
xorl %eax, %eax
popq %rbp
retq
.cfi_endproc
## -- End function
.section __TEXT,__cstring,cstring_literals
L_.str: ## @.str
.asciz "%.3f (clock cycles)\n"
.comm _latency,8,3 ## @latency
.comm _ninst,8,3 ## @ninst
.subsections_via_symbols

Binary file not shown.

View File

@@ -1,39 +0,0 @@
ADD32ri
ADD64ri32
CMP32rm
CMP32rr
CMP64ri32
CMP64rr
INC64r
MOVSX64rm32
SUB32ri
VADDPDYrm
VADDSDrm
VADDSDrr
VADDSSrr
VCVTSI642SSrr
VCVTSS2SIrr_Int
VFMADD213PDYr
VFMADD213PDr
VFMADD213PSYr
VFMADD213PSr
VFMADD213SDr
VFMADD213SSr
VINSERTF128rr
VMULPDYrr
VMULSDrm_Int
VMULSDrr_Int
VMULSSrr_Int
VSUBPDYrm
VSUBSDrm_Int
VSUBSDrr_Int
VSUBSSrr_Int
MOV64mr (store)
MOV32rm
MOV64rm
VMOVSD??? mem_xmm
VMOVSD??? xmm_mem
LEA32r <-- which ones?
LEA64r <-- which ones?

View File

@@ -1,51 +0,0 @@
add-r32_imd
add-r64_imd
inc-r64
mov-mem_r64
mov-r32_imd
movslq-r64_r32
sub-r32_imd
vaddpd-avx
vaddsd-xmm_xmm_xmm
vaddss-xmm_xmm_xmm
vcvtsi2ss-xmm_xmm_r32
vcvtss2si-r32_xmm
vfmadd213pd-avx
vfmadd213pd-sse
vfmadd213ps-avx
vfmadd213ps-sse
vfmadd213sd
vfmadd213ss
vinsertf128-ymm_ymm_imd
vmulpd-ymm_ymm_ymm
vmulsd-xmm_xmm_xmm
vmulss-xmm_xmm_xmm
vsubsd-xmm_xmm_xmm
vsubss-xmm_xmm_xmm
# LEAs:
lea-r32_mem
lea-r64_mem
lea-r64_mem2
# /w mem operand:
mov-r32_mem
mov-r64_mem
vmovsd-mem_xmm
vmovsd-xmm_mem
vaddpd-ymm_ymm_mem
vaddsd-xmm_xmm_mem
vmulsd-xmm_xmm_mem
vsubpd-ymm_ymm_mem
vsubsd-xmm_xmm_mem
# impossible to serialize:
cmp-r32_r32
cmp-r64_imd
cmp-r64_r64
cmp-r32_mem