mirror of
https://github.com/RRZE-HPC/asmbench.git
synced 2025-07-20 20:21:05 +02:00
Revert "fixed up to work with latest kerncraft"
This reverts commit 2ccfb0c9ea
.
This commit is contained in:
12
.idea/asmbench.iml
generated
12
.idea/asmbench.iml
generated
@@ -1,12 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TestRunnerService">
|
||||
<option name="projectConfiguration" value="Nosetests" />
|
||||
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
|
||||
</component>
|
||||
</module>
|
4
.idea/encodings.xml
generated
4
.idea/encodings.xml
generated
@@ -1,4 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
|
||||
</project>
|
7
.idea/misc.xml
generated
7
.idea/misc.xml
generated
@@ -1,7 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="JavaScriptSettings">
|
||||
<option name="languageLevel" value="ES6" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.4 (/opt/local/bin/python)" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
generated
8
.idea/modules.xml
generated
@@ -1,8 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/asmbench.iml" filepath="$PROJECT_DIR$/.idea/asmbench.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
469
.idea/workspace.xml
generated
469
.idea/workspace.xml
generated
@@ -1,469 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="ce9d0a71-6676-44f6-88f0-52583274be24" name="Default" comment="">
|
||||
<change beforePath="$PROJECT_DIR$/.idea/vcs.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/vcs.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/doc/asmbench-SC18SRC-poster/sc18-src-poster.ai" beforeDir="false" afterPath="$PROJECT_DIR$/doc/asmbench-SC18SRC-poster/sc18-src-poster.ai" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/tablegen.py" beforeDir="false" afterPath="$PROJECT_DIR$/tablegen.py" afterDir="false" />
|
||||
</list>
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||
</component>
|
||||
<component name="FileEditorManager">
|
||||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/oldjit.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="12135">
|
||||
<caret line="820" column="33" selection-start-line="820" selection-start-column="33" selection-end-line="820" selection-end-column="33" />
|
||||
<folding>
|
||||
<element signature="e#23#36#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/sc18src.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="120">
|
||||
<caret line="17" column="21" selection-start-line="17" selection-start-column="21" selection-end-line="17" selection-end-column="21" />
|
||||
<folding>
|
||||
<element signature="e#23#41#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/tablegen.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1350">
|
||||
<caret line="100" column="28" selection-start-line="100" selection-start-column="28" selection-end-line="100" selection-end-column="28" />
|
||||
<folding>
|
||||
<element signature="e#24#34#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/streams.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1230">
|
||||
<caret line="82" lean-forward="true" selection-start-line="82" selection-end-line="82" />
|
||||
<folding>
|
||||
<element signature="e#24#42#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/op.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="3900">
|
||||
<caret line="260" column="35" selection-start-line="260" selection-start-column="35" selection-end-line="260" selection-end-column="35" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/bench.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="45">
|
||||
<caret line="3" column="15" selection-start-line="3" selection-end-line="4" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
</component>
|
||||
<component name="FileTemplateManagerImpl">
|
||||
<option name="RECENT_TEMPLATES">
|
||||
<list>
|
||||
<option value="Setup Script" />
|
||||
<option value="Python Script" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="FindInProjectRecents">
|
||||
<findStrings>
|
||||
<find>build_ir</find>
|
||||
<find>combined_instructions</find>
|
||||
<find>random</find>
|
||||
<find>serial</find>
|
||||
<find>IntegerLoopBenchmark</find>
|
||||
<find>latenchy</find>
|
||||
<find>iaca_markers</find>
|
||||
<find>setup_llvm</find>
|
||||
<find>get_iaca_analysis</find>
|
||||
<find>get_target_machine</find>
|
||||
<find>foo</find>
|
||||
<find>instructions_ret_type</find>
|
||||
<find>iaca</find>
|
||||
<find>get_registers</find>
|
||||
<find>sop_t</find>
|
||||
<find>AddressGenerationBenchmark</find>
|
||||
<find>lea</find>
|
||||
<find>,)</find>
|
||||
<find>prepare_arguments</find>
|
||||
<find>VSUBSSrr</find>
|
||||
<find>build_and_execute</find>
|
||||
<find>jit.</find>
|
||||
<find>fn</find>
|
||||
<find>asmjit</find>
|
||||
<find>ValueError</find>
|
||||
<find>split_llvm_vector_type</find>
|
||||
<find>get_default_init_values</find>
|
||||
<find>llvm</find>
|
||||
<find>self.init_val</find>
|
||||
<find>i64</find>
|
||||
</findStrings>
|
||||
<replaceStrings>
|
||||
<replace>generate_register_nameing</replace>
|
||||
<replace>naming</replace>
|
||||
<replace>iaca_marker</replace>
|
||||
<replace>jit.AddressGenerationBenchmark</replace>
|
||||
<replace>)</replace>
|
||||
<replace>oldjit.</replace>
|
||||
<replace>asmbench</replace>
|
||||
</replaceStrings>
|
||||
</component>
|
||||
<component name="Git.Settings">
|
||||
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
||||
</component>
|
||||
<component name="IdeDocumentHistory">
|
||||
<option name="CHANGED_PATHS">
|
||||
<list>
|
||||
<option value="$PROJECT_DIR$/debug_avx_feature.py" />
|
||||
<option value="$PROJECT_DIR$/op.py" />
|
||||
<option value="$PROJECT_DIR$/bench.py" />
|
||||
<option value="$PROJECT_DIR$/asmjit/__init__.py" />
|
||||
<option value="$PROJECT_DIR$/jit.py" />
|
||||
<option value="$PROJECT_DIR$/tablegen.py" />
|
||||
<option value="$PROJECT_DIR$/dev_test/reproduce.py" />
|
||||
<option value="$PROJECT_DIR$/asmjit/__main__.py" />
|
||||
<option value="$PROJECT_DIR$/asmjit/op.py" />
|
||||
<option value="$PROJECT_DIR$/README.md" />
|
||||
<option value="$PROJECT_DIR$/asmjit/bench.py" />
|
||||
<option value="$PROJECT_DIR$/run_SC18_SRC.py" />
|
||||
<option value="$PROJECT_DIR$/asmjit/sc18src.py" />
|
||||
<option value="$PROJECT_DIR$/README.md" />
|
||||
<option value="$PROJECT_DIR$/doc/sc18src_artifact_appendix.md" />
|
||||
<option value="$PROJECT_DIR$/README.rst" />
|
||||
<option value="$PROJECT_DIR$/MANIFEST.in" />
|
||||
<option value="$PROJECT_DIR$/setup.py" />
|
||||
<option value="$PROJECT_DIR$/setup.py" />
|
||||
<option value="$PROJECT_DIR$/asmbench/bench.py" />
|
||||
<option value="$PROJECT_DIR$/asmbench/op.py" />
|
||||
<option value="$APPLICATION_CONFIG_DIR$/scratches/scratch.py" />
|
||||
<option value="$PROJECT_DIR$/asmbench/streams.py" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="ProjectFrameBounds">
|
||||
<option name="x" value="971" />
|
||||
<option name="y" value="-1669" />
|
||||
<option name="width" value="1241" />
|
||||
<option name="height" value="1669" />
|
||||
</component>
|
||||
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
|
||||
<component name="ProjectView">
|
||||
<navigator proportions="" version="1">
|
||||
<foldersAlwaysOnTop value="true" />
|
||||
</navigator>
|
||||
<panes>
|
||||
<pane id="Scope" />
|
||||
<pane id="ProjectPane">
|
||||
<subPane>
|
||||
<expand>
|
||||
<path>
|
||||
<item name="asmbench" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="asmbench" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="asmbench" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="asmbench" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="asmbench" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
</expand>
|
||||
<select />
|
||||
</subPane>
|
||||
</pane>
|
||||
</panes>
|
||||
</component>
|
||||
<component name="PropertiesComponent">
|
||||
<property name="WebServerToolWindowFactoryState" value="false" />
|
||||
<property name="com.intellij.ide.scratch.LRUPopupBuilder$1/New Scratch File" value="Python" />
|
||||
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
||||
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
|
||||
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
|
||||
<property name="run.code.analysis.last.selected.profile" value="pProject Default" />
|
||||
<property name="settings.editor.selected.configurable" value="editor.preferences.completion" />
|
||||
</component>
|
||||
<component name="PyConsoleOptionsProvider">
|
||||
<option name="myPythonConsoleState">
|
||||
<console-settings is-module-sdk="true">
|
||||
<option name="myUseModuleSdk" value="true" />
|
||||
</console-settings>
|
||||
</option>
|
||||
</component>
|
||||
<component name="RecentsManager">
|
||||
<key name="MoveFile.RECENT_KEYS">
|
||||
<recent name="$PROJECT_DIR$/asmjit" />
|
||||
</key>
|
||||
</component>
|
||||
<component name="RunDashboard">
|
||||
<option name="ruleStates">
|
||||
<list>
|
||||
<RuleState>
|
||||
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
<RuleState>
|
||||
<option name="name" value="StatusDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="SvnConfiguration">
|
||||
<configuration />
|
||||
</component>
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="Default task">
|
||||
<changelist id="ce9d0a71-6676-44f6-88f0-52583274be24" name="Default" comment="" />
|
||||
<created>1528185911695</created>
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1528185911695</updated>
|
||||
<workItem from="1549014553562" duration="10478000" />
|
||||
<workItem from="1549470823118" duration="191000" />
|
||||
<workItem from="1549577395449" duration="719000" />
|
||||
<workItem from="1549629861489" duration="622000" />
|
||||
<workItem from="1549636051326" duration="400000" />
|
||||
<workItem from="1550675127118" duration="4866000" />
|
||||
<workItem from="1553613650758" duration="756000" />
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TimeTrackingManager">
|
||||
<option name="totallyTimeSpent" value="18032000" />
|
||||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="971" y="-1669" width="1241" height="1669" extended-state="0" />
|
||||
<editor active="true" />
|
||||
<layout>
|
||||
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25771475" />
|
||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||
<window_info id="Favorites" order="2" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Message" order="0" />
|
||||
<window_info anchor="bottom" id="Find" order="1" weight="0.32980832" />
|
||||
<window_info anchor="bottom" id="Run" order="2" />
|
||||
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
|
||||
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="6" />
|
||||
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
|
||||
<window_info anchor="bottom" id="Database Changes" order="8" show_stripe_button="false" />
|
||||
<window_info anchor="bottom" id="Terminal" order="9" />
|
||||
<window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Version Control" order="11" />
|
||||
<window_info anchor="bottom" id="Messages" order="12" />
|
||||
<window_info anchor="bottom" id="Python Console" order="13" />
|
||||
<window_info active="true" anchor="bottom" id="Inspection Results" order="14" visible="true" weight="0.32980832" />
|
||||
<window_info anchor="right" id="Commander" order="0" weight="0.4" />
|
||||
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
|
||||
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
|
||||
<window_info anchor="right" id="SciView" order="3" />
|
||||
<window_info anchor="right" id="Database" order="4" />
|
||||
</layout>
|
||||
<layout-to-restore>
|
||||
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.23436196" />
|
||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||
<window_info id="Favorites" order="2" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Message" order="0" />
|
||||
<window_info anchor="bottom" id="Find" order="1" weight="0.32980832" />
|
||||
<window_info anchor="bottom" id="Run" order="2" />
|
||||
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
|
||||
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="6" />
|
||||
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
|
||||
<window_info anchor="bottom" id="Version Control" order="8" />
|
||||
<window_info anchor="bottom" id="Database Changes" order="9" show_stripe_button="false" />
|
||||
<window_info anchor="bottom" id="Python Console" order="10" />
|
||||
<window_info anchor="bottom" id="Terminal" order="11" />
|
||||
<window_info anchor="bottom" id="Event Log" order="12" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Messages" order="13" />
|
||||
<window_info active="true" anchor="bottom" id="Inspection Results" order="14" visible="true" weight="0.32980832" />
|
||||
<window_info anchor="right" id="Commander" order="0" weight="0.4" />
|
||||
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
|
||||
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
|
||||
<window_info anchor="right" id="SciView" order="3" />
|
||||
<window_info anchor="right" id="Database" order="4" />
|
||||
</layout-to-restore>
|
||||
</component>
|
||||
<component name="TypeScriptGeneratedFilesManager">
|
||||
<option name="version" value="1" />
|
||||
</component>
|
||||
<component name="XDebuggerManager">
|
||||
<breakpoint-manager>
|
||||
<breakpoints>
|
||||
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
|
||||
<url>file://$PROJECT_DIR$/asmbench/bench.py</url>
|
||||
<line>1</line>
|
||||
<option name="timeStamp" value="3" />
|
||||
</line-breakpoint>
|
||||
</breakpoints>
|
||||
</breakpoint-manager>
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/debug_avx_feature.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="30">
|
||||
<caret line="2" selection-start-line="2" selection-end-line="2" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/dev_test/reproduce.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="705">
|
||||
<caret line="46" column="33" selection-start-line="46" selection-start-column="33" selection-end-line="46" selection-end-column="33" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_skylapesp2.txt" />
|
||||
<entry file="file://$PROJECT_DIR$/SC18_SRC_skylapesp2.txt">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="135">
|
||||
<caret line="9" column="43" lean-forward="true" selection-start-line="9" selection-start-column="43" selection-end-line="9" selection-end-column="43" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_naples1.txt" />
|
||||
<entry file="file://$PROJECT_DIR$/doc/sc18-src-poster_data/SC18_SRC_skylakesp2.txt" />
|
||||
<entry file="file://$PROJECT_DIR$/README.md" />
|
||||
<entry file="file://$PROJECT_DIR$/MANIFEST.in">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="15">
|
||||
<caret line="1" column="18" selection-start-line="1" selection-start-column="18" selection-end-line="1" selection-end-column="18" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/doc/sc18src_artifact_appendix.md">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="15">
|
||||
<caret line="1" lean-forward="true" selection-start-line="1" selection-end-line="1" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/README.rst">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="105">
|
||||
<caret line="7" selection-start-line="7" selection-end-line="7" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/setup.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="420">
|
||||
<caret line="28" lean-forward="true" selection-start-line="28" selection-end-line="28" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/build/lib/asmbench/bench.py" />
|
||||
<entry file="file://$PROJECT_DIR$/build/lib/asmjit/bench.py" />
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$APPLICATION_CONFIG_DIR$/scratches/scratch.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/__main__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="255">
|
||||
<caret line="21" column="21" selection-start-line="21" selection-start-column="21" selection-end-line="21" selection-end-column="21" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/run_SC18_SRC.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-2084">
|
||||
<caret line="2" column="16" selection-start-line="2" selection-start-column="16" selection-end-line="2" selection-end-column="16" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/oldjit.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="12135">
|
||||
<caret line="820" column="33" selection-start-line="820" selection-start-column="33" selection-end-line="820" selection-end-column="33" />
|
||||
<folding>
|
||||
<element signature="e#23#36#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/sc18src.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="120">
|
||||
<caret line="17" column="21" selection-start-line="17" selection-start-column="21" selection-end-line="17" selection-end-column="21" />
|
||||
<folding>
|
||||
<element signature="e#23#41#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/tablegen.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1350">
|
||||
<caret line="100" column="28" selection-start-line="100" selection-start-column="28" selection-end-line="100" selection-end-column="28" />
|
||||
<folding>
|
||||
<element signature="e#24#34#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/op.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="3900">
|
||||
<caret line="260" column="35" selection-start-line="260" selection-start-column="35" selection-end-line="260" selection-end-column="35" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/bench.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="45">
|
||||
<caret line="3" column="15" selection-start-line="3" selection-end-line="4" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/asmbench/streams.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1230">
|
||||
<caret line="82" lean-forward="true" selection-start-line="82" selection-end-line="82" />
|
||||
<folding>
|
||||
<element signature="e#24#42#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</component>
|
||||
<component name="masterDetails">
|
||||
<states>
|
||||
<state key="ScopeChooserConfigurable.UI">
|
||||
<settings>
|
||||
<splitter-proportions>
|
||||
<option name="proportions">
|
||||
<list>
|
||||
<option value="0.2" />
|
||||
</list>
|
||||
</option>
|
||||
</splitter-proportions>
|
||||
</settings>
|
||||
</state>
|
||||
</states>
|
||||
</component>
|
||||
</project>
|
@@ -1,261 +0,0 @@
|
||||
ADD32ri LAT 1.001 cy TP 0.293 cy
|
||||
ADD64ri32 LAT 1.001 cy TP 0.295 cy
|
||||
INC64r LAT 1.000 cy TP 0.314 cy
|
||||
MOV64ri32 LAT 0.535 cy TP 0.354 cy
|
||||
SUB32ri LAT 1.001 cy TP 0.330 cy
|
||||
VADDPDYrr LAT 4.002 cy TP 0.523 cy
|
||||
VADDSDrr LAT 4.002 cy TP 0.523 cy
|
||||
VADDSSrr LAT 4.002 cy TP 0.523 cy
|
||||
VCVTSI642SSrr LAT 2.001 cy TP 2.001 cy
|
||||
VFMADD213PDYr LAT 4.002 cy TP 0.523 cy
|
||||
VFMADD213PDr LAT 4.002 cy TP 0.523 cy
|
||||
VFMADD213PSYr LAT 4.002 cy TP 0.523 cy
|
||||
VFMADD213PSr LAT 4.002 cy TP 0.523 cy
|
||||
VFMADD213SDr LAT 4.002 cy TP 0.523 cy
|
||||
VFMADD213SSr LAT 4.002 cy TP 0.523 cy
|
||||
VINSERTF128rr LAT 3.001 cy TP 1.000 cy
|
||||
VMULPDYrr LAT 4.002 cy TP 0.523 cy
|
||||
VMULSDrr LAT 4.002 cy TP 0.523 cy
|
||||
VMULSSrr LAT 4.002 cy TP 0.523 cy
|
||||
VSUBSDrr LAT 4.002 cy TP 0.523 cy
|
||||
VSUBSSrr LAT 4.002 cy TP 0.523 cy
|
||||
lea_b LAT 0.600 cy TP 0.550 cy
|
||||
lea_b+off LAT 0.600 cy TP 0.550 cy
|
||||
lea_idx*w LAT 0.600 cy TP 0.550 cy
|
||||
lea_off+idx*w LAT 0.600 cy TP 0.550 cy
|
||||
lea_b+idx*w LAT 1.000 cy TP 0.601 cy
|
||||
lea_b+off+idx*w LAT 3.001 cy TP 1.000 cy
|
||||
LD_linear LAT 2.006 cy TP 0.502 cy
|
||||
LD_random LAT 2.006 cy TP 0.502 cy
|
||||
ADD32ri ADD32ri LAT 1.086 cy TP 0.614 cy SPM 1.09
|
||||
ADD32ri ADD64ri32 LAT 1.086 cy TP 0.614 cy SPM 1.09
|
||||
ADD32ri INC64r LAT 1.086 cy TP 0.629 cy SPM 1.08
|
||||
ADD32ri MOV64ri32 LAT 1.000 cy TP 0.603 cy SPM 0.85
|
||||
ADD32ri SUB32ri LAT 1.086 cy TP 0.614 cy SPM 0.97
|
||||
ADD32ri VADDPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VADDSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VADDSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VCVTSI642SSrr LAT 2.001 cy TP 2.001 cy SPM 0.00
|
||||
ADD32ri VFMADD213PDYr LAT 4.002 cy TP 0.581 cy SPM 0.20
|
||||
ADD32ri VFMADD213PDr LAT 4.002 cy TP 0.582 cy SPM 0.20
|
||||
ADD32ri VFMADD213PSYr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VFMADD213PSr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VFMADD213SDr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VFMADD213SSr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VINSERTF128rr LAT 3.001 cy TP 1.000 cy SPM -0.00
|
||||
ADD32ri VMULPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VMULSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VMULSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VSUBSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD32ri VSUBSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 ADD64ri32 LAT 1.086 cy TP 0.611 cy SPM 1.07
|
||||
ADD64ri32 INC64r LAT 1.086 cy TP 0.605 cy SPM 0.99
|
||||
ADD64ri32 MOV64ri32 LAT 1.000 cy TP 0.578 cy SPM 0.76
|
||||
ADD64ri32 SUB32ri LAT 1.086 cy TP 0.611 cy SPM 0.95
|
||||
ADD64ri32 VADDPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VADDSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VADDSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VCVTSI642SSrr LAT 2.001 cy TP 1.000 cy SPM -3.39
|
||||
ADD64ri32 VFMADD213PDYr LAT 4.002 cy TP 0.581 cy SPM 0.20
|
||||
ADD64ri32 VFMADD213PDr LAT 4.002 cy TP 0.581 cy SPM 0.20
|
||||
ADD64ri32 VFMADD213PSYr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VFMADD213PSr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VFMADD213SDr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VFMADD213SSr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VINSERTF128rr LAT 3.002 cy TP 1.001 cy SPM 0.00
|
||||
ADD64ri32 VMULPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VMULSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VMULSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VSUBSDrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
ADD64ri32 VSUBSSrr LAT 4.002 cy TP 0.592 cy SPM 0.23
|
||||
INC64r INC64r LAT 1.086 cy TP 0.611 cy SPM 0.95
|
||||
INC64r MOV64ri32 LAT 1.000 cy TP 0.588 cy SPM 0.74
|
||||
INC64r SUB32ri LAT 1.086 cy TP 0.609 cy SPM 0.89
|
||||
INC64r VADDPDYrr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VADDSDrr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VADDSSrr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VCVTSI642SSrr LAT 2.001 cy TP 1.000 cy SPM -3.19
|
||||
INC64r VFMADD213PDYr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VFMADD213PDr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VFMADD213PSYr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VFMADD213PSr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VFMADD213SDr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VFMADD213SSr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VINSERTF128rr LAT 3.001 cy TP 1.000 cy SPM 0.00
|
||||
INC64r VMULPDYrr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VMULSDrr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VMULSSrr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VSUBSDrr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
INC64r VSUBSSrr LAT 4.002 cy TP 0.564 cy SPM 0.13
|
||||
MOV64ri32 MOV64ri32 LAT 0.657 cy TP 0.578 cy SPM 0.63
|
||||
MOV64ri32 SUB32ri LAT 1.000 cy TP 0.578 cy SPM 0.68
|
||||
MOV64ri32 VADDPDYrr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VADDSDrr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VADDSSrr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VCVTSI642SSrr LAT 2.001 cy TP 1.001 cy SPM -2.83
|
||||
MOV64ri32 VFMADD213PDYr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VFMADD213PDr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VFMADD213PSYr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VFMADD213PSr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VFMADD213SDr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VFMADD213SSr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VINSERTF128rr LAT 3.001 cy TP 1.001 cy SPM 0.00
|
||||
MOV64ri32 VMULPDYrr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VMULSDrr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VMULSSrr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VSUBSDrr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
MOV64ri32 VSUBSSrr LAT 4.002 cy TP 0.557 cy SPM 0.10
|
||||
SUB32ri SUB32ri LAT 1.086 cy TP 0.611 cy SPM 0.85
|
||||
SUB32ri VADDPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VADDSDrr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VADDSSrr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VCVTSI642SSrr LAT 2.001 cy TP 1.000 cy SPM -3.03
|
||||
SUB32ri VFMADD213PDYr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VFMADD213PDr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VFMADD213PSYr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VFMADD213PSr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VFMADD213SDr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VFMADD213SSr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VINSERTF128rr LAT 3.001 cy TP 1.000 cy SPM -0.00
|
||||
SUB32ri VMULPDYrr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VMULSDrr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VMULSSrr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VSUBSDrr LAT 4.002 cy TP 0.592 cy SPM 0.21
|
||||
SUB32ri VSUBSSrr LAT 4.002 cy TP 0.961 cy SPM 1.33
|
||||
VADDPDYrr VADDPDYrr LAT 4.002 cy TP 1.036 cy SPM 0.98
|
||||
VADDPDYrr VADDSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VADDPDYrr VADDSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VADDPDYrr VCVTSI642SSrr LAT 4.002 cy TP 2.001 cy SPM -0.00
|
||||
VADDPDYrr VFMADD213PDYr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDPDYrr VFMADD213PDr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDPDYrr VFMADD213PSYr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDPDYrr VFMADD213PSr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDPDYrr VFMADD213SDr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDPDYrr VFMADD213SSr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDPDYrr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VADDPDYrr VMULPDYrr LAT 4.002 cy TP 1.036 cy SPM 0.98
|
||||
VADDPDYrr VMULSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VADDPDYrr VMULSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VADDPDYrr VSUBSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VADDPDYrr VSUBSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VADDSDrr VADDSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VADDSDrr VADDSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VADDSDrr VCVTSI642SSrr LAT 4.002 cy TP 2.001 cy SPM 0.00
|
||||
VADDSDrr VFMADD213PDYr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSDrr VFMADD213PDr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSDrr VFMADD213PSYr LAT 4.002 cy TP 1.030 cy SPM 0.97
|
||||
VADDSDrr VFMADD213PSr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSDrr VFMADD213SDr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSDrr VFMADD213SSr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSDrr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VADDSDrr VMULPDYrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VADDSDrr VMULSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VADDSDrr VMULSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VADDSDrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VADDSDrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VADDSSrr VADDSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VADDSSrr VCVTSI642SSrr LAT 4.002 cy TP 2.001 cy SPM 0.00
|
||||
VADDSSrr VFMADD213PDYr LAT 4.002 cy TP 1.030 cy SPM 0.97
|
||||
VADDSSrr VFMADD213PDr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSSrr VFMADD213PSYr LAT 4.002 cy TP 1.030 cy SPM 0.97
|
||||
VADDSSrr VFMADD213PSr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSSrr VFMADD213SDr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSSrr VFMADD213SSr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VADDSSrr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VADDSSrr VMULPDYrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VADDSSrr VMULSDrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VADDSSrr VMULSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VADDSSrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VADDSSrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VCVTSI642SSrr VCVTSI642SSrr LAT 4.002 cy TP 4.002 cy SPM 1.00
|
||||
VCVTSI642SSrr VFMADD213PDYr LAT 4.002 cy TP 2.001 cy SPM -0.00
|
||||
VCVTSI642SSrr VFMADD213PDr LAT 4.002 cy TP 2.001 cy SPM -0.00
|
||||
VCVTSI642SSrr VFMADD213PSYr LAT 4.002 cy TP 2.001 cy SPM -0.00
|
||||
VCVTSI642SSrr VFMADD213PSr LAT 4.002 cy TP 2.001 cy SPM 0.00
|
||||
VCVTSI642SSrr VFMADD213SDr LAT 4.002 cy TP 2.001 cy SPM -0.00
|
||||
VCVTSI642SSrr VFMADD213SSr LAT 4.002 cy TP 2.147 cy SPM 0.28
|
||||
VCVTSI642SSrr VINSERTF128rr LAT 3.002 cy TP 3.001 cy SPM 1.00
|
||||
VCVTSI642SSrr VMULPDYrr LAT 4.002 cy TP 2.001 cy SPM -0.00
|
||||
VCVTSI642SSrr VMULSDrr LAT 4.002 cy TP 2.001 cy SPM 0.00
|
||||
VCVTSI642SSrr VMULSSrr LAT 4.002 cy TP 2.001 cy SPM -0.00
|
||||
VCVTSI642SSrr VSUBSDrr LAT 4.002 cy TP 2.001 cy SPM 0.00
|
||||
VCVTSI642SSrr VSUBSSrr LAT 4.002 cy TP 2.001 cy SPM -0.00
|
||||
VFMADD213PDYr VFMADD213PDYr LAT 4.002 cy TP 1.047 cy SPM 1.00
|
||||
VFMADD213PDYr VFMADD213PDr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VFMADD213PDYr VFMADD213PSYr LAT 4.002 cy TP 1.047 cy SPM 1.00
|
||||
VFMADD213PDYr VFMADD213PSr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VFMADD213PDYr VFMADD213SDr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VFMADD213PDYr VFMADD213SSr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VFMADD213PDYr VINSERTF128rr LAT 4.002 cy TP 1.001 cy SPM 0.00
|
||||
VFMADD213PDYr VMULPDYrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PDYr VMULSDrr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VFMADD213PDYr VMULSSrr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VFMADD213PDYr VSUBSDrr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VFMADD213PDYr VSUBSSrr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VFMADD213PDr VFMADD213PDr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213PDr VFMADD213PSYr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VFMADD213PDr VFMADD213PSr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213PDr VFMADD213SDr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213PDr VFMADD213SSr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213PDr VINSERTF128rr LAT 4.002 cy TP 0.675 cy SPM -0.62
|
||||
VFMADD213PDr VMULPDYrr LAT 4.002 cy TP 1.026 cy SPM 0.96
|
||||
VFMADD213PDr VMULSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PDr VMULSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PDr VSUBSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PDr VSUBSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PSYr VFMADD213PSYr LAT 4.002 cy TP 1.047 cy SPM 1.00
|
||||
VFMADD213PSYr VFMADD213PSr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VFMADD213PSYr VFMADD213SDr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VFMADD213PSYr VFMADD213SSr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VFMADD213PSYr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VFMADD213PSYr VMULPDYrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PSYr VMULSDrr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VFMADD213PSYr VMULSSrr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VFMADD213PSYr VSUBSDrr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VFMADD213PSYr VSUBSSrr LAT 4.002 cy TP 1.029 cy SPM 0.97
|
||||
VFMADD213PSr VFMADD213PSr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213PSr VFMADD213SDr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213PSr VFMADD213SSr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213PSr VINSERTF128rr LAT 4.002 cy TP 0.675 cy SPM -0.62
|
||||
VFMADD213PSr VMULPDYrr LAT 4.002 cy TP 1.026 cy SPM 0.96
|
||||
VFMADD213PSr VMULSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PSr VMULSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PSr VSUBSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213PSr VSUBSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213SDr VFMADD213SDr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213SDr VFMADD213SSr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213SDr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VFMADD213SDr VMULPDYrr LAT 4.002 cy TP 1.026 cy SPM 0.96
|
||||
VFMADD213SDr VMULSDrr LAT 4.002 cy TP 1.156 cy SPM 1.21
|
||||
VFMADD213SDr VMULSSrr LAT 4.002 cy TP 1.156 cy SPM 1.21
|
||||
VFMADD213SDr VSUBSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213SDr VSUBSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213SSr VFMADD213SSr LAT 4.002 cy TP 1.046 cy SPM 1.00
|
||||
VFMADD213SSr VINSERTF128rr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VFMADD213SSr VMULPDYrr LAT 4.002 cy TP 1.026 cy SPM 0.96
|
||||
VFMADD213SSr VMULSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213SSr VMULSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213SSr VSUBSDrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VFMADD213SSr VSUBSSrr LAT 4.002 cy TP 1.028 cy SPM 0.97
|
||||
VINSERTF128rr VINSERTF128rr LAT 3.001 cy TP 2.001 cy SPM 1.00
|
||||
VINSERTF128rr VMULPDYrr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VINSERTF128rr VMULSDrr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VINSERTF128rr VMULSSrr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VINSERTF128rr VSUBSDrr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VINSERTF128rr VSUBSSrr LAT 4.002 cy TP 1.000 cy SPM -0.00
|
||||
VMULPDYrr VMULPDYrr LAT 4.002 cy TP 1.036 cy SPM 0.98
|
||||
VMULPDYrr VMULSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VMULPDYrr VMULSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VMULPDYrr VSUBSDrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VMULPDYrr VSUBSSrr LAT 4.002 cy TP 1.045 cy SPM 1.00
|
||||
VMULSDrr VMULSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VMULSDrr VMULSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VMULSDrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VMULSDrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VMULSSrr VMULSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VMULSSrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VMULSSrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VSUBSDrr VSUBSDrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
VSUBSDrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.98
|
||||
VSUBSSrr VSUBSSrr LAT 4.002 cy TP 1.038 cy SPM 0.99
|
||||
[1;34m[likwid-pin] Main PID -> core 0 - OK[0m
|
Binary file not shown.
@@ -1,20 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>CFBundleDevelopmentRegion</key>
|
||||
<string>English</string>
|
||||
<key>CFBundleIdentifier</key>
|
||||
<string>com.apple.xcode.dsym.a.out</string>
|
||||
<key>CFBundleInfoDictionaryVersion</key>
|
||||
<string>6.0</string>
|
||||
<key>CFBundlePackageType</key>
|
||||
<string>dSYM</string>
|
||||
<key>CFBundleSignature</key>
|
||||
<string>????</string>
|
||||
<key>CFBundleShortVersionString</key>
|
||||
<string>1.0</string>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>1</string>
|
||||
</dict>
|
||||
</plist>
|
Binary file not shown.
@@ -1,29 +0,0 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: asmbench
|
||||
Version: 0.1.4
|
||||
Summary: A Benchmark Toolkit for Assembly Instructions Using the LLVM JIT
|
||||
Home-page: https://github.com/RRZE-HPC/asmbench
|
||||
Author: Julian Hammer
|
||||
Author-email: julian.hammer@fau.de
|
||||
License: AGPLv3
|
||||
Description: asmbench
|
||||
========
|
||||
|
||||
A benchmark toolkit for assembly instructions using the LLVM JIT.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
To benchmark latency and throughput of a 64bit integer add use the following command:
|
||||
|
||||
``python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}'``
|
||||
|
||||
To benchmark two instructions interleaved use this:
|
||||
|
||||
``python -m asmbench 'add {src:i64:r}, {srcdst:i64:r}' 'sub {src:i64:r}, {srcdst:i64:r}'``
|
||||
|
||||
To find out more add `-h` for help and `-v` for verbose mode.
|
||||
|
||||
Platform: UNKNOWN
|
||||
Provides-Extra: iaca
|
||||
Provides-Extra: sc18src
|
@@ -1,17 +0,0 @@
|
||||
LICENSE
|
||||
MANIFEST.in
|
||||
README.rst
|
||||
setup.py
|
||||
asmbench/__init__.py
|
||||
asmbench/__main__.py
|
||||
asmbench/bench.py
|
||||
asmbench/oldjit.py
|
||||
asmbench/op.py
|
||||
asmbench/sc18src.py
|
||||
asmbench/streams.py
|
||||
asmbench.egg-info/PKG-INFO
|
||||
asmbench.egg-info/SOURCES.txt
|
||||
asmbench.egg-info/dependency_links.txt
|
||||
asmbench.egg-info/entry_points.txt
|
||||
asmbench.egg-info/requires.txt
|
||||
asmbench.egg-info/top_level.txt
|
@@ -1 +0,0 @@
|
||||
|
@@ -1,3 +0,0 @@
|
||||
[console_scripts]
|
||||
asmbench = asmbench.__main__:main
|
||||
|
@@ -1,9 +0,0 @@
|
||||
llvmlite>=0.23.2
|
||||
psutil
|
||||
|
||||
[iaca]
|
||||
kerncraft
|
||||
|
||||
[sc18src]
|
||||
numpy
|
||||
matplotlib
|
@@ -1 +0,0 @@
|
||||
asmbench
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -12,9 +12,9 @@ import sys
|
||||
import llvmlite.binding as llvm
|
||||
import psutil
|
||||
try:
|
||||
from kerncraft import incode_model
|
||||
from kerncraft import iaca
|
||||
except ImportError:
|
||||
incode_model = None
|
||||
iaca = None
|
||||
|
||||
from . import op
|
||||
|
||||
@@ -87,13 +87,13 @@ class Benchmark:
|
||||
|
||||
def get_iaca_analysis(self, arch):
|
||||
"""Compile and return IACA analysis."""
|
||||
if incode_model is None:
|
||||
if iaca is None:
|
||||
raise ValueError("kerncraft not installed. IACA analysis is not supported.")
|
||||
tm = self.get_target_machine()
|
||||
tmpf = tempfile.NamedTemporaryFile("wb")
|
||||
tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
|
||||
tmpf.flush()
|
||||
return incode_model.iaca_analyse_instrumented_binary(tmpf.name, arch)
|
||||
return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)
|
||||
|
||||
def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
|
||||
# Compile the module to machine code using MCJIT
|
||||
|
@@ -1,82 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import collections
|
||||
import itertools
|
||||
import socket
|
||||
import textwrap
|
||||
|
||||
import numpy
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib as mpl
|
||||
|
||||
from asmbench import op, bench
|
||||
from asmbench import oldjit
|
||||
|
||||
|
||||
# Size in bytes of one element, keyed by the element type name.
type_size = dict([
    ('i32', 4),
    ('i64', 8),
    ('f32', 4),
    ('float', 4),
    ('f64', 8),
    ('double', 8),
])
|
||||
|
||||
|
||||
class StreamsBenchmark(bench.Benchmark):
    """Benchmark skeleton for kernels working on read, read-write and write streams.

    The stream configuration is only stored on the instance; the generated IR
    currently contains an empty loop body and no pointer arguments.
    """

    def __init__(self,
                 read_streams=0, read_write_streams=0, write_streams=0,
                 stream_byte_length=0,
                 element_type='i64',
                 **kwargs):
        """Store the stream configuration.

        :param read_streams: number of read-only streams
        :param read_write_streams: number of streams that are read and written
        :param write_streams: number of write-only streams
        :param stream_byte_length: length of each stream in bytes
        :param element_type: element type name of the streams
        :param kwargs: forwarded to ``bench.Benchmark`` (e.g. ``frequency``), so the
                       benchmark can be built when the CPU frequency is not detectable
        """
        super().__init__(**kwargs)
        self.read_streams = read_streams
        self.read_write_streams = read_write_streams
        self.write_streams = write_streams
        self.stream_byte_length = stream_byte_length
        self.element_type = element_type

    def build_ir(self, iaca_marker=False):
        """Return the LLVM IR of the ``test`` function as a string.

        :param iaca_marker: if true, wrap the loop body in the inline-assembly
                            start and stop marker sequences
        """
        if iaca_marker:
            iaca_start_marker = textwrap.dedent('''\
                call void asm "movl $$111,%ebx", ""()
                call void asm ".byte 100,103,144", ""()''')
            iaca_stop_marker = textwrap.dedent('''\
                call void asm "movl $$222,%ebx", ""()
                call void asm ".byte 100,103,144", ""()''')
        else:
            iaca_start_marker = ''
            iaca_stop_marker = ''

        # Counting loop from 0 to N; the function returns the last counter value
        # (or 0 if the loop is never entered).
        ir = textwrap.dedent('''\
            define i64 @"test"(i64 %"N"{pointer_arguments})
            {{
            entry:
            %"loop_cond" = icmp slt i64 0, %"N"
            br i1 %"loop_cond", label %"loop", label %"end"

            loop:
            %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {iaca_start_marker}
            {loop_body}
            %"loop_counter.1" = add i64 %"loop_counter", 1
            %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
            br i1 %"loop_cond.1", label %"loop", label %"end"

            end:
            %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
            {iaca_stop_marker}
            ret i64 %"ret"
            }}
            ''').format(
                pointer_arguments='',
                loop_body='',
                iaca_start_marker=iaca_start_marker,
                iaca_stop_marker=iaca_stop_marker)

        return ir
|
||||
|
||||
if __name__ == '__main__':
    # Smoke run: JIT-compile the (empty) streams kernel and print the raw measurement.
    bench.setup_llvm()
    sb = StreamsBenchmark()
    print(sb.build_and_execute())
|
||||
|
@@ -1 +0,0 @@
|
||||
__version__ = '0.1.4'
|
@@ -1,48 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
|
||||
import psutil
|
||||
import llvmlite.binding as llvm
|
||||
|
||||
from . import op, bench
|
||||
|
||||
|
||||
def main():
    """Command line entry point.

    Parses the instruction declarations and benchmark options, runs the latency
    and throughput benchmarks via ``bench.bench_instructions`` and prints both
    results in cycles.
    """
    parser = argparse.ArgumentParser(description='Assembly Instruction Benchmark Toolkit')
    # parser.add_argument('mode', metavar='MODE', type=str, choices=['latency', 'throughput'])
    parser.add_argument('instructions', metavar='INSTR', type=op.Instruction.from_string, nargs='+',
                        help='instruction declaration, e.g., "add {src:i32:r}, {srcdst:i32:r}"')
    parser.add_argument('--serialize', action='store_true',
                        help='Serialize instructions.')
    parser.add_argument('--latency-serial', '-l', type=int, default=8,
                        help='length of serial chain for each instruction in latency benchmark')
    parser.add_argument('--parallel', '-p', type=int, default=10,
                        help='number of parallel instances of serial chains in throughput '
                             'benchmark')
    parser.add_argument('--throughput-serial', '-t', type=int, default=8,
                        help='length of serial instances of serial chains in throughput benchmark')
    parser.add_argument('--iaca', type=str, default=None,
                        help='Compare throughput measurement with IACA analysis, pass '
                             'micro-architecture abbreviation. (i.e. SNB, IVB, HSW, SKL, SKX)')
    parser.add_argument("--verbose", "-v", action="count", default=0,
                        help="increase output verbosity")
    # The frequency becomes mandatory when psutil cannot report one.
    parser.add_argument('-f', '--frequency', type=float, required=psutil.cpu_freq() is None,
                        help='Provided (in GHz), if psutil.cpu_freq() does not report anything.')
    args = parser.parse_args()
    if args.frequency:
        # Command line value is in GHz, the benchmark expects Hz.
        args.frequency *= 1e9

    bench.setup_llvm()
    lat, tp = bench.bench_instructions(args.instructions,
                                       serial_factor=args.latency_serial,
                                       parallel_factor=args.parallel,
                                       throughput_serial_factor=args.throughput_serial,
                                       serialize=args.serialize,
                                       verbosity=args.verbose,
                                       iaca_comparison=args.iaca,
                                       frequency=args.frequency)
    print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))
|
||||
|
||||
|
||||
# Run the command line interface when this module is executed as a script.
if __name__ == "__main__":
    main()
|
@@ -1,399 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import ctypes
|
||||
import time
|
||||
import textwrap
|
||||
import itertools
|
||||
import re
|
||||
from pprint import pprint
|
||||
import tempfile
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import llvmlite.binding as llvm
|
||||
import psutil
|
||||
try:
|
||||
from kerncraft import iaca
|
||||
except ImportError:
|
||||
iaca = None
|
||||
|
||||
from . import op
|
||||
|
||||
|
||||
def setup_llvm():
    """Run the llvmlite initialization calls (core, native target, asm printer and parser)."""
    initializers = (
        llvm.initialize,
        llvm.initialize_native_target,
        llvm.initialize_native_asmprinter,
        llvm.initialize_native_asmparser,
    )
    for initialize in initializers:
        initialize()
|
||||
|
||||
|
||||
def uniquify(l):
    """Return the items of ``l`` as a list without duplicates, keeping first-seen order."""
    already_seen = set()
    unique_items = []
    for item in l:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items
|
||||
|
||||
|
||||
class Benchmark:
    """Base class for benchmarks that are JIT-compiled from LLVM IR and timed.

    Subclasses provide the IR through :meth:`build_ir`; the IR must define a
    function named ``test`` matching :meth:`get_function_ctype`.
    """

    def __init__(self, frequency=None):
        """
        :param frequency: clock frequency in Hz used to convert runtimes to cycles;
                          if omitted, the maximum frequency reported by psutil is used
        :raises ValueError: if no frequency is given and psutil cannot report one
        """
        if not frequency:
            cpu_freq = psutil.cpu_freq()
            if cpu_freq is None:
                raise ValueError(
                    "CPU frequency can not be determined by psutil, pass frequency explicitly.")
            # psutil reports MHz
            frequency = cpu_freq.max * 1e6
        self.frequency = frequency

    def __repr__(self):
        # List all public attributes as keyword arguments.
        return '{}({})'.format(
            self.__class__.__name__,
            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
                       if not k.startswith('_')]))

    @staticmethod
    def prepare_arguments(previous_args=None, time_factor=1.0):
        """Build argument tuple, to be passed to low level function.

        Without ``previous_args`` the default iteration count is returned, otherwise
        the previous count is scaled by ``time_factor`` (or by 10 if the scaled value
        is not representable as an integer, e.g. for an infinite factor).
        """
        if previous_args is None:
            return 10000000,
        else:
            try:
                return int(previous_args[0] * time_factor),
            except OverflowError:
                return previous_args[0]*10,

    @staticmethod
    def get_iterations(args) -> int:
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def build_ir(self, iaca_marker=False):
        """Return the LLVM IR as a string; must be implemented by subclasses.

        The signature accepts ``iaca_marker`` because :meth:`get_llvm_module`
        always passes it as a keyword argument.
        """
        raise NotImplementedError()

    def get_llvm_module(self, iaca_marker=False):
        """Build and return LLVM module from LLVM IR code."""
        ir = self.build_ir(iaca_marker=iaca_marker)
        return llvm.parse_assembly(ir)

    def get_target_machine(self):
        """Instantiate and return target machine."""
        features = llvm.get_host_cpu_features().flatten()
        cpu = ''  # llvm.get_host_cpu_name()  # Work around until ryzen problems are fixed
        return llvm.Target.from_default_triple().create_target_machine(
            cpu=cpu, features=features, opt=3)

    def get_assembly(self, iaca_marker=False):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
        asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
        # Remove double comments
        asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
        return asm

    def get_function_ctype(self):
        """Return the ctypes prototype of ``test``: one int64 argument, int64 result."""
        return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)

    def get_iaca_analysis(self, arch):
        """Compile and return IACA analysis.

        :param arch: micro-architecture abbreviation passed on to kerncraft
        :raises ValueError: if kerncraft is not installed
        """
        if iaca is None:
            raise ValueError("kerncraft not installed. IACA analysis is not supported.")
        tm = self.get_target_machine()
        # The context manager closes (and thereby removes) the temporary object file
        # once the analysis has finished.
        with tempfile.NamedTemporaryFile("wb") as tmpf:
            tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
            tmpf.flush()
            return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        """JIT-compile the module and time ``repeat`` executions of ``test``.

        During the first measurement the iteration count is rescaled until the
        runtime lies within ``[min_elapsed, max_elapsed]`` seconds; afterwards the
        arguments are kept fixed.

        :return: dict with keys ``iterations``, ``arguments``, ``runtimes``,
                 ``frequency`` and ``returned``
        :raises RuntimeError: if the return value is not ``args[0]-1`` or no
                              suitable runtime is reached within the retry limit
        """
        # Compile the module to machine code using MCJIT
        tm = self.get_target_machine()
        runtimes = []
        return_values = []
        args = self.prepare_arguments()
        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
            ee.finalize_object()

            # Obtain a pointer to the compiled 'test' function - it's the address of
            # its JITed code in memory.
            cfptr = ee.get_function_address('test')

            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self.get_function_ctype()(cfptr)

            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            fixed_args = False
            for _ in range(repeat):
                tries = 0
                while True:
                    if tries > 10:
                        raise RuntimeError("Unable to measure non-zero runtime.")
                    tries += 1
                    start = time.perf_counter()
                    ret = cfunc(*args)
                    end = time.perf_counter()
                    elapsed = end - start
                    if ret != args[0]-1:
                        raise RuntimeError(
                            "Return value {} is invalid, should have been {}.".format(
                                ret, args[0]-1))
                    if fixed_args or min_elapsed <= elapsed <= max_elapsed:
                        # After we have the right argument choice, we keep it.
                        fixed_args = True
                        break
                    target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
                    # A zero runtime must not crash the calibration: an infinite
                    # factor makes prepare_arguments fall back to its tenfold increase.
                    factor = target_elapsed / elapsed if elapsed > 0 else float('inf')
                    args = self.prepare_arguments(previous_args=args, time_factor=factor)
                return_values.append(ret)
                runtimes.append(elapsed)

        return {'iterations': self.get_iterations(args),
                'arguments': args,
                'runtimes': runtimes,
                'frequency': self.frequency,
                'returned': return_values}
|
||||
|
||||
|
||||
class LoopBenchmark(Benchmark):
    """Benchmark whose loop body is generated from a synthesis tree (``root_synth``).

    Provides the naming of loop inputs/outputs and the phi instructions that feed
    results of one iteration into the next.
    """

    def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True, **kwargs):
        """
        :param root_synth: object providing source/destination registers and IR
        :param init_values: initial value per source register; defaults to
                            ``root_synth.get_default_init_values()``
        :param loop_carried_dependencies: if false, no phi code is generated
        :param kwargs: forwarded to :class:`Benchmark`
        :raises ValueError: if the number of init values and source registers differ
        """
        super().__init__(**kwargs)
        self.root_synth = root_synth
        self.init_values = init_values or root_synth.get_default_init_values()
        self.loop_carried_dependencies = loop_carried_dependencies

        if len(root_synth.get_source_registers()) != len(self.init_values):
            raise ValueError("Number of init values and source registers do not match.")

    def get_source_names(self):
        """Return the IR value names (``%in.<i>``), one per source register."""
        return ['%in.{}'.format(i) for i in range(len(self.root_synth.get_source_registers()))]

    def get_destination_names(self):
        """Return the IR value names (``%out.<i>``), one per destination register."""
        return ['%out.{}'.format(i) for i in
                range(len(self.root_synth.get_destination_registers()))]

    def get_phi_code(self):
        """Return the phi instructions connecting loop outputs to loop inputs.

        Every input is paired with the next output of the same LLVM type (cycling
        through the outputs); inputs without a partner get a phi that always
        yields their init value.

        :raises ValueError: if not a single input could be paired with an output
        """
        if not self.loop_carried_dependencies:
            return ''
        # Change in naming (src <-> dst) is on purpose!
        srcs = self.root_synth.get_destination_registers()
        dsts = self.root_synth.get_source_registers()
        source_names = self.get_source_names()
        destination_names = self.get_destination_names()

        # Compile loop carried dependencies
        lcd = []
        # cycle iterator is used to not only reuse a single destination, but go through all of them
        srcs_it = itertools.cycle(enumerate(srcs))
        matched = False
        last_match_idx = len(srcs) - 1
        for dst_idx, dst in enumerate(dsts):
            for src_idx, src in srcs_it:
                if src.llvm_type == dst.llvm_type:
                    lcd.append([dst,
                                source_names[dst_idx],
                                self.init_values[dst_idx],
                                src,
                                destination_names[src_idx]])
                    matched = True
                    last_match_idx = src_idx
                    break
                # since srcs_it is an infinity iterator, we need to abort after a complete cycle
                if src_idx == last_match_idx:
                    break
        if not matched:
            raise ValueError("Unable to match source to any destination.")

        lines = []
        for dst_reg, dst_name, init_value, src_reg, src_name in lcd:
            assert dst_reg.llvm_type == src_reg.llvm_type, \
                "Source and destination types do not match"
            lines.append(('{dst_name} = phi {llvm_type} [{init_value}, %"entry"], '
                          '[{src_name}, %"loop"]\n').format(
                llvm_type=dst_reg.llvm_type,
                dst_name=dst_name,
                init_value=init_value,
                src_name=src_name))

        # Add extra phi for constant values. Assuming LLVM will optimize them "away"
        paired_dsts = [entry[0] for entry in lcd]
        for dst_idx, dst in enumerate(dsts):
            if dst not in paired_dsts:
                lines.append(('{dst_reg} = phi {llvm_type} [{init_value}, %"entry"], '
                              '[{init_value}, %"loop"]\n').format(
                    llvm_type=dst.llvm_type,
                    dst_reg=source_names[dst_idx],
                    init_value=self.init_values[dst_idx]))

        return ''.join(lines)

    def build_ir(self, iaca_marker=False):
        """Return the LLVM IR; must be implemented by subclasses.

        Accepts ``iaca_marker`` like the concrete implementations, since the base
        class passes it as a keyword argument when building the module.
        """
        raise NotImplementedError()
|
||||
|
||||
|
||||
class IntegerLoopBenchmark(LoopBenchmark):
    """Loop benchmark driven by an integer loop counter running from 0 to N."""

    def build_ir(self, iaca_marker=False):
        """Return the LLVM IR of the ``test`` function as a string.

        :param iaca_marker: if true, place the inline-assembly start marker in front
                            of the loop body and the stop marker in the ``end`` block
        """
        start_marker = ''
        stop_marker = ''
        if iaca_marker:
            start_marker = textwrap.dedent('''\
                call void asm "movl $$111,%ebx", ""()
                call void asm ".byte 100,103,144", ""()''')
            stop_marker = textwrap.dedent('''\
                call void asm "movl $$222,%ebx", ""()
                call void asm ".byte 100,103,144", ""()''')

        template = textwrap.dedent('''\
            define i64 @"test"(i64 %"N")
            {{
            entry:
            %"loop_cond" = icmp slt i64 0, %"N"
            br i1 %"loop_cond", label %"loop", label %"end"

            loop:
            %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {phi}
            {iaca_start_marker}
            {loop_body}
            %"loop_counter.1" = add i64 %"loop_counter", 1
            %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
            br i1 %"loop_cond.1", label %"loop", label %"end"

            end:
            %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
            {iaca_stop_marker}
            ret i64 %"ret"
            }}
            ''')

        # Body first, phi code second: same evaluation order as the keyword arguments.
        body_ir = self.root_synth.build_ir(self.get_destination_names(),
                                           self.get_source_names())
        indented_body = textwrap.indent(body_ir, ' ')
        indented_phi = textwrap.indent(self.get_phi_code(), ' ')

        return template.format(
            loop_body=indented_body,
            phi=indented_phi,
            iaca_start_marker=start_marker,
            iaca_stop_marker=stop_marker)
|
||||
|
||||
|
||||
def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
                       serialize=False, verbosity=0, iaca_comparison=None,
                       repeat=4, min_elapsed=0.1, max_elapsed=0.2, frequency=None):
    """Measure latency and throughput of the given instructions.

    :param instructions: list of instructions to benchmark
    :param serial_factor: length of the serial chain in the latency benchmark
    :param parallel_factor: number of parallel chain instances in the throughput benchmark
    :param throughput_serial_factor: length of each serial chain in the throughput benchmark
    :param serialize: if true, chain all instructions into one serial chain instead
                      of one chain per instruction
    :param verbosity: 0 is silent; higher values print results, assembly and IR
    :param iaca_comparison: micro-architecture abbreviation for an additional IACA
                            throughput analysis, or None
    :param repeat, min_elapsed, max_elapsed: passed to ``build_and_execute``
    :param frequency: clock frequency in Hz, passed to the benchmark
    :return: tuple ``(latency, throughput)`` in cycles; latency is NaN if the
             instructions can not be serialized
    """
    # Stays NaN if the latency benchmark is not possible, so that the return
    # statement does not fail on an unbound name.
    lat = float('nan')
    not_serializable = False
    try:
        # Latency Benchmark
        if verbosity > 0:
            print('## Latency Benchmark')
        p = op.Parallelized(_serialized_chains(instructions, serial_factor, serialize))
        b = IntegerLoopBenchmark(p, frequency=frequency)
        _print_benchmark_details(b, verbosity)

        result = b.build_and_execute(
            repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
        # min() over a list (not unpacked arguments) also works for repeat=1.
        lat = min([(t / serial_factor) * result['frequency'] / result['iterations']
                   for t in result['runtimes']])
        result['latency'] = lat
        if verbosity > 0:
            print('### Detailed Results')
            pprint(result)
            print()
    except op.NotSerializableError as e:
        print("Latency measurement not possible:", e)
        not_serializable = True

    if not_serializable:
        throughput_serial_factor = 1
        print("WARNING: throughput_serial_factor has been set to 1.")

    # Throughput Benchmark
    if verbosity > 0:
        print('## Throughput Benchmark')
    p_instrs = _serialized_chains(instructions, throughput_serial_factor, serialize)
    p = op.Parallelized(p_instrs * parallel_factor, interleave=True)
    b = IntegerLoopBenchmark(p, frequency=frequency)
    _print_benchmark_details(b, verbosity)
    result = b.build_and_execute(
        repeat=repeat, min_elapsed=min_elapsed, max_elapsed=max_elapsed)
    tp = min(
        [(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
         for t in result['runtimes']])
    result['throughput'] = tp
    if iaca_comparison is not None:
        iaca_analysis = b.get_iaca_analysis(iaca_comparison)
        result['iaca throughput'] = iaca_analysis['throughput']/(
            parallel_factor * throughput_serial_factor)
    if verbosity > 0:
        print('### Detailed Results')
        pprint(result)
        print()
    if verbosity > 1 and iaca_comparison is not None:
        print('### IACA Results')
        print(iaca_analysis['output'])
        print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
            throughput_serial_factor, parallel_factor))

    # Result compilation
    return lat, tp


def _serialized_chains(instructions, factor, serialize):
    """Return the serial chains: one per instruction, or a single combined one if serialize."""
    if serialize:
        return [op.Serialized(instructions * factor)]
    return [op.Serialized([i] * factor) for i in instructions]


def _print_benchmark_details(b, verbosity):
    """Print LLVM IR, assembly and IACA analysis of benchmark b, depending on verbosity."""
    if verbosity >= 3:
        print('### LLVM IR')
        print(b.build_ir())
    if verbosity >= 2:
        print('### Assembly')
        print(b.get_assembly())
    if verbosity >= 3:
        print('### IACA Analysis')
        try:
            print(b.get_iaca_analysis('SKL')['output'])
        except ValueError as e:
            print("Unable to perform IACA analysis (skipping): ", e)
        except FileNotFoundError as e:
            print("IACA binary not found by kerncraft. Run iaca_get to install.", e)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Demo: build a small tree of serialized and parallelized i64 add/sub
    # instructions and print the resulting LLVM IR and assembly.
    setup_llvm()

    def _i64_instruction(asm, second_operand_type, second_operand_value):
        """Build an i64 instruction with an 'r' destination, a '0' register source
        and a second source operand of the given type and value."""
        return op.Instruction(
            instruction=asm,
            destination_operand=op.Register('i64', 'r'),
            source_operands=[op.Register('i64', '0'),
                             second_operand_type('i64', second_operand_value)])

    i1 = _i64_instruction('add $2, $0', op.Immediate, '1')
    i2 = _i64_instruction('sub $2, $0', op.Immediate, '1')
    s = op.Serialized([i1, i2])
    i3 = _i64_instruction('add $2, $0', op.Register, 'r')
    i4 = _i64_instruction('sub $2, $0', op.Immediate, '23')
    i5 = _i64_instruction('add $2, $0', op.Immediate, '23')
    i6 = _i64_instruction('add $2, $0', op.Register, 'r')

    s1 = op.Serialized([i1, i2])
    s2 = op.Serialized([s1, i3])
    s3 = op.Serialized([i4, i5])
    p1 = op.Parallelized([i6, s2, s3])

    # Every source register starts with the value '1'.
    init_values = ['1'] * len(p1.get_source_registers())
    b = IntegerLoopBenchmark(p1, init_values)
    print(b.build_ir())
    print(b.get_assembly())
|
@@ -1,897 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import ctypes
|
||||
import sys
|
||||
import time
|
||||
import textwrap
|
||||
import itertools
|
||||
import random
|
||||
import collections
|
||||
import pprint
|
||||
import math
|
||||
import argparse
|
||||
|
||||
import llvmlite.binding as llvm
|
||||
import psutil
|
||||
|
||||
|
||||
# TODOs
|
||||
# * API to create test scenarios
|
||||
# * DSL?
|
||||
# * Test cases:
|
||||
# * Instructions:
|
||||
# * [x] arithmetics \w reg and/or imm.
|
||||
# * scalar
|
||||
# * packed
|
||||
# * [x] lea
|
||||
# * [x] LOAD / mov \w mem
|
||||
# * [TODO] STORE / mov to mem
|
||||
# * [x] Single Latency
|
||||
# * [x] Single Throughput
|
||||
# * [TODO] Combined Throughput
|
||||
# * [TODO] Random Throughput
|
||||
# * [TODO] Automated TP, Lat, #pipeline analysis
|
||||
# * [TODO] IACA marked binary output generation
|
||||
# * [TODO] Fuzzing algorithm
|
||||
# * [TODO] CLI
|
||||
# * C based timing routine? As an extension?
|
||||
# * make sanity checks during runtime, check for fixed frequency and pinning
|
||||
|
||||
def floor_harmonic_fraction(n, error=0.1):
    """
    Find the closest floored integer or inverse integer for *n* and return the error.

    For ``n >= 1`` the approximation is ``floor(n) / 1``. For ``0 < n < 1`` it is ``1 / i``
    with the smallest integer ``i >= 2`` for which ``1 / i <= n``.

    :param n: positive number to approximate
    :param error: currently unused; kept so existing callers stay valid
    :return: (numerator, denominator, relative error) where either numerator or denominator
             is exactly one
    :raises ValueError: if *n* is not a positive number (the search below would never
                        terminate for such values)
    """
    if not n > 0:
        raise ValueError("n must be a positive number, got {!r}.".format(n))

    floor_n = math.floor(n)
    if floor_n > 0:
        return floor_n, 1, 1 - floor_n / n

    # Start close to the answer instead of counting up from 2, then correct in both
    # directions so the result is exactly the one the plain linear search would give.
    i = max(2, math.ceil(1 / n))
    while (1 / i) > n:
        i += 1
    while i > 2 and (1 / (i - 1)) <= n:
        i -= 1

    return 1, i, 1 - (1 / i) / n
|
||||
|
||||
|
||||
class Benchmark:
    """
    Base class for micro-benchmarks that are expressed as LLVM IR and executed via MCJIT.

    The generated function ``test(N)`` runs ``self._loop_body`` inside a counted loop.
    Subclasses replace ``_loop_body`` (or override :meth:`get_ir`) and, if the low level
    signature differs, ``_function_ctype``, :meth:`prepare_arguments` and
    :meth:`get_iterations`.
    """

    def __init__(self, parallel=1, serial=5, frequency=None):
        """
        :param parallel: number of independent instruction chains
        :param serial: number of dependent instructions per chain
        :param frequency: clock frequency in Hz; if not given it is taken from
                          ``psutil.cpu_freq()``
        :raises ValueError: if no frequency is given and psutil can not report one
        """
        self._function_ctype = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
        self.parallel = parallel
        self.serial = serial
        if not frequency:
            cpu_freq = psutil.cpu_freq()
            if cpu_freq is None:
                raise ValueError(
                    "CPU frequency could not be detected, please pass frequency (in Hz).")
            # psutil reports MHz
            frequency = cpu_freq.current * 1e6
        self.frequency = frequency

        # Do interesting work
        self._loop_body = textwrap.dedent('''\
            %"checksum" = phi i64 [0, %"entry"], [%"checksum.1", %"loop"]
            %"checksum.1" = call i64 asm sideeffect "
                add $1, $0",
                "=r,i,r" (i64 1, i64 %"checksum")\
            ''')

    def __repr__(self):
        return '{}({})'.format(
            self.__class__.__name__,
            ', '.join(['{}={!r}'.format(k, v) for k, v in self.__dict__.items()
                       if not k.startswith('_')]))

    def get_ir(self):
        """Return the LLVM IR of the benchmark function as a string."""
        # FP add loop - may have issues
        # return textwrap.dedent('''\
        #     define i64 @"test"(i64 %"N")
        #     {{
        #     entry:
        #       %"N.fp" = sitofp i64 %"N" to double
        #       %"loop_cond" = fcmp olt double 0.0, %"N.fp"
        #       br i1 %"loop_cond", label %"loop", label %"end"
        #
        #     loop:
        #       %"loop_counter" = phi double [0.0, %"entry"], [%"loop_counter.1", %"loop"]
        #     {loop_body}
        #       %"loop_counter.1" = fadd double %"loop_counter", 1.0
        #       %"loop_cond.1" = fcmp olt double %"loop_counter.1", %"N.fp"
        #       br i1 %"loop_cond.1", label %"loop", label %"end"
        #
        #     end:
        #       %"ret.fp" = phi double [0.0, %"entry"], [%"loop_counter", %"loop"]
        #       %"ret" = fptosi double %"ret.fp" to i64
        #       ret i64 %"ret"
        #     }}
        #     ''').format(
        #         loop_body=textwrap.indent(self._loop_body, ' '))
        return textwrap.dedent('''\
            define i64 @"test"(i64 %"N")
            {{
            entry:
              %"loop_cond" = icmp slt i64 0, %"N"
              br i1 %"loop_cond", label %"loop", label %"end"

            loop:
              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {loop_body}
              %"loop_counter.1" = add i64 %"loop_counter", 1
              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
              br i1 %"loop_cond.1", label %"loop", label %"end"

            end:
              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
              ret i64 %"ret"
            }}
            ''').format(
                loop_body=textwrap.indent(self._loop_body, ' '))

    def prepare_arguments(self, previous_args=None, time_factor=1.0):
        """
        Build argument tuple, to be passed to low level function.

        Without *previous_args* the initial guess is returned, otherwise the iteration count
        is scaled by *time_factor*. The count never drops below one, because a count of zero
        could not be scaled up again.
        """
        if previous_args is None:
            return 100,
        else:
            return max(1, int(previous_args[0] * time_factor)),

    def get_iterations(self, args):
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def get_llvm_module(self):
        """Build and return LLVM module from LLVM IR code."""
        if not hasattr(self, '_llvm_module'):
            self._llvm_module = llvm.parse_assembly(self.get_ir())
            self._llvm_module.verify()
        return self._llvm_module

    def get_target_machine(self):
        """Instantiate and return target machine."""
        # The cache attribute is _tm. Testing for _llvm_module here would skip the creation
        # whenever the module had been built first and then fail on the return below.
        if not hasattr(self, '_tm'):
            features = llvm.get_host_cpu_features().flatten()
            cpu = llvm.get_host_cpu_name()
            self._tm = llvm.Target.from_default_triple().create_target_machine(
                cpu=cpu, features=features, opt=1)
        return self._tm

    def get_assembly(self):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
        return tm.emit_assembly(self.get_llvm_module())

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        """
        JIT-compile the benchmark, calibrate its arguments and time it.

        During the first measurement the arguments are rescaled until one call takes between
        *min_elapsed* and *max_elapsed* seconds (or until rescaling no longer changes them).
        The arguments found are then reused for all *repeat* measurements.

        :return: dict with the keys 'iterations', 'arguments', 'runtimes' (list of seconds)
                 and 'frequency'
        """
        # Compile the module to machine code using MCJIT
        tm = self.get_target_machine()
        runtimes = []
        args = self.prepare_arguments()
        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
            ee.finalize_object()

            # Obtain a pointer to the compiled 'test' - it's the address of its JITed
            # code in memory.
            cfptr = ee.get_function_address('test')

            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self._function_ctype(cfptr)

            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            fixed_args = False
            for i in range(repeat):
                while True:
                    start = time.perf_counter()
                    cfunc(*args)
                    end = time.perf_counter()
                    elapsed = end - start
                    if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
                        # A zero reading would divide by zero, simply grow the workload then.
                        factor = target_elapsed / elapsed if elapsed > 0 else 2.0
                        new_args = self.prepare_arguments(previous_args=args,
                                                          time_factor=factor)
                        if new_args != args:
                            args = new_args
                            continue
                        # Rescaling has no effect anymore (e.g., a single iteration is
                        # already too slow), retrying would never terminate.
                    # After we have the right argument choice, we keep it.
                    fixed_args = True
                    break

                runtimes.append(elapsed)

        return {'iterations': self.get_iterations(args),
                'arguments': args,
                'runtimes': runtimes,
                'frequency': self.frequency}

    @classmethod
    def get_latency(cls, max_serial=6, print_table=False, **kwargs):
        """
        Measure cycles per instruction for serial = 1 .. max_serial-1 with parallel = 1.

        :param kwargs: passed on to the constructor of *cls*
        :return: floor_harmonic_fraction() result of the run with the fewest cycles
        :raises ValueError: if max_serial leaves nothing to measure
        """
        if max_serial <= 1:
            raise ValueError("max_serial must be greater than 1.")
        if print_table:
            print(' s |' + ''.join([' {:^5}'.format(i) for i in range(1, max_serial)]))
            print(' | ', end='')
        serial_runs = []
        for s in range(1, max_serial):
            m = cls(serial=s, parallel=1, **kwargs)
            r = m.build_and_execute(repeat=1)
            cy_per_it = min(r['runtimes']) * r['frequency'] / (
                r['iterations'] * m.parallel * m.serial)
            if print_table:
                print('{:.3f} '.format(cy_per_it), end='')
                sys.stdout.flush()

            serial_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))

        # Only rank by the measurement: benchmark objects are not orderable and would raise
        # a TypeError if two runs tie.
        best = min(serial_runs, key=lambda run: run[:2])[1]

        if print_table:
            print()
            print('LAT: {lat[0]}/{lat[1]}cy (min. error {lat[2]:.1%})'.format(lat=best))

        return best

    @classmethod
    def get_throughput(cls, max_serial=6, max_parallel=17, print_table=False, **kwargs):
        """
        Measure cycles per instruction for all serial = 1 .. max_serial-1 and
        parallel = 2 .. max_parallel-1 combinations.

        :param kwargs: passed on to the constructor of *cls*
        :return: floor_harmonic_fraction() result of the run with the fewest cycles
        :raises ValueError: if the given limits leave nothing to measure
        """
        if max_serial <= 1 or max_parallel <= 2:
            raise ValueError("max_serial must be greater than 1 and max_parallel greater than 2.")
        if print_table:
            print('s\\p |' + ''.join([' {:^5}'.format(i) for i in range(2, max_parallel)]))
        parallel_runs = []
        for s in range(1, max_serial):
            if print_table:
                print('{:>3} | '.format(s), end='')
            for p in range(2, max_parallel):
                m = cls(serial=s, parallel=p, **kwargs)
                r = m.build_and_execute(repeat=1)
                cy_per_it = min(r['runtimes']) * r['frequency'] / (
                    r['iterations'] * m.parallel * m.serial)
                if print_table:
                    print('{:.3f} '.format(cy_per_it), end='')
                    sys.stdout.flush()
                parallel_runs.append((cy_per_it, floor_harmonic_fraction(cy_per_it), m))
            if print_table:
                print()

        # See get_latency(): never let min() fall through to comparing benchmark objects.
        best = min(parallel_runs, key=lambda run: run[:2])[1]

        if print_table:
            print('TP: {tp[0]}/{tp[1]}cy (min. error {tp[2]:.1%});'.format(tp=best))

        return best
|
||||
|
||||
|
||||
class InstructionBenchmark(Benchmark):
    def __init__(self, instruction='addq $1, $0',
                 dst_operands=(),
                 dstsrc_operands=(('r', 'i64', '0'),),
                 src_operands=(('i', 'i64', '1'),),
                 parallel=10,
                 serial=4,
                 **kwargs):
        """
        Build LLVM IR for arithmetic instruction benchmark without memory references.

        Currently only one destination (dst) or combined destination and source (dstsrc)
        operand is allowed. The instruction's operands ($N) refer to the order of operands
        found in dst + dstsrc + src.

        Each operand is a tuple (constraint code, llvm type string, initial value), with
        constraint code 'i', 'r' or 'x'.

        :raises NotImplementedError: for operand combinations that are not supported
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial, **kwargs)
        self.instruction = instruction
        self.dst_operands = dst_operands
        self.dstsrc_operands = dstsrc_operands
        self.src_operands = src_operands
        self._loop_body = ''
        # NotImplemented is a comparison sentinel and can not be raised;
        # NotImplementedError is the exception.
        if len(dst_operands) + len(dstsrc_operands) != 1:
            raise NotImplementedError("Must have exactly one dst or dstsrc operand.")
        # Compare against the individual codes; a substring test on 'irx' would also let
        # '' or 'ir' pass.
        if not all([op[0] in ('i', 'r', 'x')
                    for op in itertools.chain(dst_operands, dstsrc_operands, src_operands)]):
            raise NotImplementedError(
                "This class only supports register and immediate operands.")

        # Part 1: PHI functions and initializations
        for i, dstsrc_op in enumerate(dstsrc_operands):
            # constraint code, llvm type string, initial value
            if dstsrc_op[0] in ('r', 'x'):
                # register operand
                for p in range(self.parallel):
                    self._loop_body += (
                        '%"dstsrc{index}_{p}" = phi {type} '
                        '[{initial}, %"entry"], [%"dstsrc{index}_{p}.out", %"loop"]\n').format(
                            index=i, type=dstsrc_op[1], initial=dstsrc_op[2], p=p)
            else:
                raise NotImplementedError(
                    "Operand type in {!r} is not yet supported.".format(dstsrc_op))

        # Part 2: Inline ASM call
        # Build constraint string from operands: outputs, inputs, and for every dstsrc
        # operand an input that is tied to the matching output by its index
        constraints = ','.join(
            ['=' + dop[0] for dop in itertools.chain(dst_operands, dstsrc_operands)] +
            [sop[0] for sop in itertools.chain(src_operands)] +
            ['{}'.format(i + len(dst_operands)) for i in range(len(dstsrc_operands))])

        for i, dstsrc_op in enumerate(dstsrc_operands):
            # Build instruction from instruction and operands
            # TODO support multiple dstsrc operands
            # TODO support dst and dstsrc operands at the same time
            for p in range(self.parallel):
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2])
                            for sop in src_operands]
                for j, dop in enumerate(dstsrc_operands):
                    operands.append(
                        '{type} %dstsrc{index}_{p}'.format(type=dop[1], index=j, p=p))
                args = ', '.join(operands)

                self._loop_body += (
                    '%"dstsrc{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n').format(
                        index=i,
                        dst_type=dstsrc_op[1],
                        # the dependency chain is formed by repeating the instruction
                        instruction='\n'.join([instruction] * self.serial),
                        constraints=constraints,
                        args=args,
                        p=p)

        for i, dst_op in enumerate(dst_operands):
            # Build instruction from instruction and operands
            # TODO support multiple dst operands
            # TODO support dst and dstsrc operands at the same time
            if self.serial != 1:
                raise NotImplementedError("Serial > 1 and dst operand is not supported.")
            for p in range(self.parallel):
                operands = ['{type} {val}'.format(type=sop[1], val=sop[2])
                            for sop in src_operands]
                args = ', '.join(operands)

                self._loop_body += (
                    '%"dst{index}_{p}.out" = call {dst_type} asm sideeffect'
                    ' "{instruction}", "{constraints}" ({args})\n').format(
                        index=i,
                        dst_type=dst_op[1],
                        instruction=instruction,
                        constraints=constraints,
                        args=args,
                        p=p)
|
||||
|
||||
|
||||
class AddressGenerationBenchmark(Benchmark):
    def __init__(self,
                 offset=('i', 'i64', '0x42'),
                 base=('r', 'i64', '0'),
                 index=('r', 'i64', '0'),
                 width=('i', None, '4'),
                 destination='base',
                 parallel=10,
                 serial=4,
                 **kwargs):
        """
        Benchmark for address generation modes.

        Arguments may be None or (arg_type, reg_type, initial_value), with arg_type 'r'
        (register) or 'i' (immediate) and initial_value a string.
        E.g., ('r', 'i64', '0') or ('i', None, '4')

        +--------------------------------+-----------------------------+
        | Mode                           | AT&T                        |
        +--------------------------------+-----------------------------+
        | Offset                         | leal 0x0100, %eax           | <- no latency support
        | Base                           | leal (%esi), %eax           |
        | Offset + Base                  | leal -8(%ebp), %eax         |
        | Offset + Index*Width           | leal 0x100(,%ebx,4), %eax   |
        | Offset + Base + Index*Width    | leal 0x8(%edx,%ebx,4), %eax |
        +--------------------------------+-----------------------------+
        OFFSET(BASE, INDEX, WIDTH) -> offset + base + index*width
        offset: immediate integer (+/-)
        base: register
        index: register
        width: immediate 1,2,4 or 8

        *destination* ('base' or 'index') selects the register that receives the result and
        thereby carries the dependency from one lea to the next.

        :raises ValueError: if the combination of arguments is not supported
        """
        Benchmark.__init__(self, parallel=parallel, serial=serial, **kwargs)
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width
        self.destination = destination
        # Sanity checks:
        if bool(index) ^ bool(width):
            raise ValueError("Index and width both need to be set, or be None.")
        elif index and width:
            if width[0] != 'i' or int(width[2]) not in [1, 2, 4, 8]:
                raise ValueError("Width may only be immediate 1,2,4 or 8.")
            if index[0] != 'r':
                raise ValueError("Index must be a register.")

        if offset and offset[0] != 'i':
            raise ValueError("Offset must be an immediate.")
        if base and base[0] != 'r':
            raise ValueError("Base must be a register.")

        if not index and not width and not offset and not base:
            raise ValueError("Must provide at least an offset or base.")

        if destination == 'base' and not base:
            raise ValueError("Destination may only be set to 'base' if base is set.")
        elif destination == 'index' and not index:
            raise ValueError("Destination may only be set to 'index' if index is set.")
        elif destination not in ['base', 'index']:
            raise ValueError("Destination must be set to 'base' or 'index'.")

        if not base and not index:
            raise ValueError("Either base or index must be set for latency test to work.")

        if serial != 1 and not (base or index):
            raise ValueError("Serial > 1 only works with index and/or base in use.")

        self._loop_body = ''

        # Assemble the operand string of the lea instruction. With a base register, $0 is
        # the base and $1 the index; without one the index is the only register ($0).
        ops = ''
        if offset:
            ops += offset[2]
        if base:
            ops += '($0'
            if width and index:
                ops += ',$1,{}'.format(width[2])
            ops += ')'

            if destination == 'base':
                ops += ', $0'
            else:  # destination == 'index'
                ops += ', $1'
        else:
            if width and index:
                ops += '(,$0,{}), $0'.format(width[2])
        ops += ' '

        if destination == 'base':
            destination_reg = base
        else:  # destination == 'index'
            destination_reg = index

        # Part 1: PHI function for destination
        for p in range(parallel):
            self._loop_body += (
                '%"{name}_{p}.0" = '
                'phi {type} [{initial}, %"entry"], [%"{name}_{p}.{s}", %"loop"]\n').format(
                    name=destination, type=destination_reg[1], initial=destination_reg[2], p=p,
                    s=self.serial)

        # Part 2: Chain of lea instructions
        # One output register and one input, plus a second input if base and index are used
        constraints = '=r,r'
        if base and index:
            constraints += ',r'

        for p in range(parallel):
            for s in range(self.serial):
                if base and index:
                    # The register that is not the destination keeps its initial value
                    if destination == 'base':
                        args = ('{base_type} %"{base_name}_{p}.{s_in}", '
                                '{index_type} {index_value}').format(
                                    base_type=base[1], base_name=destination,
                                    index_type=index[1], index_value=index[2], p=p, s_in=s)
                    else:  # destination == 'index':
                        args = ('{base_type} {base_value}, '
                                '{index_type} %"{index_name}_{p}.{s_in}"').format(
                                    base_type=base[1], base_value=base[2],
                                    index_type=index[1], index_name=destination, p=p, s_in=s)
                else:
                    args = '{type} %"{name}_{p}.{s_in}"'.format(
                        type=destination_reg[1], name=destination, p=p, s_in=s)

                self._loop_body += (
                    '%"{name}_{p}.{s_out}" = call {type} asm sideeffect'
                    ' "lea {ops}", "{constraints}" ({args})\n').format(
                        name=destination,
                        type=destination_reg[1],
                        ops=ops,
                        constraints=constraints,
                        args=args,
                        p=p,
                        s_out=s + 1)
|
||||
|
||||
|
||||
class LoadBenchmark(Benchmark):
    def __init__(self, chain_length=2048, structure='linear', parallel=6, serial=4, **kwargs):
        """
        Benchmark for L1 load using pointer chasing.

        *chain_length* is the number of pointers to place in memory.
        *structure* may be 'linear' (1-offsets) or 'random'.
        *parallel* is the number of chains followed at once, chain p starts at element p.
        *serial* is the number of dependent loads per chain and inner loop iteration.

        Note that ``self.serial`` is fixed to one, the serial factor is kept in
        ``self._serial``. :meth:`get_iterations` already counts every load of a chain.

        :raises ValueError: if the arguments do not describe a valid pointer ring
        """
        if serial < 1:
            raise ValueError("serial ({}) needs to be at least 1.".format(serial))
        if chain_length % serial != 0:
            raise ValueError(
                "chain_length ({}) needs to be divisible by serial factor ({}).".format(
                    chain_length, serial))
        # Chain p is entered at element p of the pointer field, so every chain needs an
        # element of its own.
        if not 1 <= parallel <= chain_length:
            raise ValueError(
                "parallel ({}) needs to be between 1 and chain_length ({}).".format(
                    parallel, chain_length))
        if structure not in ('linear', 'random'):
            raise ValueError("Given structure is not supported. Supported are: "
                             "linear and random.")

        Benchmark.__init__(self, parallel=parallel, serial=1, **kwargs)
        self._serial = serial
        self._loop_body = ''
        element_type = ctypes.POINTER(ctypes.c_int)
        self._function_ctype = ctypes.CFUNCTYPE(
            ctypes.c_int, ctypes.POINTER(element_type), ctypes.c_int)
        self.chain_length = chain_length
        self.structure = structure
        self._pointer_field = (element_type * chain_length)()

        # Initialize pointer field
        # Field must represent a ring of pointers
        if structure == 'linear':
            order = range(chain_length)
        else:  # structure == 'random'
            order = list(range(chain_length))
            random.shuffle(order)
        # Each element points to its successor in *order*, the last one back to the first
        for i in range(chain_length):
            self._pointer_field[order[i]] = ctypes.cast(
                ctypes.pointer(self._pointer_field[order[(i + 1) % chain_length]]),
                element_type)

    def prepare_arguments(self, previous_args=None, time_factor=1.0):
        """
        Build argument tuple, to be passed to low level function.

        The tuple holds the pointer field and the number of passes over the ring. The latter
        is scaled by *time_factor*, but never drops below one.
        """
        if previous_args is None:
            return self._pointer_field, 100
        else:
            return previous_args[0], max(1, int(previous_args[1] * time_factor))

    def get_iterations(self, args):
        """Return number of iterations performed, based on lower level function arguments."""
        return self.chain_length * args[1]

    def get_ir(self):
        """
        Return LLVM IR equivalent of (in case of parallel == 1 and serial == 1):

        int test(int** ptrf, int repeat) {
            int** p0 = (int**)ptrf[0];
            int i = 0;
            while(i < repeat) {
                int** p = (int**)*p0;
                while(p != p0) {
                    p = (int**)*p;
                }
                i++;
            }
            return i;
        }
        """
        ret = textwrap.dedent('''
            define i32 @test(i32** %"ptrf_0", i32 %"repeats") {
            entry:
            ''')
        # Load pointer to ptrf[p] and p0
        for p in range(self.parallel):
            if p > 0:
                ret += ' %"ptrf_{p}" = getelementptr i32*, i32** %"ptrf_0", i64 {p}\n'.format(
                    p=p)
            ret += (
                ' %"pp0_{p}" = bitcast i32** %"ptrf_{p}" to i32***\n'
                ' %"p0_{p}" = load i32**, i32*** %"pp0_{p}", align 8\n').format(p=p)

        ret += textwrap.dedent('''
              %"cmp.entry" = icmp sgt i32 %"repeats", 0
              br i1 %"cmp.entry", label %"loop0", label %"end"

            loop0:
              br label %"loop1"

            loop1:
              %"i" = phi i32 [ %"i.1", %"loop3" ], [ 0, %"loop0" ]
              br label %"loop2"

            loop2:\n''')

        for p in range(self.parallel):
            ret += (' %"p_{p}.0" = phi i32** '
                    '[ %"p0_{p}", %"loop1" ], [ %"p_{p}.{s_max}", %"loop2" ]\n').format(
                        p=p, s_max=self._serial)

        # load p, compare to p0 and or-combine results
        for p in range(self.parallel):
            for s in range(self._serial):
                ret += (' %"pp_{p}.{s}" = bitcast i32** %"p_{p}.{s_prev}" to i32***\n'
                        ' %"p_{p}.{s}" = load i32**, i32*** %"pp_{p}.{s}", align 8\n').format(
                            p=p, s=s + 1, s_prev=s)

            # Compare is needed for all registers, for llvm not to remove unused
            # instructions:
            ret += ' %"cmp_{p}.loop2" = icmp eq i32** %"p_{p}.{s_max}", %"p0_{p}"\n'.format(
                p=p, s_max=self._serial)

        # TODO tree reduce cmp to make use of all cmp_* values

        # It is sufficient to use only one compare, all others will be eliminated
        ret += ' br i1 %"cmp_0.loop2", label %"loop3", label %"loop2"\n'

        ret += textwrap.dedent('''
            loop3:
              %"i.1" = add i32 %"i", 1
              %"cmp.loop3" = icmp eq i32 %"i.1", %"repeats"
              br i1 %"cmp.loop3", label %"end", label %"loop1"

            end:
              %"ret" = phi i32 [ 0, %"entry" ], [ %"repeats", %"loop3" ]
              ret i32 %"ret"
            }''')
        return ret
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command line driver: registers a set of example benchmarks, runs the selected ones
    # and prints cycles per instruction for each.
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument('-f', '--frequency', type=float, required=psutil.cpu_freq() is None,
                        help='Provided (in GHz), if psutil.cpu_freq() does not report anything.')
    args = parser.parse_args()
    if args.frequency:
        # benchmarks expect Hz
        args.frequency *= 1e9

    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()
    llvm.initialize_native_asmparser()

    modules = collections.OrderedDict()

    # Scalar integer add: (name, instruction, type of source operand)
    four_adds = 'addq $1, $0\naddq $1, $0\naddq $1, $0\naddq $1, $0'
    add_cases = [
        ('add i64 r64', 'addq $1, $0', 'i'),    # immediate source
        ('add r64 r64', 'addq $1, $0', 'r'),    # register source
        ('4xadd i64 r64', four_adds, 'i'),      # multiple instructions
    ]
    for name, instruction, src_type in add_cases:
        modules[name + ' LAT'] = InstructionBenchmark(
            instruction=instruction,
            dst_operands=(),
            dstsrc_operands=(('r', 'i64', '0'),),
            src_operands=((src_type, 'i64', '1'),),
            parallel=1,
            serial=5,
            frequency=args.frequency)
    for name, instruction, src_type in add_cases:
        modules[name + ' TP'] = InstructionBenchmark(
            instruction=instruction,
            dst_operands=(),
            dstsrc_operands=(('r', 'i64', '0'),),
            src_operands=((src_type, 'i64', '1'),),
            parallel=10,
            # the 4xadd variant already contains its own chain of instructions
            serial=1 if instruction is four_adds else 5,
            frequency=args.frequency)

    # Address generation: (name, offset, base, index, width, destination)
    lea_cases = [
        ('lea base',
         None, ('r', 'i64', '666'), None, None, 'base'),
        ('lea base+offset',
         ('i', None, '23'), ('r', 'i64', '666'), None, None, 'base'),
        ('lea index*width',
         None, None, ('r', 'i64', '1'), ('i', None, '4'), 'index'),
        ('lea offset+index*width',
         ('i', 'i64', '-0x8'), None, ('r', 'i64', '51'), ('i', None, '4'), 'index'),
        ('lea base+index*width',
         None, ('r', 'i64', '23'), ('r', 'i64', '12'), ('i', None, '4'), 'base'),
        ('lea base+offset+index*width',
         ('i', None, '42'), ('r', 'i64', '23'), ('r', 'i64', '12'), ('i', None, '4'), 'base'),
    ]
    for suffix, parallel, serial in [(' LAT', 1, 5), (' TP', 10, 1)]:
        for name, offset, base, index, width, destination in lea_cases:
            modules[name + suffix] = AddressGenerationBenchmark(
                offset=offset,
                base=base,
                index=index,
                width=width,
                destination=destination,
                parallel=parallel,
                serial=serial,
                frequency=args.frequency)

    # Loads via pointer chasing
    for suffix, parallel, serial in [(' LAT', 1, 8), (' TP', 16, 1)]:
        for structure in ['linear', 'random']:
            modules['LD ' + structure + suffix] = LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=structure,
                parallel=parallel,
                serial=serial,
                frequency=args.frequency)

    # Packed double precision arithmetic
    vec_dstsrc = ('x', '<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10'] * 4)))
    vec_src = ('x', '<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10'] * 4)))

    modules['vaddpd x<4 x double> x<4 x double> x<4 x double> LAT'] = InstructionBenchmark(
        instruction='vaddpd $1, $0, $0',
        dst_operands=(),
        dstsrc_operands=(vec_dstsrc,),
        src_operands=(vec_src,),
        parallel=1,
        serial=5,
        frequency=args.frequency)

    modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) LAT'] = \
        InstructionBenchmark(
            instruction='vmulpd $1, $0, $0',
            dst_operands=(),
            dstsrc_operands=(vec_dstsrc,),
            src_operands=(vec_src,),
            parallel=1,
            serial=5,
            frequency=args.frequency)

    # This is actually a TP benchmark with parallel=1, because there are no inter-loop
    # dependencies:
    modules['vmulpd x<4 x double> x<4 x double> x<4 x double> (dstsrc) TP'] = \
        InstructionBenchmark(
            instruction='vmulpd $1, $2, $0',
            dst_operands=(),
            dstsrc_operands=(vec_dstsrc,),
            src_operands=(vec_src,),
            parallel=10,
            serial=1,
            frequency=args.frequency)

    # Only the load benchmarks are run, all others are filtered out here
    modules = collections.OrderedDict([(k, v) for k, v in modules.items()
                                       if k.startswith('LD ')])

    for key, module in modules.items():
        if args.verbose > 0:
            print("=== Benchmark")
            print(repr(module))
            print("=== LLVM")
            print(module.get_ir())
            print("=== Assembly")
            print(module.get_assembly())
        r = module.build_and_execute(repeat=3)
        if args.verbose > 0:
            print("=== Result")
            pprint.pprint(r)

        # fastest run, converted from seconds to cycles per executed instruction
        cy_per_it = min(r['runtimes']) * r['frequency'] / (
            r['iterations'] * module.parallel * module.serial)
        print('{key:<32} {cy_per_it:.3f} cy/It with {runtime_sum:.4f}s'.format(
            key=key,
            cy_per_it=cy_per_it,
            runtime_sum=sum(r['runtimes'])))

    # InstructionBenchmark.get_latency(
    #     instruction='vmulpd $1, $0, $0',
    #     dst_operands=(),
    #     dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
    #     src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
    #                   ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
    #     print_table=True)
    # InstructionBenchmark.get_throughput(
    #     instruction='vmulpd $1, $0, $0',
    #     dst_operands=(),
    #     dstsrc_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 1.23e-10']*4))),),
    #     src_operands=(('x','<4 x double>', '<{}>'.format(', '.join(['double 3.21e-10']*4))),
    #                   ('x','<4 x double>', '<{}>'.format(', '.join(['double 2.13e-10']*4))),),
    #     print_table=True)
    #
    # InstructionBenchmark.get_latency(
    #     instruction='nop',
    #     dst_operands=(),
    #     dstsrc_operands=(('r','i8', '0'),),
    #     src_operands=(),
    #     print_table=True)
    # InstructionBenchmark.get_throughput(
    #     instruction='nop',
    #     dst_operands=(),
    #     dstsrc_operands=(('r','i8', '0'),),
    #     src_operands=(),
    #     print_table=True)
|
@@ -1,514 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import re
|
||||
from itertools import zip_longest
|
||||
|
||||
# TODO use abc to force implementation of interface requirements
|
||||
|
||||
# Initial value (as LLVM IR constant string) for every supported LLVM type.
# All integer types start out with the same value
init_value_by_llvm_type = dict.fromkeys(['i1', 'i8', 'i16', 'i32', 'i64'], '3')
# LLVM requires floating point constants to have a non-repeating binary representation
# See http://llvm.org/docs/LangRef.html#simple-constants for details
init_value_by_llvm_type.update(
    dict.fromkeys(['float', 'double', 'fp128'], str(1 + 1 / 2 ** 10)))
# For vector-types we reuse the scalar values, repeated once per vector lane
init_value_by_llvm_type.update(dict(
    ('<{} x {}>'.format(lanes, scalar_type),
     '<' + ', '.join([scalar_type + ' ' + scalar_value] * lanes) + '>')
    for scalar_type, scalar_value in init_value_by_llvm_type.items()
    for lanes in [2, 4, 8, 16, 32, 64]))
|
||||
|
||||
|
||||
class NotSerializableError(Exception):
    """
    Raised when synths cannot be chained serially.

    Serialized.generate_register_naming raises it when none of a synth's source
    registers matches a destination register of the preceding synth.
    """
    pass
|
||||
|
||||
class Operand:
    """Common base of all instruction operands; carries the operand's LLVM type."""

    def __init__(self, llvm_type):
        self.llvm_type = llvm_type

    def get_constraint_char(self):
        """Return the inline-asm constraint character; subclasses must implement this."""
        raise NotImplementedError()

    def __repr__(self):
        # Only public attributes (no leading underscore) are shown.
        shown = ['{}={!r}'.format(name, value)
                 for name, value in vars(self).items()
                 if not name.startswith('_')]
        return '{}({})'.format(type(self).__name__, ', '.join(shown))

    @staticmethod
    def from_string(s):
        """
        Parse *s* with the first operand parser that accepts it.

        Parsers are tried in the order Register, Immediate, MemoryReference; a
        ValueError from one parser moves on to the next.

        :raises ValueError: if no parser accepts *s*
        """
        parsers = (Register.from_string, Immediate.from_string, MemoryReference.from_string)
        for parse in parsers:
            try:
                return parse(s)
            except ValueError:
                pass
        raise ValueError("No matching operand type found for '{}'.".format(s))
|
||||
|
||||
|
||||
class Immediate(Operand):
    """Immediate (constant) operand with an LLVM type and a literal value."""

    def __init__(self, llvm_type, value):
        Operand.__init__(self, llvm_type)
        self.value = value

    def get_constraint_char(self):
        """Immediates always use the 'i' inline-asm constraint."""
        return 'i'

    @classmethod
    def from_string(cls, s):
        """
        Create Immediate object from string.

        :param s: must have the form: "llvm_type:value", where value is a hexadecimal
                  (0x...) or decimal (optionally fractional) literal
        :raises ValueError: if *s* contains no ':' or the value is not such a literal
        """
        llvm_type, value = s.split(':', 1)
        value_regex = r'(0x[0-9a-fA-F]+|[0-9]+(\.[0-9]+)?)'
        # The whole value must be a literal; re.match would accept any string that
        # merely starts with digits (e.g. "12abc").
        if not re.fullmatch(value_regex, value):
            raise ValueError("Invalid immediate value, must match {!r}".format(value_regex))
        return cls(llvm_type, value)
|
||||
|
||||
|
||||
class MemoryReference(Operand):
    """
    offset + base + index*width

    OFFSET(BASE, INDEX, WIDTH) in AT&T assembly

    Possible operand values:
        offset: immediate integer (+/-)
        base: register
        index: register
        width: immediate 1,2,4 or 8
    """

    def __init__(self, llvm_type, offset=None, base=None, index=None, width=None):
        """
        :param llvm_type: LLVM type of the referenced memory operand
        :param offset: Immediate or None
        :param base: Register or None
        :param index: Register or None (requires *width*)
        :param width: Immediate with value 1, 2, 4 or 8, or None (requires *index*)
        :raises ValueError: if the combination of components is invalid
        """
        super().__init__(llvm_type)
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width

        # Sanity checks:
        if bool(index) ^ bool(width):
            raise ValueError("Index and width both need to be set, or None.")
        elif index and width:
            if not (isinstance(width, Immediate) and int(width.value) in [1, 2, 4, 8]):
                raise ValueError("Width may only be immediate 1,2,4 or 8.")
            if not isinstance(index, Register):
                raise ValueError("Index must be a register.")

        if offset and not isinstance(offset, Immediate):
            raise ValueError("Offset must be an immediate.")
        if base and not isinstance(base, Register):
            # This check is about the base, so the message names the base.
            raise ValueError("Base must be a register.")

        if not index and not width and not offset and not base:
            raise ValueError("Must provide at least an offset or base.")

    def get_constraint_char(self):
        """Memory operands always use the 'm' inline-asm constraint."""
        return 'm'

    def get_registers(self):
        """Yield the registers used for addressing: base first, then index, if set."""
        if self.base:
            yield self.base
        if self.index:
            yield self.index

    @classmethod
    def from_string(cls, s):
        """
        Create MemoryReference from string.

        :param s: must fulfill the regex: "\\*llvm_type:[obiw]+", where the feature
                  letters select offset (o), base (b), index (i) and width (w)
        :raises ValueError: if *s* does not have that form, or the selected features
                            are not a valid combination (e.g. index without width)
        """
        # The whole string has to match; a prefix match would silently ignore
        # unknown feature letters.
        m = re.fullmatch(r"\*([^:]+):([obiw]+)", s)
        if not m:
            raise ValueError("Invalid format, must match '*llvm_type:[obiw]+'.")
        llvm_type, features = m.groups()

        # Each requested feature gets a fixed placeholder component.
        offset = Immediate('i32', 8) if 'o' in features else None
        base = Register('i64', 'r') if 'b' in features else None
        index = Register('i64', 'r') if 'i' in features else None
        width = Immediate('i32', 8) if 'w' in features else None
        return cls(llvm_type, offset=offset, base=base, index=index, width=width)
|
||||
|
||||
|
||||
class Register(Operand):
    """Register operand: an LLVM type plus the inline-asm constraint character."""

    def __init__(self, llvm_type, constraint_char='r'):
        super().__init__(llvm_type)
        self.constraint_char = constraint_char

    def get_constraint_char(self):
        """Return the constraint character given at construction."""
        return self.constraint_char

    @classmethod
    def from_string(cls, s):
        """
        Create Register object from string.

        :param s: must have the form: "llvm_type:constraint_char"
        :raises ValueError: if *s* contains no ':' or the constraint character is not
                            exactly one of 'r' or 'x'
        """
        llvm_type, constraint_char = s.split(':', 1)
        valid_cc = 'rx'
        # A plain substring test would also accept '' and 'rx'; require exactly one
        # character out of valid_cc.
        if len(constraint_char) != 1 or constraint_char not in valid_cc:
            raise ValueError("Invalid constraint character, must be one of {!r}".format(valid_cc))
        return cls(llvm_type, constraint_char)
|
||||
|
||||
|
||||
class Synthable:
    """
    Interface of everything that can be turned into LLVM IR.

    Subclasses implement build_ir, get_source_registers and
    get_destination_registers.
    """

    def __init__(self):
        pass

    def build_ir(self, dst_reg_names, src_reg_names, used_registers):
        raise NotImplementedError()

    def get_source_registers(self):
        raise NotImplementedError()

    def get_destination_registers(self):
        raise NotImplementedError()

    @staticmethod
    def _get_unused_reg_name(used_registers):
        """
        Return the first name of the form %"reg.N" (N = 0, 1, ...) that is not in
        *used_registers*, and add it to that set.
        """
        number = 0
        candidate = '%"reg.{}"'.format(number)
        while candidate in used_registers:
            number += 1
            candidate = '%"reg.{}"'.format(number)
        used_registers.add(candidate)
        return candidate

    def get_default_init_values(self):
        """
        Return one default init value per source register, looked up by LLVM type.

        :raises ValueError: if a source register has a type without a default value
        """
        values = []
        for register in self.get_source_registers():
            try:
                default = init_value_by_llvm_type[register.llvm_type]
            except KeyError:
                raise ValueError(
                    "Invalid or unsupported LLVM type {!r}.".format(register.llvm_type))
            values.append(default)
        return values

    def __repr__(self):
        # Only public attributes (no leading underscore) are shown.
        shown = ['{}={!r}'.format(name, value)
                 for name, value in vars(self).items()
                 if not name.startswith('_')]
        return '{}({})'.format(type(self).__name__, ', '.join(shown))
|
||||
|
||||
|
||||
class Operation(Synthable):
    """
    Base class for operations.

    Adds no behaviour of its own; Instruction, Load and AddressGeneration derive
    from it.
    """
|
||||
|
||||
|
||||
class Instruction(Operation):
    """
    A single inline-assembly instruction with one destination register and a list
    of source operands (registers, immediates or memory references).
    """

    def __init__(self, instruction, destination_operand, source_operands):
        super().__init__()
        self.instruction = instruction
        self.destination_operand = destination_operand
        assert isinstance(destination_operand, Register), "Destination needs to be a register."
        self.source_operands = source_operands

    def get_source_registers(self):
        """
        Return the registers this instruction reads.

        Register operands are reported once per LLVM type (first occurrence wins);
        registers of memory references are always appended.
        """
        seen_types = set()
        registers = []
        for operand in self.source_operands:
            if isinstance(operand, Register):
                if operand.llvm_type in seen_types:
                    continue
                seen_types.add(operand.llvm_type)
                registers.append(operand)
            elif isinstance(operand, MemoryReference):
                registers.extend(operand.get_registers())
        return registers

    def get_destination_registers(self):
        """Return the destination register as a one-element list (empty if not a Register)."""
        return [self.destination_operand] if isinstance(self.destination_operand, Register) else []

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """
        Build IR string based on in and out operand names and types.
        """
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)

        # Constraint string: destination first (prefixed with '='), then all sources.
        constraint_parts = ['=' + self.destination_operand.get_constraint_char()]
        constraint_parts.extend(sop.get_constraint_char() for sop in self.source_operands)
        constraints = ','.join(constraint_parts)

        # Argument string: every register type is bound to one source name, which is
        # reused for later registers of the same type.
        typed_args = []
        name_index_by_type = {}
        next_name = 0
        for sop in self.source_operands:
            if isinstance(sop, Immediate):
                value = sop.value
            elif isinstance(sop, Register):
                if sop.llvm_type not in name_index_by_type:
                    name_index_by_type[sop.llvm_type] = next_name
                    next_name += 1
                value = src_reg_names[name_index_by_type[sop.llvm_type]]
            elif isinstance(sop, MemoryReference):
                value = src_reg_names[next_name]
                next_name += 1
            else:
                raise NotImplementedError("Only register and immediate operands are supported.")
            typed_args.append('{type} {repr}'.format(type=sop.llvm_type, repr=value))
        args = ', '.join(typed_args)

        # Assemble the inline-asm call.
        template = ('{dst_reg} = call {dst_type} asm '
                    ' "{instruction}", "{constraints}" ({args})')
        return template.format(
            dst_reg=dst_reg_names[0],
            dst_type=self.destination_operand.llvm_type,
            instruction=self.instruction,
            constraints=constraints,
            args=args)

    @classmethod
    def from_string(cls, s):
        """
        Create Instruction object from string.

        :param s: must have the form:
                  "asm_instruction_name ({(src|dst|srcdst):llvm_type:constraint_char})+"
        :raises ValueError: if the number of destination operands is not exactly one
        """
        instruction = s
        # Placeholders are processed back to front, so that replacing one does not
        # shift the match positions of those still to be replaced.
        placeholders = list(re.finditer(r"\{((?:src|dst)+):([^\}]+)\}", s))
        placeholders.reverse()
        # Destination indices start at 0, source indices right after the destinations.
        dst_index = 0
        src_index = sum(1 for p in placeholders if 'dst' in p.group(1))

        dst_ops = []
        src_ops = []
        for m in placeholders:
            direction, operand_string = m.group(1, 2)
            operand = Operand.from_string(operand_string)
            reads = 'src' in direction
            writes = 'dst' in direction
            if reads and not writes:
                src_ops.append(operand)
                instruction = '{}${}{}'.format(
                    instruction[:m.start()], src_index, instruction[m.end():])
                src_index += 1
            elif writes:
                dst_ops.append(operand)
                instruction = '{}${}{}'.format(
                    instruction[:m.start()], dst_index, instruction[m.end():])
                if reads:
                    # Read-write operand: add a source register whose constraint is
                    # the destination's index.
                    src_ops.append(Register(operand_string.split(':', 1)[0], str(dst_index)))
                    src_index += 1
                dst_index += 1

        if len(dst_ops) != 1:
            raise ValueError("Instruction supports only single destinations.")
        return cls(instruction, dst_ops[0], src_ops)
|
||||
|
||||
|
||||
class Load(Operation):
    # Unfinished: only the parameters are stored, none of the Synthable
    # methods (build_ir, get_source_registers, ...) is overridden here.
    def __init__(self, chain_length, structure='linear'):
        """
        *chain_length* is the number of pointers to place in memory.
        *structure* may be 'linear' (1-offsets) or 'random'.
        """
        super().__init__()
        self.chain_length = chain_length
        self.structure = structure
        # TODO
|
||||
|
||||
|
||||
class AddressGeneration(Operation):
    # Placeholder: the constructor stores its arguments and then always raises
    # NotImplementedError, so no instance can currently be created.
    def __init__(self, offset, base, index, width, destination='base'):
        super().__init__()
        self.offset = offset
        self.base = base
        self.index = index
        self.width = width
        self.destination = destination
        raise NotImplementedError()
|
||||
|
||||
|
||||
class Serialized(Synthable):
    """
    Chain of synths: each synth's sources are fed from the destinations of the
    synth before it.
    """

    def __init__(self, synths):
        super().__init__()
        self.synths = synths
        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"

    def get_source_registers(self):
        # The chain consumes what its first element consumes.
        if self.synths:
            return self.synths[0].get_source_registers()
        else:
            return []

    def get_destination_registers(self):
        # The chain produces what its last element produces.
        if self.synths:
            return self.synths[-1].get_destination_registers()
        else:
            return []

    @staticmethod
    def match(source_registers, destination_registers):
        """
        Find maximum number of matches from source (previous destinations) to
        destination (current source) registers.

        Two registers match when their llvm_type is equal.

        Return a tuple (matched_pairs, unmatched_dests): a list of two-tuples
        (src_idx, dst_idx) and the set of destination registers without any match.
        """
        matched_pairs = []
        unmatched_dests = set(destination_registers)
        for dst_idx, dst in enumerate(destination_registers):
            for src_idx, src in enumerate(source_registers):
                if src.llvm_type == dst.llvm_type:
                    matched_pairs.append((src_idx, dst_idx))
                    unmatched_dests.discard(dst)

        return matched_pairs, unmatched_dests

    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
        """
        Compute the (destination names, source names) pair for every synth.

        Returns (list of pairs, used_registers); newly created intermediate names
        are added to *used_registers*.

        :raises NotSerializableError: if a synth after the first has no source that
                                      matches a destination of its predecessor
        """
        reg_naming_out = []
        dst_naming = []
        last_s = None
        for i, s in enumerate(self.synths):
            if i == 0:
                # first source is passed in from outside
                src_naming = src_reg_names
            else:
                # match with previous destinations
                src_naming = []
                match = False
                for src in s.get_source_registers():
                    # Find matching destination from previous synths
                    # NOTE(review): there is no break, so every type-matching
                    # destination appends a name for this one source - confirm intended.
                    src_match = False
                    for dst_idx, dst in enumerate(last_s.get_destination_registers()):
                        if dst.llvm_type == src.llvm_type:
                            match = src_match = True
                            src_naming.append(dst_naming[dst_idx])
                    # If source could not be matched, use constant value instead
                    if not src_match:
                        src_naming.append(init_value_by_llvm_type[src.llvm_type])
                if not match:
                    raise NotSerializableError("Unable to find match.")

            if i == len(self.synths) - 1:
                # last destination is passed in from outside
                dst_naming = dst_reg_names
            else:
                # noinspection PyUnusedLocal
                dst_naming = [self._get_unused_reg_name(used_registers)
                              for j in s.get_destination_registers()]

            reg_naming_out.append((dst_naming, src_naming))
            last_s = s
        return reg_naming_out, used_registers

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """Return the IR of all synths, one after another, joined by newlines."""
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)
        reg_names, used_registers = self.generate_register_naming(
            dst_reg_names, src_reg_names, used_registers)
        code = []
        for s, r in zip(self.synths, reg_names):
            code.append(s.build_ir(*r, used_registers))
        return '\n'.join(code)
|
||||
|
||||
|
||||
class Parallelized(Synthable):
    """
    Group of independent synths; sources and destinations of the group are the
    concatenation of those of its members.
    """

    def __init__(self, synths, interleave=False):
        """
        :param synths: list of Synthable objects
        :param interleave: if true, build_ir interleaves the IR lines of the synths
                           instead of emitting one synth after the other
        """
        super().__init__()
        self.synths = synths
        self.interleave = interleave
        assert all([isinstance(s, Synthable) for s in synths]), "All elements need to be Sythable"

    def get_source_registers(self):
        """Return the source registers of all synths, in synth order."""
        sources = []
        for s in self.synths:
            sources += s.get_source_registers()
        return sources

    def get_destination_registers(self):
        """Return the destination registers of all synths, in synth order."""
        destinations = []
        for s in self.synths:
            destinations += s.get_destination_registers()
        return destinations

    def generate_register_naming(self, dst_reg_names, src_reg_names, used_registers):
        """
        Split the given names among the synths: each synth takes as many destination
        and source names from the front as it has destination and source registers.

        Returns (list of (dst_names, src_names) pairs, used_registers).
        """
        # Split reg_naming among all synths
        reg_naming_out = []
        for s in self.synths:
            n_dsts = len(s.get_destination_registers())
            n_srcs = len(s.get_source_registers())
            reg_naming_out.append((dst_reg_names[:n_dsts], src_reg_names[:n_srcs]))
            dst_reg_names, src_reg_names = (dst_reg_names[n_dsts:], src_reg_names[n_srcs:])
        return reg_naming_out, used_registers

    def build_ir(self, dst_reg_names, src_reg_names, used_registers=None):
        """Return the IR of all synths joined by newlines, optionally interleaved."""
        if used_registers is None:
            used_registers = set(dst_reg_names + src_reg_names)
        reg_names, used_registers = self.generate_register_naming(
            dst_reg_names, src_reg_names, used_registers)
        code = []
        for s, r in zip(self.synths, reg_names):
            code.append(s.build_ir(*r, used_registers))

        # Interleave parallelized sequences
        if self.interleave:
            # zip_longest pads shorter sequences with None; drop that padding with
            # an explicit identity test. (filter(None.__ne__, ...) only worked
            # through the truthiness of NotImplemented, which newer Python versions
            # deprecate and then reject.)
            code = ['\n'.join(line for line in lines if line is not None)
                    for lines in zip_longest(*[c.split('\n') for c in code])]
        return '\n'.join(code)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Demo: build a few i64 instructions, combine them serially and in parallel,
    # and print the generated IR.
    def _i64_reg():
        return Register('i64', 'r')

    def _reg_imm(asm, imm):
        # Instruction with one i64 register source and one i64 immediate source.
        return Instruction(asm, _i64_reg(), [_i64_reg(), Immediate('i64', imm)])

    i1 = _reg_imm('add $2, $0', '1')
    i2 = _reg_imm('sub $2, $0', '1')
    i3 = Instruction('mul $1, $0', _i64_reg(), [_i64_reg(), _i64_reg()])
    i4 = _reg_imm('div $2, $0', '23')
    i5 = _reg_imm('mul $2, $0', '23')
    i6 = Instruction('inc $0', _i64_reg(), [_i64_reg()])

    s1 = Serialized([i1, i2])
    s2 = Serialized([s1, i3])
    for chain in (s1, s2):
        print(chain.build_ir(['%out'], ['%in']), '\n')

    s3 = Serialized([i4, i5])
    p1 = Parallelized([i6, s2, s3])
    print(p1.build_ir(['%out.{}'.format(k) for k in range(3)],
                      ['%in.{}'.format(k) for k in range(3)]), '\n')

    s4 = Serialized([i1, i2, i3, i4, i5, i6])
    print(s4.build_ir(['%out'], ['%in']), '\n')

    print(Instruction.from_string("add {src:i64:r} {srcdst:i64:r}"))
|
@@ -1,243 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import collections
|
||||
import itertools
|
||||
import socket
|
||||
|
||||
import numpy
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib as mpl
|
||||
|
||||
from asmbench import op, bench
|
||||
from asmbench import oldjit
|
||||
|
||||
|
||||
def jit_based_benchs():
    """
    Run the oldjit address-generation and load benchmarks and print, per benchmark,
    the cycles per operation of the latency and of the throughput variant.
    """
    # Addressing-mode variants; each one is built twice below (latency/throughput).
    lea_variants = [
        ('lea_b', dict(
            offset=None, base=('r', 'i64', '666'),
            index=None, width=None, destination='base')),
        ('lea_b+off', dict(
            offset=('i', None, '23'), base=('r', 'i64', '666'),
            index=None, width=None, destination='base')),
        ('lea_idx*w', dict(
            offset=None, base=None,
            index=('r', 'i64', '1'), width=('i', None, '4'), destination='index')),
        ('lea_off+idx*w', dict(
            offset=('i', 'i64', '-0x8'), base=None,
            index=('r', 'i64', '51'), width=('i', None, '4'), destination='index')),
        ('lea_b+idx*w', dict(
            offset=None, base=('r', 'i64', '23'),
            index=('r', 'i64', '12'), width=('i', None, '4'), destination='base')),
        ('lea_b+off+idx*w', dict(
            offset=('i', None, '42'), base=('r', 'i64', '23'),
            index=('r', 'i64', '12'), width=('i', None, '4'), destination='base')),
    ]

    modules = collections.OrderedDict()
    for variant_name, operands in lea_variants:
        modules[variant_name] = (
            oldjit.AddressGenerationBenchmark(parallel=1, serial=5, **operands),
            oldjit.AddressGenerationBenchmark(parallel=10, serial=1, **operands))

    for structure in ('linear', 'random'):
        modules['LD_' + structure] = (
            oldjit.LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=structure,
                parallel=1,
                serial=2),
            oldjit.LoadBenchmark(
                chain_length=2048,  # 2048 * 8B = 16kB
                structure=structure,
                parallel=4,
                serial=2))

    for name, module_pair in modules.items():
        # module_pair is (latency module, throughput module); measure in that order.
        cycles = []
        for module in module_pair:
            result = module.build_and_execute(repeat=3)
            cycles.append(
                min(result['runtimes']) * result['frequency'] / (
                    result['iterations'] * module.parallel * module.serial))
        cy_per_it_lat, cy_per_it_tp = cycles
        print('{key:<16} LAT {cy_per_it_lat:.3f} cy TP {cy_per_it_tp:.3f} cy'.format(
            key=name,
            cy_per_it_lat=cy_per_it_lat,
            cy_per_it_tp=cy_per_it_tp))
|
||||
|
||||
def plot_combined(single_measured, combined_measured):
    """
    Show a heat map of the third value (index 2) of every combined measurement.

    :param single_measured: mapping whose keys are the instruction names; their
                            order defines the axes of the plot
    :param combined_measured: mapping from a pair of (name, ...) tuples to a
                              sequence whose element 2 is plotted
    """
    instructions = list(single_measured.keys())
    # NaN marks pairs without a measurement; they are drawn in the "bad" colour.
    d = numpy.full((len(single_measured), len(single_measured)), float('nan'))
    for k, v in combined_measured.items():
        i1, i2 = [instructions.index(i) for i in [c[0] for c in k]]
        d[i1, i2] = v[2]
    # matplotlib.cm.get_cmap has been removed from recent matplotlib releases;
    # pyplot.get_cmap offers the same (name, lut) interface.
    cmap = plt.get_cmap('plasma', 5)
    cmap.set_bad('w')  # default value is 'k'
    fig = plt.figure(figsize=(10, 10))
    ax1 = fig.add_subplot(111)
    cax = ax1.imshow(d, interpolation="nearest", cmap=cmap,
                     norm=mpl.colors.Normalize(vmin=-.5, vmax=1.5))
    ax1.set_xticks(range(len(instructions)))
    ax1.set_xticklabels(instructions, rotation=90)
    ax1.set_yticks(range(len(instructions)))
    ax1.set_yticklabels(instructions)
    ax1.set_title(socket.gethostname())
    ax1.grid()
    cb = fig.colorbar(cax, shrink=0.65)
    cb.set_ticks([-.5, 0, 1, 1.5])
    cb.set_ticklabels(['< -0.5', '0.0 (complete overlap)', '1.0 (no overlap)', '> 1.5'])
    cb.set_label('inverse parallel overlap')
    fig.tight_layout()
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    bench.setup_llvm()

    # Measurement settings shared by the single and the pairwise runs.
    bench_options = dict(
        serial_factor=8, throughput_serial_factor=8, parallel_factor=10,
        verbosity=0, repeat=10, min_elapsed=0.3, max_elapsed=0.5)

    # (LLVM instruction name, instruction description string)
    instruction_strings = [
        ('ADD32ri', 'add {src:i32:1}, {srcdst:i32:r}'),
        ('ADD64ri32', 'add {src:i32:1}, {srcdst:i64:r}'),
        ('INC64r', 'inc {srcdst:i64:r}'),
        ('SUB32ri', 'sub {src:i32:1}, {srcdst:i64:r}'),
        ('MOV64ri32', 'mov {src:i32:1}, {srcdst:i64:r}'),
        ('VINSERTF128rr', 'vinsertf128 {src:i8:0}, {src:<2 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
        ('VCVTSI642SSrr', 'vcvtsi2ss {src:i64:r}, {src:float:x}, {dst:float:x}'),
        ('VADDPDYrr', 'vaddpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
        ('VADDSDrr', 'vaddsd {src:double:x}, {src:double:x}, {dst:double:x}'),
        ('VADDSSrr', 'vaddss {src:float:x}, {src:float:x}, {dst:float:x}'),
        ('VFMADD213PDYr', 'vfmadd213pd {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
        ('VFMADD213PDr', 'vfmadd213pd {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
        ('VFMADD213PSYr', 'vfmadd213ps {src:<4 x double>:x}, {src:<4 x double>:x}, {srcdst:<4 x double>:x}'),
        ('VFMADD213PSr', 'vfmadd213ps {src:<2 x double>:x}, {src:<2 x double>:x}, {srcdst:<2 x double>:x}'),
        ('VFMADD213SDr', 'vfmadd213sd {src:double:x}, {src:double:x}, {srcdst:double:x}'),
        ('VFMADD213SSr', 'vfmadd213ss {src:float:x}, {src:float:x}, {srcdst:float:x}'),
        ('VMULPDYrr', 'vmulpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
        ('VMULSDrr', 'vmulsd {src:double:x}, {src:double:x}, {dst:double:x}'),
        ('VMULSSrr', 'vmulss {src:float:x}, {src:float:x}, {dst:float:x}'),
        ('VSUBSDrr', 'vsubsd {src:double:x}, {src:double:x}, {dst:double:x}'),
        ('VSUBSSrr', 'vsubss {src:float:x}, {src:float:x}, {dst:float:x}'),
        ('VDIVPDYrr', 'vdivpd {src:<4 x double>:x}, {src:<4 x double>:x}, {dst:<4 x double>:x}'),
        ('VDIVSDrr', 'vdivsd {src:double:x}, {src:double:x}, {dst:double:x}'),
        ('VDIVSSrr', 'vdivss {src:float:x}, {src:float:x}, {dst:float:x}'),
    ]
    instructions = [
        (llvm_name, asm, op.Instruction.from_string(asm))
        for llvm_name, asm in instruction_strings]

    # Pass 1: every instruction on its own.
    instructions_measured = collections.OrderedDict()
    for llvm_name, i_str, i in instructions:
        lat, tp = bench.bench_instructions([i], **bench_options)
        print('{:<16} LAT {:.3f} cy TP {:.3f} cy'.format(llvm_name, lat, tp))
        instructions_measured[llvm_name] = (lat, tp)

    jit_based_benchs()

    # Pass 2: every unordered pair (including an instruction paired with itself).
    two_combinations_measured = collections.OrderedDict()
    for a, b in itertools.combinations_with_replacement(instructions, 2):
        lat, tp = bench.bench_instructions([a[2], b[2]], **bench_options)
        tp_a = instructions_measured[a[0]][1]
        tp_b = instructions_measured[b[0]][1]
        # Combined throughput in excess of the slower instruction, relative to the
        # faster one.
        same_port_metric = (tp - max(tp_a, tp_b)) / min(tp_a, tp_b)
        print('{:<16} {:<16} LAT {:.3f} cy TP {:.3f} cy SPM {:>5.2f}'.format(
            a[0], b[0], lat, tp, same_port_metric))
        two_combinations_measured[(a[0], a[1]), (b[0], b[1])] = (lat, tp, same_port_metric)

    plot_combined(instructions_measured, two_combinations_measured)
|
@@ -1,82 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import collections
|
||||
import itertools
|
||||
import socket
|
||||
import textwrap
|
||||
|
||||
import numpy
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib as mpl
|
||||
|
||||
from asmbench import op, bench
|
||||
from asmbench import oldjit
|
||||
|
||||
|
||||
# Size of one element per type name.
# NOTE(review): presumably in bytes (i32 -> 4, i64 -> 8) - confirm; not referenced
# anywhere else in the visible part of this script.
type_size = {
    'i32': 4,
    'i64': 8,
    'f32': 4,
    'float': 4,
    'f64': 8,
    'double': 8,
}
|
||||
|
||||
|
||||
class StreamsBenchmark(bench.Benchmark):
    """
    Benchmark skeleton parameterised by numbers of read, read-write and write
    streams.

    Work in progress: build_ir currently emits only an empty counting loop; the
    stream parameters are stored but not used when generating the IR.
    """

    def __init__(self,
                 read_streams=0, read_write_streams=0, write_streams=0,
                 stream_byte_length=0,
                 element_type='i64'):
        super().__init__()
        self.read_streams = read_streams
        self.read_write_streams = read_write_streams
        self.write_streams = write_streams
        self.stream_byte_length = stream_byte_length
        self.element_type = element_type

    def build_ir(self, iaca_marker=False):
        """
        Return the LLVM IR of the benchmark function @"test"(i64 %"N").

        :param iaca_marker: if true, inline-asm marker sequences are placed at the
                            top of the loop body and in the end block
        """
        if iaca_marker:
            iaca_start_marker = textwrap.dedent('''\
                call void asm "movl $$111,%ebx", ""()
                call void asm ".byte 100,103,144", ""()''')
            iaca_stop_marker = textwrap.dedent('''\
                call void asm "movl $$222,%ebx", ""()
                call void asm ".byte 100,103,144", ""()''')
        else:
            iaca_start_marker = ''
            iaca_stop_marker = ''

        # Loop skeleton; pointer_arguments and loop_body are still filled with
        # empty strings below.
        ir = textwrap.dedent('''\
            define i64 @"test"(i64 %"N"{pointer_arguments})
            {{
            entry:
            %"loop_cond" = icmp slt i64 0, %"N"
            br i1 %"loop_cond", label %"loop", label %"end"

            loop:
            %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {iaca_start_marker}
            {loop_body}
            %"loop_counter.1" = add i64 %"loop_counter", 1
            %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
            br i1 %"loop_cond.1", label %"loop", label %"end"

            end:
            %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
            {iaca_stop_marker}
            ret i64 %"ret"
            }}
            ''').format(
                pointer_arguments='',
                loop_body='',
                iaca_start_marker=iaca_start_marker,
                iaca_stop_marker=iaca_stop_marker)

        return ir
|
||||
|
||||
if __name__ == '__main__':
    # Run a default StreamsBenchmark once and print whatever build_and_execute returns.
    bench.setup_llvm()
    sb = StreamsBenchmark()
    print(sb.build_and_execute())
|
||||
|
@@ -1,3 +0,0 @@
|
||||
#!/bin/sh
# Build the LLVM C API example: compile test.c with clang, then link with clang++.
# Stop at the first failing command, so a failed compile is not followed by a
# link against a stale or missing test.o.
set -e

# The llvm-config output is left unquoted on purpose: it is a list of flags that
# has to be split into separate arguments.
clang -g $(llvm-config --cflags) test.c -c
clang++ test.o $(llvm-config --cxxflags --ldflags --libs --system-libs all) -o test
|
BIN
c_api/test
BIN
c_api/test
Binary file not shown.
72
c_api/test.c
72
c_api/test.c
@@ -1,72 +0,0 @@
|
||||
/**
|
||||
* LLVM equivalent of:
|
||||
*
|
||||
* int sum(int a, int b) {
|
||||
* return a + b;
|
||||
* }
|
||||
*/
|
||||
|
||||
#include <llvm-c/Core.h>
|
||||
#include <llvm-c/ExecutionEngine.h>
|
||||
#include <llvm-c/Target.h>
|
||||
#include <llvm-c/Analysis.h>
|
||||
#include <llvm-c/BitWriter.h>
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
LLVMModuleRef mod = LLVMModuleCreateWithName("my_module");
|
||||
|
||||
LLVMTypeRef param_types[] = { LLVMInt32Type(), LLVMInt32Type() };
|
||||
LLVMTypeRef ret_type = LLVMFunctionType(LLVMInt32Type(), param_types, 2, 0);
|
||||
LLVMValueRef sum = LLVMAddFunction(mod, "sum", ret_type);
|
||||
|
||||
LLVMBasicBlockRef entry = LLVMAppendBasicBlock(sum, "entry");
|
||||
|
||||
LLVMBuilderRef builder = LLVMCreateBuilder();
|
||||
LLVMPositionBuilderAtEnd(builder, entry);
|
||||
LLVMValueRef tmp = LLVMBuildAdd(builder, LLVMGetParam(sum, 0), LLVMGetParam(sum, 1), "tmp");
|
||||
LLVMBuildRet(builder, tmp);
|
||||
|
||||
char *error = NULL;
|
||||
LLVMVerifyModule(mod, LLVMAbortProcessAction, &error);
|
||||
LLVMDisposeMessage(error);
|
||||
|
||||
LLVMExecutionEngineRef engine;
|
||||
error = NULL;
|
||||
LLVMLinkInMCJIT();
|
||||
LLVMInitializeNativeTarget();
|
||||
if (LLVMCreateExecutionEngineForModule(&engine, mod, &error) != 0) {
|
||||
fprintf(stderr, "failed to create execution engine\n");
|
||||
abort();
|
||||
}
|
||||
if (error) {
|
||||
fprintf(stderr, "error: %s\n", error);
|
||||
LLVMDisposeMessage(error);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "usage: %s x y\n", argv[0]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
long long x = strtoll(argv[1], NULL, 10);
|
||||
long long y = strtoll(argv[2], NULL, 10);
|
||||
|
||||
LLVMGenericValueRef args[] = {
|
||||
LLVMCreateGenericValueOfInt(LLVMInt32Type(), x, 0),
|
||||
LLVMCreateGenericValueOfInt(LLVMInt32Type(), y, 0)
|
||||
};
|
||||
LLVMGenericValueRef res = LLVMRunFunction(engine, sum, 2, args);
|
||||
printf("%d\n", (int)LLVMGenericValueToInt(res, 0));
|
||||
|
||||
// Write out bitcode to file
|
||||
if (LLVMWriteBitcodeToFile(mod, "sum.bc") != 0) {
|
||||
fprintf(stderr, "error writing bitcode to file, skipping\n");
|
||||
}
|
||||
|
||||
LLVMDisposeBuilder(builder);
|
||||
LLVMDisposeExecutionEngine(engine);
|
||||
}
|
BIN
c_api/test.o
BIN
c_api/test.o
Binary file not shown.
@@ -1,37 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import llvmlite.binding as llvm
|
||||
|
||||
# Start LLVM before any other binding call is made.
llvm.initialize()
# Disabled experiments with LLVM command-line options.  The -mattr list below
# was obtained with:
# >>> cp = (ctypes.c_char_p * 1)()
# >>> ffi.lib.LLVMPY_GetHostCPUFeatures(cp)
# >>> print(cp[0])
# llvm.set_option('', '-mattr=+sse2,+cx16,-tbm,-avx512ifma,-avx512dq,-fma4,+prfchw,+bmi2,+xsavec,+fsgsbase,+popcnt,+aes,+xsaves,-avx512er,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-xop,+rdseed,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vl,-avx512cd,+avx,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,+cmov,-avx512vbmi,+movbe,+xsaveopt,-sha,+adx,-avx512pf,+sse3')
# llvm.set_option('', '-march=native')
# llvm.set_option('', '-mcpu=native')
# llvm.set_option('', '-version')
# llvm.set_option('', '-help-list-hidden')
# Register the native target plus its assembly printer and parser.
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
llvm.initialize_native_asmparser()
# llvm.set_option('', '-help-list-hidden')

# One function whose body is a single inline-asm `vaddpd` applied to two
# constant <4 x double> vectors.
testv_ir = '''

target triple = "x86_64-apple-darwin17.5.0"

define <4 x double> @testv(i32**, i32) {

%out = tail call <4 x double> asm "vaddpd $1, $2, $0", "=x,x,x,~{dirflag},~{fpsr},~{flags}"(<4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>, <4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>)
ret <4 x double> %out
}
'''

# Parse and sanity-check the module.
testv_module = llvm.parse_assembly(testv_ir)
testv_module.verify()

# Build a target machine for the default triple, tuned to the host CPU name
# and feature set reported by LLVM.
host_features = llvm.get_host_cpu_features().flatten()
host_cpu = llvm.get_host_cpu_name()
native_target = llvm.Target.from_default_triple()
machine = native_target.create_target_machine(cpu=host_cpu, features=host_features)

# JIT the module, then print the assembly generated for it.
# NOTE(review): indentation was lost in the source; the print is kept inside
# the `with` so the module is still attached to a live engine - confirm.
with llvm.create_mcjit_compiler(testv_module, machine) as engine:
    engine.finalize_object()
    print(machine.emit_assembly(testv_module))
|
BIN
dev_test/a.out
BIN
dev_test/a.out
Binary file not shown.
@@ -1,55 +0,0 @@
|
||||
taschenbuch:pyasmjit codemonk$ clang -P - -march=native -### 2>&1|grep -E --color -o -- '"-target-feature" "[^"]+"'
|
||||
"-target-feature" "+sse2"
|
||||
"-target-feature" "+cx16"
|
||||
"-target-feature" "-tbm"
|
||||
"-target-feature" "-avx512ifma"
|
||||
"-target-feature" "-avx512dq"
|
||||
"-target-feature" "-fma4"
|
||||
"-target-feature" "+prfchw"
|
||||
"-target-feature" "+bmi2"
|
||||
"-target-feature" "+xsavec"
|
||||
"-target-feature" "+fsgsbase"
|
||||
"-target-feature" "+popcnt"
|
||||
"-target-feature" "+aes"
|
||||
"-target-feature" "+xsaves"
|
||||
"-target-feature" "-avx512er"
|
||||
"-target-feature" "-avx512vpopcntdq"
|
||||
"-target-feature" "-clwb"
|
||||
"-target-feature" "-avx512f"
|
||||
"-target-feature" "-clzero"
|
||||
"-target-feature" "-pku"
|
||||
"-target-feature" "+mmx"
|
||||
"-target-feature" "-lwp"
|
||||
"-target-feature" "-xop"
|
||||
"-target-feature" "+rdseed"
|
||||
"-target-feature" "-sse4a"
|
||||
"-target-feature" "-avx512bw"
|
||||
"-target-feature" "+clflushopt"
|
||||
"-target-feature" "+xsave"
|
||||
"-target-feature" "-avx512vl"
|
||||
"-target-feature" "-avx512cd"
|
||||
"-target-feature" "+avx"
|
||||
"-target-feature" "+rtm"
|
||||
"-target-feature" "+fma"
|
||||
"-target-feature" "+bmi"
|
||||
"-target-feature" "+rdrnd"
|
||||
"-target-feature" "-mwaitx"
|
||||
"-target-feature" "+sse4.1"
|
||||
"-target-feature" "+sse4.2"
|
||||
"-target-feature" "+avx2"
|
||||
"-target-feature" "+sse"
|
||||
"-target-feature" "+lzcnt"
|
||||
"-target-feature" "+pclmul"
|
||||
"-target-feature" "-prefetchwt1"
|
||||
"-target-feature" "+f16c"
|
||||
"-target-feature" "+ssse3"
|
||||
"-target-feature" "+sgx"
|
||||
"-target-feature" "+cmov"
|
||||
"-target-feature" "-avx512vbmi"
|
||||
"-target-feature" "+movbe"
|
||||
"-target-feature" "+xsaveopt"
|
||||
"-target-feature" "-sha"
|
||||
"-target-feature" "+adx"
|
||||
"-target-feature" "-avx512pf"
|
||||
"-target-feature" "+sse3"
|
||||
taschenbuch:pyasmjit codemonk$
|
@@ -1,22 +0,0 @@
|
||||
; i64 test(i64 N)
;
; Runs a loop of N iterations; each iteration executes two dependent x86
; `add $1` inline-asm instructions on a 32-bit loop-carried value.
; Returns 0 when N <= 0, otherwise the counter of the last iteration (N - 1).
;
; The asm constraints are "=r,0,i": register output, first input tied to the
; output register, second input an immediate.  `add` also writes EFLAGS, so the
; flags clobber is declared; without it the code generator is free to place an
; asm statement between the loop compare and its conditional branch.
define i64 @test(i64 %N)
{
entry:
  ; Enter the loop only if 0 < N (signed compare).
  %loop_cond = icmp slt i64 0, %N
  br i1 %loop_cond, label %loop, label %end

loop:
  %loop_counter = phi i64 [0, %entry], [%loop_counter.1, %loop]
  ; Loop-carried asm operand: 3 on entry, afterwards the second add's result.
  %in.0 = phi i32 [3, %entry], [%out.0, %loop]


  %reg.0 = call i32 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i32 %in.0, i32 1)
  %out.0 = call i32 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i32 %reg.0, i32 1)
  %loop_counter.1 = add i64 %loop_counter, 1
  %loop_cond.1 = icmp slt i64 %loop_counter.1, %N
  br i1 %loop_cond.1, label %loop, label %end

end:
  ; 0 when the loop was skipped, else the pre-increment counter at exit.
  %ret = phi i64 [0, %entry], [%loop_counter, %loop]

  ret i64 %ret
}
|
BIN
dev_test/fail.o
BIN
dev_test/fail.o
Binary file not shown.
@@ -1,35 +0,0 @@
|
||||
## Generated x86-64 (AT&T syntax, Mach-O) code for `test`.
##   In:  %rdi = N
##   Out: %rax = 0 if N <= 0, otherwise the value computed by the loop below
##   Loop registers: %ecx = inline-asm operand (starts at 3),
##                   %rdx = loop counter minus one (starts at -1)
        .section        __TEXT,__text,regular,pure_instructions
        .macosx_version_min 10, 13
        .globl  _test                   ## -- Begin function test
        .p2align        4, 0x90
_test:                                  ## @test
        .cfi_startproc
## %bb.0:                               ## %entry
        testq   %rdi, %rdi
        jle     LBB0_1                  ## N <= 0: skip the loop
## %bb.2:                               ## %loop.preheader
        movl    $3, %ecx
        movq    $-1, %rdx
        .p2align        4, 0x90
LBB0_3:                                 ## %loop
                                        ## =>This Inner Loop Header: Depth=1
        ## InlineAsm Start
        addl    $1, %ecx
        ## InlineAsm End
        leaq    1(%rdx), %rax           ## rax = current counter (return value)
        addq    $2, %rdx                ## rdx = counter + 1
        cmpq    %rdi, %rdx              ## compare counter + 1 with N
        movq    %rax, %rdx              ## rdx = counter; mov leaves flags alone
        ## NOTE: the addl below rewrites EFLAGS after the cmpq above, so the jl
        ## that follows tests the flags of the add, not of the loop comparison.
        ## InlineAsm Start
        addl    $1, %ecx
        ## InlineAsm End
        jl      LBB0_3
## %bb.4:                               ## %end
        retq
LBB0_1:
        xorl    %eax, %eax              ## return 0
        retq
        .cfi_endproc
                                        ## -- End function

.subsections_via_symbols
|
@@ -1,67 +0,0 @@
|
||||
; i64 test(i64 N)
;
; Runs a loop of N iterations over eight loop-carried i64 values
; (in.0 .. in.7), each advanced by chains of dependent x86 `add $1` inline-asm
; instructions: chains 0-3 contain eight adds, chain 4 seven, chains 5-7 one.
; Returns -1 when N <= 0, otherwise the counter of the last iteration (N - 1).
;
; The asm constraints are "=r,0,i": register output, first input tied to the
; output register, second input an immediate.  `add` also writes EFLAGS, so the
; flags clobber is declared; without it the code generator is free to place an
; asm statement between the loop compare and its conditional branch.
define i64 @test(i64 %N)
{
entry:
  ; Enter the loop only if 0 < N (signed compare).
  %loop_cond = icmp slt i64 0, %N
  br i1 %loop_cond, label %loop, label %end

loop:
  %loop_counter = phi i64 [0, %entry], [%loop_counter.1, %loop]
  ; One phi per chain: 1 on entry, afterwards that chain's last result.
  %in.0 = phi i64 [1, %entry], [%out.0, %loop]
  %in.1 = phi i64 [1, %entry], [%out.1, %loop]
  %in.2 = phi i64 [1, %entry], [%out.2, %loop]
  %in.3 = phi i64 [1, %entry], [%out.3, %loop]
  %in.4 = phi i64 [1, %entry], [%out.4, %loop]
  %in.5 = phi i64 [1, %entry], [%out.5, %loop]
  %in.6 = phi i64 [1, %entry], [%out.6, %loop]
  %in.7 = phi i64 [1, %entry], [%out.7, %loop]

  ; Chain 0
  %reg.0 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.0, i64 1)
  %reg.1 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.0, i64 1)
  %reg.2 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.1, i64 1)
  %reg.3 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.2, i64 1)
  %reg.4 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.3, i64 1)
  %reg.5 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.4, i64 1)
  %reg.6 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.5, i64 1)
  %out.0 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.6, i64 1)
  ; Chain 1
  %reg.7 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.1, i64 1)
  %reg.8 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.7, i64 1)
  %reg.9 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.8, i64 1)
  %reg.10 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.9, i64 1)
  %reg.11 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.10, i64 1)
  %reg.12 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.11, i64 1)
  %reg.13 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.12, i64 1)
  %out.1 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.13, i64 1)
  ; Chain 2
  %reg.14 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.2, i64 1)
  %reg.15 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.14, i64 1)
  %reg.16 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.15, i64 1)
  %reg.17 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.16, i64 1)
  %reg.18 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.17, i64 1)
  %reg.19 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.18, i64 1)
  %reg.20 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.19, i64 1)
  %out.2 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.20, i64 1)
  ; Chain 3
  %reg.21 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.3, i64 1)
  %reg.22 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.21, i64 1)
  %reg.23 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.22, i64 1)
  %reg.24 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.23, i64 1)
  %reg.25 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.24, i64 1)
  %reg.26 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.25, i64 1)
  %reg.27 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.26, i64 1)
  %out.3 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.27, i64 1)
  ; Chain 4 (seven adds)
  %reg.28 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.4, i64 1)
  %reg.29 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.28, i64 1)
  %reg.30 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.29, i64 1)
  %reg.31 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.30, i64 1)
  %reg.32 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.31, i64 1)
  %reg.33 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.32, i64 1)
  %out.4 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %reg.33, i64 1)
  ; Chains 5-7 (one add each)
  %out.5 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.5, i64 1)
  %out.6 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.6, i64 1)
  %out.7 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.7, i64 1)
  %loop_counter.1 = add i64 %loop_counter, 1
  %loop_cond.1 = icmp slt i64 %loop_counter.1, %N
  br i1 %loop_cond.1, label %loop, label %end

end:
  ; -1 when the loop was skipped, else the pre-increment counter at exit.
  %ret = phi i64 [-1, %entry], [%loop_counter, %loop]
  ret i64 %ret
}
|
Binary file not shown.
@@ -1,6 +0,0 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Routine under test; its definition is not part of this file.  The prototype
 * spells out the `int`-returning function an implicit declaration would have
 * assumed, so the call below keeps its calling convention but now compiles
 * as C99 and later, where implicit declarations are not allowed.
 * NOTE(review): confirm the parameter and return types against the actual
 * definition of test().
 */
int test(int);

/* Call test() once with the argument 100 and print the returned value. */
int main(void) {
    printf("%d\n", test(100));
    return 0;
}
|
@@ -1,95 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import llvmlite.binding as llvm
|
||||
import ctypes
|
||||
|
||||
# Start LLVM and register the native target with its assembly printer/parser.
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
llvm.initialize_native_asmparser()

# IR for `i64 test(i64 N)`: a loop of N iterations over eight loop-carried
# values, each advanced by chains of dependent inline-asm `add $1`
# instructions.  It returns -1 if the loop is skipped, otherwise the counter
# of the last iteration (N - 1).
#
# `add` writes EFLAGS, so every asm statement declares the flags clobber.
# Without it the code generator may place an asm statement between the loop
# compare and its branch, which ends the loop after the first iteration
# (the `ret == 0` case checked at the bottom).
code = '''define i64 @"test"(i64 %"N")
{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"

loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
%in.0 = phi i64 [1, %"entry"], [%out.0, %"loop"]
%in.1 = phi i64 [1, %"entry"], [%out.1, %"loop"]
%in.2 = phi i64 [1, %"entry"], [%out.2, %"loop"]
%in.3 = phi i64 [1, %"entry"], [%out.3, %"loop"]
%in.4 = phi i64 [1, %"entry"], [%out.4, %"loop"]
%in.5 = phi i64 [1, %"entry"], [%out.5, %"loop"]
%in.6 = phi i64 [1, %"entry"], [%out.6, %"loop"]
%in.7 = phi i64 [1, %"entry"], [%out.7, %"loop"]

%"reg.0" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.0, i64 1)
%"reg.1" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.0", i64 1)
%"reg.2" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.1", i64 1)
%"reg.3" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.2", i64 1)
%"reg.4" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.3", i64 1)
%"reg.5" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.4", i64 1)
%"reg.6" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.5", i64 1)
%out.0 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.6", i64 1)
%"reg.7" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.1, i64 1)
%"reg.8" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.7", i64 1)
%"reg.9" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.8", i64 1)
%"reg.10" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.9", i64 1)
%"reg.11" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.10", i64 1)
%"reg.12" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.11", i64 1)
%"reg.13" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.12", i64 1)
%out.1 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.13", i64 1)
%"reg.14" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.2, i64 1)
%"reg.15" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.14", i64 1)
%"reg.16" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.15", i64 1)
%"reg.17" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.16", i64 1)
%"reg.18" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.17", i64 1)
%"reg.19" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.18", i64 1)
%"reg.20" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.19", i64 1)
%out.2 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.20", i64 1)
%"reg.21" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.3, i64 1)
%"reg.22" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.21", i64 1)
%"reg.23" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.22", i64 1)
%"reg.24" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.23", i64 1)
%"reg.25" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.24", i64 1)
%"reg.26" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.25", i64 1)
%"reg.27" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.26", i64 1)
%out.3 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.27", i64 1)
%"reg.28" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.4, i64 1)
%"reg.29" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.28", i64 1)
%"reg.30" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.29", i64 1)
%"reg.31" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.30", i64 1)
%"reg.32" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.31", i64 1)
%"reg.33" = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.32", i64 1)
%out.4 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %"reg.33", i64 1)
%out.5 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.5, i64 1)
%out.6 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.6, i64 1)
%out.7 = call i64 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i64 %in.7, i64 1)
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"

end:
%"ret" = phi i64 [-1, %"entry"], [%"loop_counter", %"loop"]
ret i64 %"ret"
}'''

# Parse and verify the IR, then JIT it for the host CPU at optimisation level 3.
llvm_module = llvm.parse_assembly(code)
llvm_module.verify()
tm = llvm.Target.from_default_triple().create_target_machine(
    features=llvm.get_host_cpu_features().flatten(),
    cpu=llvm.get_host_cpu_name(),
    opt=3)
ee = llvm.create_mcjit_compiler(llvm_module, tm)
ee.finalize_object()

# Wrap the compiled `test` as a C function taking and returning int64.
cfptr = ee.get_function_address('test')
cfunc = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)(cfptr)
N = 100
ret = cfunc(N)

# A correct loop returns N - 1; 0 means it stopped after a single iteration.
print(ret)
if ret == 0:
    print("FAIL")
elif ret == N-1:
    print("Probably good.")
|
BIN
dev_test/main.bc
BIN
dev_test/main.bc
Binary file not shown.
@@ -1,6 +0,0 @@
|
||||
#include <stdio.h>
|
||||
/*
 * Routine under test; defined outside this file.
 * NOTE(review): confirm this prototype matches the definition being linked.
 */
int test(int);

/* Call test() once with a fixed argument and print the result on its own line. */
int main(void) {
    const int result = test(123123123);

    printf("%d\n", result);
    return 0;
}
|
@@ -1,32 +0,0 @@
|
||||
## Generated x86-64 (AT&T syntax, Mach-O) code for main():
## calls _test with 123123123 and passes the 32-bit result to printf("%d\n").
        .section        __TEXT,__text,regular,pure_instructions
        .macosx_version_min 10, 13
        .globl  _main                   ## -- Begin function main
        .p2align        4, 0x90
_main:                                  ## @main
        .cfi_startproc
## %bb.0:
        pushq   %rbp                    ## frame-pointer prologue
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        subq    $16, %rsp               ## 16 bytes of locals
        movl    $0, -4(%rbp)            ## zero the stack slot at -4(%rbp)
        movl    $123123123, %edi        ## imm = 0x756B5B3
        callq   _test
        leaq    L_.str(%rip), %rdi      ## arg 1: format string
        movl    %eax, %esi              ## arg 2: value returned by _test
        movb    $0, %al                 ## variadic call: no vector registers used
        callq   _printf
        xorl    %eax, %eax              ## return 0
        addq    $16, %rsp
        popq    %rbp
        retq
        .cfi_endproc
                                        ## -- End function
        .section        __TEXT,__cstring,cstring_literals
L_.str:                                 ## @.str
        .asciz  "%d\n"


.subsections_via_symbols
|
@@ -1,53 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import ctypes
|
||||
|
||||
import llvmlite.binding as llvm
|
||||
|
||||
|
||||
# Start LLVM and register the native target with its assembly printer/parser.
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
llvm.initialize_native_asmparser()

# IR for `i64 test(i64 N)`: N loop iterations, each running two dependent
# inline-asm `add $1` instructions on a 32-bit loop-carried value.  It returns
# 0 if the loop is skipped, otherwise the counter of the last iteration (N - 1).
#
# `add` writes EFLAGS, so both asm statements declare the flags clobber.
# Without it the code generator may place an asm statement between the loop
# compare and its branch, making the branch test the wrong flags.
code = """
define i64 @"test"(i64 %"N")
{
entry:
%"loop_cond" = icmp slt i64 0, %"N"
br i1 %"loop_cond", label %"loop", label %"end"

loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
%"in.0" = phi i32 [3, %"entry"], [%"out.0", %"loop"]


%"reg.0" = call i32 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i32 %"in.0", i32 1)
%"out.0" = call i32 asm "add $2, $0", "=r,0,i,~{dirflag},~{fpsr},~{flags}" (i32 %"reg.0", i32 1)
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"

end:
%"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]

ret i64 %"ret"
}
"""

# NOTE(review): computed but never passed to create_target_machine below -
# confirm whether `features=features` was intended.
features = llvm.get_host_cpu_features().flatten()
# znver1 on naples and skylake-avx512 on skylake-sp
for cpu in ["skylake-avx512", "znver1"]:
    # Generate code for the named CPU model at optimisation level 2.
    tm = llvm.Target.from_default_triple().create_target_machine(
        cpu=cpu, opt=2)
    tm.set_asm_verbosity(0)

    # A fresh module per CPU: the module is handed to a new JIT engine below.
    module = llvm.parse_assembly(code)
    asm = tm.emit_assembly(module)
    print(asm)
    with llvm.create_mcjit_compiler(module, tm) as ee:
        ee.finalize_object()
        # Wrap the compiled `test` as a C function taking and returning int64.
        cfptr = ee.get_function_address('test')
        cfunc = ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)(cfptr)
        print('->', cfunc(100000))
|
||||
|
||||
|
1891070
dev_test/tblgen_output
1891070
dev_test/tblgen_output
File diff suppressed because it is too large
Load Diff
431278
dev_test/tblgen_output_ARM
431278
dev_test/tblgen_output_ARM
File diff suppressed because it is too large
Load Diff
BIN
dist/asmbench-0.1.0.tar.gz
vendored
BIN
dist/asmbench-0.1.0.tar.gz
vendored
Binary file not shown.
BIN
dist/asmbench-0.1.1.1.tar.gz
vendored
BIN
dist/asmbench-0.1.1.1.tar.gz
vendored
Binary file not shown.
BIN
dist/asmbench-0.1.1.2.tar.gz
vendored
BIN
dist/asmbench-0.1.1.2.tar.gz
vendored
Binary file not shown.
BIN
dist/asmbench-0.1.1.3.tar.gz
vendored
BIN
dist/asmbench-0.1.1.3.tar.gz
vendored
Binary file not shown.
BIN
dist/asmbench-0.1.1.tar.gz
vendored
BIN
dist/asmbench-0.1.1.tar.gz
vendored
Binary file not shown.
BIN
dist/asmbench-0.1.2.tar.gz
vendored
BIN
dist/asmbench-0.1.2.tar.gz
vendored
Binary file not shown.
BIN
dist/asmbench-0.1.3.tar.gz
vendored
BIN
dist/asmbench-0.1.3.tar.gz
vendored
Binary file not shown.
BIN
dist/asmbench-0.1.4.tar.gz
vendored
BIN
dist/asmbench-0.1.4.tar.gz
vendored
Binary file not shown.
BIN
dist/asmjit-0.1.1.tar.gz
vendored
BIN
dist/asmjit-0.1.1.tar.gz
vendored
Binary file not shown.
BIN
dist/asmjit-0.1.2.tar.gz
vendored
BIN
dist/asmjit-0.1.2.tar.gz
vendored
Binary file not shown.
BIN
dist/asmjit-0.1.tar.gz
vendored
BIN
dist/asmjit-0.1.tar.gz
vendored
Binary file not shown.
Submodule doc/asmbench-SC18SRC-poster deleted from 89206c1415
6
min.ll
6
min.ll
@@ -1,6 +0,0 @@
|
||||
; <4 x double> testv(i32**, i32)
;
; Minimal inline-asm example: both (unnamed) parameters are ignored and the
; result is a single `vaddpd` of two identical constant vectors.
; Constraints "=x,x,x": the output and both inputs use "x"-class registers;
; dirflag, fpsr and flags are declared as clobbered.
define <4 x double> @testv(i32**, i32) {

  %out = tail call <4 x double> asm "vaddpd $1, $2, $0", "=x,x,x,~{dirflag},~{fpsr},~{flags}"(<4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>, <4 x double> <double 0.123, double 0.123, double 0.123, double 0.123>)
  ret <4 x double> %out
}
|
||||
|
21
min.s
21
min.s
@@ -1,21 +0,0 @@
|
||||
## Generated x86-64 (AT&T syntax, Mach-O) code for `testv`:
## broadcasts one 8-byte constant into %ymm0, runs the inline-asm vaddpd on
## it, and returns with the result left in %ymm0.
        .section        __TEXT,__text,regular,pure_instructions
        .macosx_version_min 10, 13
        .section        __TEXT,__literal8,8byte_literals
        .p2align        3               ## -- Begin function testv
LCPI0_0:
        .quad   4593527504729830064     ## 0x3fbf7ced916872b0
        .section        __TEXT,__text,regular,pure_instructions
        .globl  _testv
        .p2align        4, 0x90
_testv:                                 ## @testv
        .cfi_startproc
## BB#0:
        ## Both asm inputs are the same constant, so one register serves both.
        vbroadcastsd    LCPI0_0(%rip), %ymm0 ## ymm0 = [4593527504729830064,4593527504729830064,4593527504729830064,4593527504729830064]
        ## InlineAsm Start
        vaddpd  %ymm0, %ymm0, %ymm0
        ## InlineAsm End
        retq
        .cfi_endproc
                                        ## -- End function

.subsections_via_symbols
|
450
random_pf1.txt
450
random_pf1.txt
@@ -1,450 +0,0 @@
|
||||
## Selected Instructions
|
||||
VPERMILPSri, MULPSrr, ANDPDrr, VPSIGNBrr, PSIGNBrr, PMOVZXWDrr, PMINUWrr, PADDSWrr, VPSHUFHWri, MOVUPDrr
|
||||
## Generated Assembly (1x parallel)
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.section __TEXT,__literal4,4byte_literals
|
||||
.p2align 2
|
||||
LCPI0_0:
|
||||
.long 1065361408
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.globl _test
|
||||
.p2align 4, 0x90
|
||||
_test:
|
||||
.cfi_startproc
|
||||
testq %rdi, %rdi
|
||||
jle LBB0_1
|
||||
movabsq $LCPI0_0, %rax
|
||||
vbroadcastss (%rax), %xmm0
|
||||
movq $-1, %rcx
|
||||
.p2align 4, 0x90
|
||||
LBB0_3:
|
||||
## InlineAsm Start
|
||||
vpermilps $1, %xmm0, %xmm0
|
||||
mulps %xmm0, %xmm0
|
||||
andpd %xmm0, %xmm0
|
||||
vpsignb %xmm0, %xmm0, %xmm0
|
||||
psignb %xmm0, %xmm0
|
||||
pmovzxwd %xmm0, %xmm0
|
||||
pminuw %xmm0, %xmm0
|
||||
paddsw %xmm0, %xmm0
|
||||
vpshufhw $1, %xmm0, %xmm0
|
||||
movupd %xmm0, %xmm0
|
||||
## InlineAsm End
|
||||
leaq 1(%rcx), %rax
|
||||
addq $2, %rcx
|
||||
cmpq %rdi, %rcx
|
||||
movq %rax, %rcx
|
||||
jl LBB0_3
|
||||
retq
|
||||
LBB0_1:
|
||||
xorl %eax, %eax
|
||||
retq
|
||||
.cfi_endproc
|
||||
|
||||
|
||||
.subsections_via_symbols
|
||||
|
||||
## Detailed Results
|
||||
{'arguments': (24655919,),
|
||||
'frequency': 2600000000.0,
|
||||
'iterations': 24655919,
|
||||
'parallel_factor': 1,
|
||||
'returned': [24655918, 24655918, 24655918, 24655918],
|
||||
'runtimes': [0.13202582497615367,
|
||||
0.13208268792368472,
|
||||
0.13151856907643378,
|
||||
0.13161470007617027]}
|
||||
minimal throughput: 13.87 cy
|
||||
## Selected Instructions
|
||||
VFMADD132PDYr, VPADDWYrr, VFMADD132PSYr, VPADDDYrr, VSUBPDYrr, VPACKUSDWYrr, VPMULHUWYrr, VMINPDYrr, VPUNPCKLWDYrr, VBLENDVPSYrr
|
||||
## Generated Assembly (1x parallel)
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.section __TEXT,__literal4,4byte_literals
|
||||
.p2align 2
|
||||
LCPI0_0:
|
||||
.long 1065361408
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.globl _test
|
||||
.p2align 4, 0x90
|
||||
_test:
|
||||
.cfi_startproc
|
||||
testq %rdi, %rdi
|
||||
jle LBB0_1
|
||||
movabsq $LCPI0_0, %rax
|
||||
vbroadcastss (%rax), %ymm0
|
||||
movq $-1, %rcx
|
||||
.p2align 4, 0x90
|
||||
LBB0_3:
|
||||
## InlineAsm Start
|
||||
vfmadd132pd %ymm0, %ymm0, %ymm0
|
||||
vpaddw %ymm0, %ymm0, %ymm0
|
||||
vfmadd132ps %ymm0, %ymm0, %ymm0
|
||||
vpaddd %ymm0, %ymm0, %ymm0
|
||||
vsubpd %ymm0, %ymm0, %ymm0
|
||||
vpackusdw %ymm0, %ymm0, %ymm0
|
||||
vpmulhuw %ymm0, %ymm0, %ymm0
|
||||
vminpd %ymm0, %ymm0, %ymm0
|
||||
vpunpcklwd %ymm0, %ymm0, %ymm0
|
||||
vblendvps %ymm0, %ymm0, %ymm0, %ymm0
|
||||
## InlineAsm End
|
||||
leaq 1(%rcx), %rax
|
||||
addq $2, %rcx
|
||||
cmpq %rdi, %rcx
|
||||
movq %rax, %rcx
|
||||
jl LBB0_3
|
||||
vzeroupper
|
||||
retq
|
||||
LBB0_1:
|
||||
xorl %eax, %eax
|
||||
retq
|
||||
.cfi_endproc
|
||||
|
||||
|
||||
.subsections_via_symbols
|
||||
|
||||
## Detailed Results
|
||||
{'arguments': (10000000,),
|
||||
'frequency': 2600000000.0,
|
||||
'iterations': 10000000,
|
||||
'parallel_factor': 1,
|
||||
'returned': [9999999, 9999999, 9999999, 9999999],
|
||||
'runtimes': [0.11892832000739872,
|
||||
0.11891822703182697,
|
||||
0.11902078497223556,
|
||||
0.12094117503147572]}
|
||||
minimal throughput: 30.92 cy
|
||||
## Selected Instructions
|
||||
VCVTSI642SDrr, VFMADD213SDr, DIVSDrr, VCVTSI642SDrr, MAXSDrr, VFNMADD213SDr, VFMADD132SDr, VMAXSDrr, VFNMADD132SDr, SQRTSDr
|
||||
## Generated Assembly (1x parallel)
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.section __TEXT,__literal8,8byte_literals
|
||||
.p2align 3
|
||||
LCPI0_0:
|
||||
.quad 4607186816846528512
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.globl _test
|
||||
.p2align 4, 0x90
|
||||
_test:
|
||||
.cfi_startproc
|
||||
testq %rdi, %rdi
|
||||
jle LBB0_1
|
||||
movq $-1, %rcx
|
||||
movabsq $LCPI0_0, %rax
|
||||
vmovsd (%rax), %xmm0
|
||||
movl $3, %edx
|
||||
.p2align 4, 0x90
|
||||
LBB0_3:
|
||||
## InlineAsm Start
|
||||
vcvtsi2sdq %rdx, %xmm0, %xmm0
|
||||
vfmadd213sd %xmm0, %xmm0, %xmm0
|
||||
divsd %xmm0, %xmm0
|
||||
vcvtsi2sdq %rdx, %xmm0, %xmm0
|
||||
maxsd %xmm0, %xmm0
|
||||
vfnmadd213sd %xmm0, %xmm0, %xmm0
|
||||
vfmadd132sd %xmm0, %xmm0, %xmm0
|
||||
vmaxsd %xmm0, %xmm0, %xmm0
|
||||
vfnmadd132sd %xmm0, %xmm0, %xmm0
|
||||
sqrtsd %xmm0, %xmm0
|
||||
## InlineAsm End
|
||||
leaq 1(%rcx), %rax
|
||||
addq $2, %rcx
|
||||
cmpq %rdi, %rcx
|
||||
movq %rax, %rcx
|
||||
jl LBB0_3
|
||||
retq
|
||||
LBB0_1:
|
||||
xorl %eax, %eax
|
||||
retq
|
||||
.cfi_endproc
|
||||
|
||||
|
||||
.subsections_via_symbols
|
||||
|
||||
## Detailed Results
|
||||
{'arguments': (5841530,),
|
||||
'frequency': 2600000000.0,
|
||||
'iterations': 5841530,
|
||||
'parallel_factor': 1,
|
||||
'returned': [5841529, 5841529, 5841529, 5841529],
|
||||
'runtimes': [0.13433505699504167,
|
||||
0.13318849296774715,
|
||||
0.13303690601605922,
|
||||
0.13309408095665276]}
|
||||
minimal throughput: 59.21 cy
|
||||
## Selected Instructions
|
||||
RCPSSr, VCVTSI2SSrr, MULSSrr, VCVTSD2SSrr, VROUNDSSr, VRCPSSr, VCVTSI2SSrr, VSQRTSSr, VFNMADD231SSr, VSQRTSSr
|
||||
## Generated Assembly (1x parallel)
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.section __TEXT,__literal4,4byte_literals
|
||||
.p2align 2
|
||||
LCPI0_0:
|
||||
.long 1065361408
|
||||
.section __TEXT,__literal8,8byte_literals
|
||||
.p2align 3
|
||||
LCPI0_1:
|
||||
.quad 4607186816846528512
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.globl _test
|
||||
.p2align 4, 0x90
|
||||
_test:
|
||||
.cfi_startproc
|
||||
testq %rdi, %rdi
|
||||
jle LBB0_1
|
||||
movq $-1, %rcx
|
||||
movabsq $LCPI0_0, %rax
|
||||
vmovss (%rax), %xmm1
|
||||
movl $3, %edx
|
||||
movabsq $LCPI0_1, %rax
|
||||
vmovsd (%rax), %xmm0
|
||||
.p2align 4, 0x90
|
||||
LBB0_3:
|
||||
## InlineAsm Start
|
||||
rcpss %xmm1, %xmm1
|
||||
vcvtsi2ssl %edx, %xmm1, %xmm1
|
||||
mulss %xmm1, %xmm1
|
||||
vcvtsd2ss %xmm0, %xmm1, %xmm1
|
||||
vroundss $1, %xmm1, %xmm1, %xmm1
|
||||
vrcpss %xmm1, %xmm1, %xmm1
|
||||
vcvtsi2ssl %edx, %xmm1, %xmm1
|
||||
vsqrtss %xmm1, %xmm1, %xmm1
|
||||
vfnmadd231ss %xmm1, %xmm1, %xmm1
|
||||
vsqrtss %xmm1, %xmm1, %xmm1
|
||||
## InlineAsm End
|
||||
leaq 1(%rcx), %rax
|
||||
addq $2, %rcx
|
||||
cmpq %rdi, %rcx
|
||||
movq %rax, %rcx
|
||||
jl LBB0_3
|
||||
retq
|
||||
LBB0_1:
|
||||
xorl %eax, %eax
|
||||
retq
|
||||
.cfi_endproc
|
||||
|
||||
|
||||
.subsections_via_symbols
|
||||
|
||||
## Detailed Results
|
||||
{'arguments': (6011291,),
|
||||
'frequency': 2600000000.0,
|
||||
'iterations': 6011291,
|
||||
'parallel_factor': 1,
|
||||
'returned': [6011290, 6011290, 6011290, 6011290],
|
||||
'runtimes': [0.13239118899218738,
|
||||
0.13244657206814736,
|
||||
0.1326694720191881,
|
||||
0.13262002903502434]}
|
||||
minimal throughput: 57.26 cy
|
||||
## Selected Instructions
|
||||
ROR16ri, CMOVS16rr, SBB16ri, ADC16ri8, XOR16ri8, BTR16rr, XOR16ri8, SAR16r1, DEC16r, SUB16ri
|
||||
## Generated Assembly (1x parallel)
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.globl _test
|
||||
.p2align 4, 0x90
|
||||
_test:
|
||||
.cfi_startproc
|
||||
testq %rdi, %rdi
|
||||
jle LBB0_1
|
||||
movw $3, %cx
|
||||
movq $-1, %rdx
|
||||
.p2align 4, 0x90
|
||||
LBB0_3:
|
||||
## InlineAsm Start
|
||||
rorw %cx
|
||||
cmovsw %cx, %cx
|
||||
sbbw $1, %cx
|
||||
adcw $1, %cx
|
||||
xorw $1, %cx
|
||||
btrw %cx, %cx
|
||||
xorw $1, %cx
|
||||
sarw %cx
|
||||
decw %cx
|
||||
subw $1, %cx
|
||||
## InlineAsm End
|
||||
leaq 1(%rdx), %rax
|
||||
addq $2, %rdx
|
||||
cmpq %rdi, %rdx
|
||||
movq %rax, %rdx
|
||||
jl LBB0_3
|
||||
retq
|
||||
LBB0_1:
|
||||
xorl %eax, %eax
|
||||
retq
|
||||
.cfi_endproc
|
||||
|
||||
|
||||
.subsections_via_symbols
|
||||
|
||||
## Detailed Results
|
||||
{'arguments': (31283731,),
|
||||
'frequency': 2600000000.0,
|
||||
'iterations': 31283731,
|
||||
'parallel_factor': 1,
|
||||
'returned': [31283730, 31283730, 31283730, 31283730],
|
||||
'runtimes': [0.13291946100071073,
|
||||
0.13294463406782597,
|
||||
0.1332225619116798,
|
||||
0.13287500606384128]}
|
||||
minimal throughput: 11.04 cy
|
||||
## Selected Instructions
|
||||
SHLX32rr, CMOVO32rr, MOV32rr, CMOVS32rr, CRC32r32r8, SHR32r1, ADD32rr, CRC32r32r8, RCR32ri, SHR32r1
|
||||
## Generated Assembly (1x parallel)
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.globl _test
|
||||
.p2align 4, 0x90
|
||||
_test:
|
||||
.cfi_startproc
|
||||
testq %rdi, %rdi
|
||||
jle LBB0_1
|
||||
movl $3, %esi
|
||||
movq $-1, %rdx
|
||||
movb $3, %cl
|
||||
.p2align 4, 0x90
|
||||
LBB0_3:
|
||||
## InlineAsm Start
|
||||
shlxl %esi, %esi, %eax
|
||||
cmovol %eax, %eax
|
||||
movl %eax, %esi
|
||||
cmovsl %esi, %esi
|
||||
crc32b %cl, %esi
|
||||
shrl %esi
|
||||
addl %esi, %esi
|
||||
crc32b %cl, %esi
|
||||
rcrl %esi
|
||||
shrl %esi
|
||||
## InlineAsm End
|
||||
leaq 1(%rdx), %rax
|
||||
addq $2, %rdx
|
||||
cmpq %rdi, %rdx
|
||||
movq %rax, %rdx
|
||||
jl LBB0_3
|
||||
retq
|
||||
LBB0_1:
|
||||
xorl %eax, %eax
|
||||
retq
|
||||
.cfi_endproc
|
||||
|
||||
|
||||
.subsections_via_symbols
|
||||
|
||||
## Detailed Results
|
||||
{'arguments': (24008543,),
|
||||
'frequency': 2600000000.0,
|
||||
'iterations': 24008543,
|
||||
'parallel_factor': 1,
|
||||
'returned': [24008542, 24008542, 24008542, 24008542],
|
||||
'runtimes': [0.13333229208365083,
|
||||
0.13314284407533705,
|
||||
0.13381975598167628,
|
||||
0.13447994901798666]}
|
||||
minimal throughput: 14.42 cy
|
||||
## Selected Instructions
|
||||
SHRX64rr, SBB64ri32, AND64ri8, MOV64rc, INC64r, SUB64ri32, POPCNT64rr, OR64ri8, BTS64rr, ROL64ri
|
||||
## Generated Assembly (1x parallel)
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.globl _test
|
||||
.p2align 4, 0x90
|
||||
_test:
|
||||
.cfi_startproc
|
||||
testq %rdi, %rdi
|
||||
jle LBB0_1
|
||||
movq $-1, %rcx
|
||||
movl $3, %edx
|
||||
.p2align 4, 0x90
|
||||
LBB0_3:
|
||||
## InlineAsm Start
|
||||
shrxq %rdx, %rdx, %rax
|
||||
sbbq $1, %rax
|
||||
andq $1, %rax
|
||||
movq %rax, %rax
|
||||
incq %rax
|
||||
subq $1, %rax
|
||||
popcntq %rax, %rdx
|
||||
orq $1, %rdx
|
||||
btsq %rdx, %rdx
|
||||
rolq %rdx
|
||||
## InlineAsm End
|
||||
leaq 1(%rcx), %rax
|
||||
addq $2, %rcx
|
||||
cmpq %rdi, %rcx
|
||||
movq %rax, %rcx
|
||||
jl LBB0_3
|
||||
retq
|
||||
LBB0_1:
|
||||
xorl %eax, %eax
|
||||
retq
|
||||
.cfi_endproc
|
||||
|
||||
|
||||
.subsections_via_symbols
|
||||
|
||||
## Detailed Results
|
||||
{'arguments': (27539225,),
|
||||
'frequency': 2600000000.0,
|
||||
'iterations': 27539225,
|
||||
'parallel_factor': 1,
|
||||
'returned': [27539224, 27539224, 27539224, 27539224],
|
||||
'runtimes': [0.1335972750093788,
|
||||
0.13322542910464108,
|
||||
0.13357082300353795,
|
||||
0.13376462296582758]}
|
||||
minimal throughput: 12.58 cy
|
||||
## Selected Instructions
|
||||
SAR8r1, SHR8ri, INC8r, AND8rr, RCR8ri, ROL8ri, SUB8ri, SBB8rr, NEG8r, NOT8r
|
||||
## Generated Assembly (1x parallel)
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.globl _test
|
||||
.p2align 4, 0x90
|
||||
_test:
|
||||
.cfi_startproc
|
||||
testq %rdi, %rdi
|
||||
jle LBB0_1
|
||||
movb $3, %cl
|
||||
movq $-1, %rdx
|
||||
.p2align 4, 0x90
|
||||
LBB0_3:
|
||||
## InlineAsm Start
|
||||
sarb %cl
|
||||
shrb %cl
|
||||
incb %cl
|
||||
andb %cl, %cl
|
||||
rcrb %cl
|
||||
rolb %cl
|
||||
subb $1, %cl
|
||||
sbbb %cl, %cl
|
||||
negb %cl
|
||||
notb %cl
|
||||
## InlineAsm End
|
||||
leaq 1(%rdx), %rax
|
||||
addq $2, %rdx
|
||||
cmpq %rdi, %rdx
|
||||
movq %rax, %rdx
|
||||
jl LBB0_3
|
||||
retq
|
||||
LBB0_1:
|
||||
xorl %eax, %eax
|
||||
retq
|
||||
.cfi_endproc
|
||||
|
||||
|
||||
.subsections_via_symbols
|
||||
|
||||
## Detailed Results
|
||||
{'arguments': (30431254,),
|
||||
'frequency': 2600000000.0,
|
||||
'iterations': 30431254,
|
||||
'parallel_factor': 1,
|
||||
'returned': [30431253, 30431253, 30431253, 30431253],
|
||||
'runtimes': [0.13894746906589717,
|
||||
0.1348069809610024,
|
||||
0.13318019802682102,
|
||||
0.13318415405228734]}
|
||||
minimal throughput: 11.38 cy
|
1138
random_pf10.txt
1138
random_pf10.txt
File diff suppressed because it is too large
Load Diff
BIN
tablegen.cprof
BIN
tablegen.cprof
Binary file not shown.
136
test.s
136
test.s
@@ -1,136 +0,0 @@
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.macosx_version_min 10, 13
|
||||
.globl _foo ## -- Begin function foo
|
||||
.p2align 4, 0x90
|
||||
_foo: ## @foo
|
||||
.cfi_startproc
|
||||
## BB#0:
|
||||
pushq %rbp
|
||||
Lcfi0:
|
||||
.cfi_def_cfa_offset 16
|
||||
Lcfi1:
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rsp, %rbp
|
||||
Lcfi2:
|
||||
.cfi_def_cfa_register %rbp
|
||||
xorl %eax, %eax
|
||||
testl %edi, %edi
|
||||
jle LBB0_2
|
||||
.p2align 4, 0x90
|
||||
LBB0_1: ## =>This Inner Loop Header: Depth=1
|
||||
## InlineAsm Start
|
||||
addl $23, %eax
|
||||
|
||||
## InlineAsm End
|
||||
## InlineAsm Start
|
||||
subl $13, %eax
|
||||
|
||||
## InlineAsm End
|
||||
## InlineAsm Start
|
||||
subl $10, %eax
|
||||
|
||||
## InlineAsm End
|
||||
incl %eax
|
||||
cmpl %edi, %eax
|
||||
jl LBB0_1
|
||||
LBB0_2:
|
||||
popq %rbp
|
||||
retq
|
||||
.cfi_endproc
|
||||
## -- End function
|
||||
.section __TEXT,__literal8,8byte_literals
|
||||
.p2align 3 ## -- Begin function benchmark
|
||||
LCPI1_0:
|
||||
.quad 4696837146684686336 ## double 1.0E+6
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.globl _benchmark
|
||||
.p2align 4, 0x90
|
||||
_benchmark: ## @benchmark
|
||||
.cfi_startproc
|
||||
## BB#0:
|
||||
pushq %rbp
|
||||
Lcfi3:
|
||||
.cfi_def_cfa_offset 16
|
||||
Lcfi4:
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rsp, %rbp
|
||||
Lcfi5:
|
||||
.cfi_def_cfa_register %rbp
|
||||
pushq %r14
|
||||
pushq %rbx
|
||||
subq $48, %rsp
|
||||
Lcfi6:
|
||||
.cfi_offset %rbx, -32
|
||||
Lcfi7:
|
||||
.cfi_offset %r14, -24
|
||||
movq %rsi, %r14
|
||||
movss %xmm0, -20(%rbp) ## 4-byte Spill
|
||||
movl %edi, %ebx
|
||||
leaq -56(%rbp), %rdi
|
||||
xorl %esi, %esi
|
||||
callq _gettimeofday
|
||||
movl %ebx, %edi
|
||||
callq *%r14
|
||||
leaq -40(%rbp), %rdi
|
||||
xorl %esi, %esi
|
||||
callq _gettimeofday
|
||||
movq -40(%rbp), %rax
|
||||
subq -56(%rbp), %rax
|
||||
cvtsi2sdq %rax, %xmm1
|
||||
movl -32(%rbp), %eax
|
||||
subl -48(%rbp), %eax
|
||||
xorps %xmm0, %xmm0
|
||||
cvtsi2sdl %eax, %xmm0
|
||||
mulsd LCPI1_0(%rip), %xmm0
|
||||
addsd %xmm1, %xmm0
|
||||
movss -20(%rbp), %xmm1 ## 4-byte Reload
|
||||
## xmm1 = mem[0],zero,zero,zero
|
||||
cvtss2sd %xmm1, %xmm1
|
||||
divsd %xmm1, %xmm0
|
||||
leaq L_.str(%rip), %rdi
|
||||
movb $1, %al
|
||||
callq _printf
|
||||
addq $48, %rsp
|
||||
popq %rbx
|
||||
popq %r14
|
||||
popq %rbp
|
||||
retq
|
||||
.cfi_endproc
|
||||
## -- End function
|
||||
.section __TEXT,__literal4,4byte_literals
|
||||
.p2align 2 ## -- Begin function main
|
||||
LCPI2_0:
|
||||
.long 1326386456 ## float 2.4E+9
|
||||
.section __TEXT,__text,regular,pure_instructions
|
||||
.globl _main
|
||||
.p2align 4, 0x90
|
||||
_main: ## @main
|
||||
.cfi_startproc
|
||||
## BB#0:
|
||||
pushq %rbp
|
||||
Lcfi8:
|
||||
.cfi_def_cfa_offset 16
|
||||
Lcfi9:
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rsp, %rbp
|
||||
Lcfi10:
|
||||
.cfi_def_cfa_register %rbp
|
||||
movq 8(%rsi), %rdi
|
||||
callq _atoi
|
||||
leaq _foo(%rip), %rsi
|
||||
movss LCPI2_0(%rip), %xmm0 ## xmm0 = mem[0],zero,zero,zero
|
||||
movl %eax, %edi
|
||||
callq _benchmark
|
||||
xorl %eax, %eax
|
||||
popq %rbp
|
||||
retq
|
||||
.cfi_endproc
|
||||
## -- End function
|
||||
.section __TEXT,__cstring,cstring_literals
|
||||
L_.str: ## @.str
|
||||
.asciz "%.3f (clock cycles)\n"
|
||||
|
||||
.comm _latency,8,3 ## @latency
|
||||
.comm _ninst,8,3 ## @ninst
|
||||
|
||||
.subsections_via_symbols
|
Binary file not shown.
@@ -1,39 +0,0 @@
|
||||
ADD32ri
|
||||
ADD64ri32
|
||||
CMP32rm
|
||||
CMP32rr
|
||||
CMP64ri32
|
||||
CMP64rr
|
||||
INC64r
|
||||
MOVSX64rm32
|
||||
SUB32ri
|
||||
VADDPDYrm
|
||||
VADDSDrm
|
||||
VADDSDrr
|
||||
VADDSSrr
|
||||
VCVTSI642SSrr
|
||||
VCVTSS2SIrr_Int
|
||||
VFMADD213PDYr
|
||||
VFMADD213PDr
|
||||
VFMADD213PSYr
|
||||
VFMADD213PSr
|
||||
VFMADD213SDr
|
||||
VFMADD213SSr
|
||||
VINSERTF128rr
|
||||
VMULPDYrr
|
||||
VMULSDrm_Int
|
||||
VMULSDrr_Int
|
||||
VMULSSrr_Int
|
||||
VSUBPDYrm
|
||||
VSUBSDrm_Int
|
||||
VSUBSDrr_Int
|
||||
VSUBSSrr_Int
|
||||
|
||||
MOV64mr (store)
|
||||
MOV32rm
|
||||
MOV64rm
|
||||
VMOVSD??? mem_xmm
|
||||
VMOVSD??? xmm_mem
|
||||
|
||||
LEA32r <-- which ones?
|
||||
LEA64r <-- which ones?
|
@@ -1,51 +0,0 @@
|
||||
add-r32_imd
|
||||
add-r64_imd
|
||||
inc-r64
|
||||
mov-mem_r64
|
||||
mov-r32_imd
|
||||
movslq-r64_r32
|
||||
sub-r32_imd
|
||||
vaddpd-avx
|
||||
vaddsd-xmm_xmm_xmm
|
||||
vaddss-xmm_xmm_xmm
|
||||
vcvtsi2ss-xmm_xmm_r32
|
||||
|
||||
vcvtss2si-r32_xmm
|
||||
vfmadd213pd-avx
|
||||
vfmadd213pd-sse
|
||||
vfmadd213ps-avx
|
||||
vfmadd213ps-sse
|
||||
vfmadd213sd
|
||||
vfmadd213ss
|
||||
vinsertf128-ymm_ymm_imd
|
||||
|
||||
vmulpd-ymm_ymm_ymm
|
||||
vmulsd-xmm_xmm_xmm
|
||||
vmulss-xmm_xmm_xmm
|
||||
vsubsd-xmm_xmm_xmm
|
||||
vsubss-xmm_xmm_xmm
|
||||
|
||||
# LEAs:
|
||||
|
||||
lea-r32_mem
|
||||
lea-r64_mem
|
||||
lea-r64_mem2
|
||||
|
||||
# /w mem operand:
|
||||
|
||||
mov-r32_mem
|
||||
mov-r64_mem
|
||||
vmovsd-mem_xmm
|
||||
vmovsd-xmm_mem
|
||||
vaddpd-ymm_ymm_mem
|
||||
vaddsd-xmm_xmm_mem
|
||||
vmulsd-xmm_xmm_mem
|
||||
vsubpd-ymm_ymm_mem
|
||||
vsubsd-xmm_xmm_mem
|
||||
|
||||
# impossible to serialize:
|
||||
|
||||
cmp-r32_r32
|
||||
cmp-r64_imd
|
||||
cmp-r64_r64
|
||||
cmp-r32_mem
|
Reference in New Issue
Block a user