Files
OSACA/osaca/db_interface.py
2021-08-26 16:58:19 +02:00

646 lines
24 KiB
Python
Executable File

#!/usr/bin/env python3
import math
import os
import re
import sys
import warnings
from collections import OrderedDict
import ruamel.yaml
from osaca.semantics import MachineModel
def sanity_check(arch: str, verbose=False, internet_check=False, output_file=sys.stdout):
    """
    Run the sanity checks for the database of one micro-architecture and print a report.

    The uarch DB is checked for missing throughput, latency and port pressure values, for
    instruction forms which might miss an ISA DB entry, for duplicates and for malformed
    operands. The matching ISA DB is checked for duplicates and for instruction forms which
    do not exist in the uarch DB.

    :param arch: micro-arch key to define DB to check
    :type arch: str
    :param verbose: verbose output flag, defaults to `False`
    :type verbose: bool, optional
    :param internet_check: indicates if OSACA should try to look up the src/dst distribution
        on the internet, defaults to False
    :type internet_check: boolean, optional
    :param output_file: output stream specifying where to write output,
        defaults to :class:`sys.stdout`
    :type output_file: stream, optional
    :return: True if no port pressure assignment is missing and no bad operand was found
    """
    # load the uarch machine model and the ISA machine model it refers to
    uarch_model = MachineModel(arch=arch)
    instr_forms = uarch_model["instruction_forms"]
    isa_model = MachineModel(arch="isa/{}".format(uarch_model.get_ISA()))
    total_forms = len(instr_forms)

    # check uarch DB entries
    arch_findings = _check_sanity_arch_db(uarch_model, isa_model, internet_check=internet_check)
    no_tp, no_lt, no_pp, suspicious, dup_arch, bad_ops = arch_findings
    # check ISA DB entries
    dup_isa, isa_only = _check_sanity_isa_db(uarch_model, isa_model)

    # colored output is only used when the report goes to the default stdout stream
    use_colors = bool(output_file == sys.stdout)
    summary = _get_sanity_report(
        total_forms,
        no_tp,
        no_lt,
        no_pp,
        suspicious,
        dup_arch,
        dup_isa,
        isa_only,
        bad_ops,
        verbose=verbose,
        colors=use_colors,
    )
    print(summary, file=output_file)
    # only missing port pressure and bad operands make the check fail
    return not (no_pp or bad_ops)
def import_benchmark_output(arch, bench_type, filepath, output=sys.stdout):
    """
    Import benchmark results from micro-benchmarks into the machine model of `arch`.

    The benchmark file is parsed according to `bench_type`, every parsed instruction form is
    set in the machine model and the resulting model is dumped.

    :param arch: target architecture key
    :type arch: str
    :param bench_type: key for defining type of benchmark output (``ibench`` or ``asmbench``)
    :type bench_type: str
    :param filepath: filepath to the output file
    :type filepath: str
    :param output: output stream to dump, defaults to sys.stdout; if `None`, the dump
        returned by the machine model is printed instead
    :type output: stream
    :raises AssertionError: if `filepath` does not exist
    :raises ValueError: if `bench_type` is not supported
    """
    # NOTE(review): this existence check is an assert and therefore disabled under `python -O`
    assert os.path.exists(filepath)
    if bench_type not in ("ibench", "asmbench"):
        raise ValueError("Benchmark type is not supported.")

    with open(filepath, "r") as bench_file:
        raw_lines = bench_file.readlines()

    model = MachineModel(arch)
    parse_output = _get_ibench_output if bench_type == "ibench" else _get_asmbench_output
    new_entries = parse_output(raw_lines, model.get_ISA())

    # write entries to DB
    for new_entry in new_entries.values():
        model.set_instruction_entry(new_entry)

    if output is None:
        print(model.dump())
        return
    model.dump(stream=output)
############################
# HELPERS BENCHMARK IMPORT #
############################
def _get_asmbench_output(input_data, isa):
    """
    Parse asmbench output in the format

    1 MNEMONIC[-OP1[_OP2][...]]
    2 Latency: X cycles
    3 Throughput: Y cycles
    4

    and create one database entry per 4 lines of input_data.

    The blank 4th line may be missing for the very last record, and records consisting only of
    blank lines are skipped. Any other deviation from the format is reported on stderr and
    stops the import at that record; entries parsed before it are still returned.

    If a measurement does not pass `_validate_measurement`, a warning is emitted and the entry
    is stored with `None` for that value.

    :param input_data: lines of the asmbench output file
    :type input_data: list of str
    :param str isa: ISA of target architecture (x86, AArch64, ...)
    :return: dictionary with all new db_entries, keyed by the instruction form string
    """
    db_entries = {}
    for i in range(0, len(input_data), 4):
        # slicing never raises, even if the file ends in the middle of a record
        record = input_data[i : i + 4]
        if not any(line.strip() for line in record):
            # nothing but blank padding (e.g. trailing newlines at the end of the file)
            continue
        has_measurements = len(record) >= 3 and record[0].strip() != ""
        separator_ok = len(record) < 4 or record[3].strip() == ""
        if not (has_measurements and separator_ok):
            print(
                "asmbench output not in the correct format! Format must be: ",
                file=sys.stderr,
            )
            print(
                "-------------\nMNEMONIC[-OP1[_OP2][...]]\nLatency: X cycles\n"
                "Throughput: Y cycles\n\n-------------",
                file=sys.stderr,
            )
            print(
                # integer division keeps the entry number an int ("Entry 2", not "Entry 2.0")
                "Entry {} and all further entries won't be added.".format(i // 4 + 1),
                file=sys.stderr,
            )
            break

        i_form = record[0].strip()
        name_parts = i_form.split("-")
        mnemonic = name_parts[0]
        # the operand part is optional: a bare mnemonic yields an empty operand list
        has_operands = len(name_parts) > 1 and name_parts[1] != ""
        operand_codes = name_parts[1].split("_") if has_operands else []
        operands = [_create_db_operand(op, isa) for op in operand_codes]
        entry = {
            "name": mnemonic,
            "operands": operands,
            # the measured value is the second whitespace-separated token of the line
            "throughput": _validate_measurement(float(record[2].split()[1]), "tp"),
            "latency": _validate_measurement(float(record[1].split()[1]), "lt"),
            "port_pressure": None,
        }
        if not entry["throughput"] or not entry["latency"]:
            warnings.warn(
                "Your measurement for {} looks suspicious".format(i_form)
                + " and was not added. Please inspect your benchmark."
            )
        db_entries[i_form] = entry
    return db_entries
def _get_ibench_output(input_data, isa):
    """
    Parse the standard output of ibench and collect the instruction forms for the DB.

    For every line, the text before the first ``:`` is taken as the instruction identifier.
    Its first two ``-`` separated parts are the mnemonic and the ``_`` separated operand
    codes, which together form the key of the entry. The measured value is the second
    whitespace-separated token of the line and is stored as throughput if the identifier
    contains ``TP`` and as latency if it contains ``LT``. Lines containing
    ``Using frequency`` and blank lines are skipped.

    If a measurement does not pass `_validate_measurement`, a warning is emitted and the
    value is stored as `None`.

    :param input_data: lines of the ibench output
    :type input_data: list of str
    :param str isa: ISA of target architecture
    :return: dictionary with all new db_entries, keyed by ``MNEMONIC-OPERANDS``
    """
    db_entries = {}
    for line in input_data:
        # lines read from a file keep their newline, so test the stripped content
        if "Using frequency" in line or not line.strip():
            continue
        instruction = line.split(":")[0]
        key = "-".join(instruction.split("-")[:2])
        if key in db_entries:
            # add only TP/LT value
            entry = db_entries[key]
        else:
            mnemonic = instruction.split("-")[0]
            operands = instruction.split("-")[1].split("_")
            operands = [_create_db_operand(op, isa) for op in operands]
            entry = {
                "name": mnemonic,
                "operands": operands,
                "throughput": None,
                "latency": None,
                "port_pressure": None,
            }
        if "TP" in instruction:
            entry["throughput"] = _validate_measurement(float(line.split()[1]), "tp")
            if not entry["throughput"]:
                warnings.warn(
                    "Your THROUGHPUT measurement for {} looks suspicious".format(key)
                    + " and was not added. Please inspect your benchmark."
                )
        elif "LT" in instruction:
            entry["latency"] = _validate_measurement(float(line.split()[1]), "lt")
            if not entry["latency"]:
                warnings.warn(
                    "Your LATENCY measurement for {} looks suspicious".format(key)
                    + " and was not added. Please inspect your benchmark."
                )
        db_entries[key] = entry
    return db_entries
def _validate_measurement(measurement, mode):
    """
    Check a measured value for plausibility and snap it to the value it is assumed to mean.

    In mode ``lt`` (latency) the value is accepted if it is at most 5% above its floor or at
    least 95% of its ceiling; it is then rounded to the nearest integer and returned as float.
    In mode ``tp`` (throughput) the value is accepted if it lies within +-5% of ``1/n`` for an
    integer ``n`` from 1 to 10; the reciprocal rounded to 5 decimal places is returned.

    :param measurement: measured value
    :type measurement: float
    :param mode: ``lt`` for latency or ``tp`` for throughput
    :type mode: str
    :return: the corrected value, or `None` if the value is not accepted or `mode` is unknown
    """
    if mode == "lt":
        lower_int = math.floor(measurement)
        upper_int = math.ceil(measurement)
        if lower_int * 1.05 >= measurement or upper_int * 0.95 <= measurement:
            # Value is probably correct, so round it to the estimated value
            return float(round(measurement))
        return None
    if mode == "tp":
        # Check reciprocals only if it is a throughput value
        for divisor in range(1, 11):
            candidate = 1 / divisor
            if candidate * 0.95 <= measurement <= candidate * 1.05:
                # Value is probably correct, so round it to the estimated value
                return round(candidate, 5)
    # No matching value found, we assume the measurement is incorrect
    return None
def _create_db_operand(operand, isa):
    """
    Get DB operand by input string and ISA.

    :param str operand: operand code as used in the benchmark output
    :param str isa: ISA key, either ``aarch64`` or ``x86``
    :return: DB operand dictionary, or `None` if `isa` is neither of the two keys
    """
    if isa == "x86":
        return _create_db_operand_x86(operand)
    if isa == "aarch64":
        return _create_db_operand_aarch64(operand)
    # any other ISA key is not handled here
    return None
def _create_db_operand_aarch64(operand):
    """
    Get DB operand for AArch64 by operand string.

    Supported operand codes:

    * ``i``: immediate
    * one of ``w``, ``x``, ``b``, ``h``, ``s``, ``d``, ``q``: register with that prefix
    * ``v`` optionally followed by a shape character: vector register, shape defaults to ``d``
    * ``m`` followed by any of ``b`` (base), ``o`` (offset), ``i`` (index), ``s`` (scale 8),
      ``r`` (pre-indexed), ``p`` (post-indexed): memory operand

    :param str operand: operand code
    :return: DB operand dictionary
    :raises ValueError: if `operand` is not a valid operand code
    """
    if operand == "i":
        return {"class": "immediate", "imd": "int"}
    # compare against the individual prefixes; a substring test on "wxbhsdq" would also
    # accept the empty string and combinations like "wx"
    if operand in ("w", "x", "b", "h", "s", "d", "q"):
        return {"class": "register", "prefix": operand}
    if operand.startswith("v"):
        return {
            "class": "register",
            "prefix": "v",
            "shape": operand[1:2] or "d",
        }
    if operand.startswith("m"):
        return {
            "class": "memory",
            "base": "x" if "b" in operand else None,
            "offset": "imd" if "o" in operand else None,
            "index": "gpr" if "i" in operand else None,
            "scale": 8 if "s" in operand else 1,
            "pre-indexed": "r" in operand,
            "post-indexed": "p" in operand,
        }
    raise ValueError("Parameter {} is not a valid operand code".format(operand))
def _create_db_operand_x86(operand):
    """
    Get DB operand for x86 by operand string.

    Supported operand codes:

    * ``r``: general purpose register
    * ``x``, ``y``, ``z``: ``xmm``, ``ymm`` or ``zmm`` register
    * ``i``: immediate
    * ``m`` followed by any of ``b`` (base), ``o`` (offset), ``i`` (index), ``s`` (scale 8):
      memory operand

    :param str operand: operand code
    :return: DB operand dictionary
    :raises ValueError: if `operand` is not a valid operand code
    """
    if operand == "r":
        return {"class": "register", "name": "gpr"}
    # compare against the individual codes; a substring test on "xyz" would also accept
    # the empty string and combinations like "xy"
    if operand in ("x", "y", "z"):
        return {"class": "register", "name": operand + "mm"}
    if operand == "i":
        return {"class": "immediate", "imd": "int"}
    if operand.startswith("m"):
        return {
            "class": "memory",
            "base": "gpr" if "b" in operand else None,
            "offset": "imd" if "o" in operand else None,
            "index": "gpr" if "i" in operand else None,
            "scale": 8 if "s" in operand else 1,
        }
    raise ValueError("Parameter {} is not a valid operand code".format(operand))
########################
# HELPERS SANITY CHECK #
########################
def _scrape_from_felixcloutier(mnemonic):
    """
    Scrape src/dst information from felixcloutier website and return information for user.

    The page of the mnemonic is requested first. If it does not exist (HTTP 404), the index
    page is searched for a link whose text equals the upper-case mnemonic and the first match
    is used instead. Exits the program if BeautifulSoup is not installed.

    :param str mnemonic: instruction mnemonic to look up
    :return: tuple of (suspicious flag, operand access information as string); the flag is
        only `False` if access information was found and no operand is both read and written
    """
    import requests

    try:
        from bs4 import BeautifulSoup
    except ImportError:
        print(
            "Module BeautifulSoup not installed. Fetching instruction form information "
            "online requires BeautifulSoup.\nUse 'pip install bs4' for installation.",
            file=sys.stderr,
        )
        sys.exit(1)

    base_url = "https://www.felixcloutier.com/x86/"
    index_url = "https://www.felixcloutier.com/x86/index.html"

    def operands_from_page(html):
        """Return the src/dst list of the operand encoding table of a page, if there is one."""
        heading = BeautifulSoup(html, "html.parser").find(
            "h2", attrs={"id": "instruction-operand-encoding"}
        )
        if not heading:
            # no operand encoding found, nothing to report
            return []
        return _get_src_dst_from_table(heading.findNextSibling())

    operands = []
    # GET website
    response = requests.get(url=base_url + mnemonic.lower())
    if response.status_code == 200:
        # Found result
        operands = operands_from_page(response.text)
    elif response.status_code == 404:
        # Check for alternative href
        index_page = BeautifulSoup(requests.get(url=index_url).text, "html.parser")
        wanted_text = mnemonic.upper()
        alternatives = [link for link in index_page.findAll("a") if link.text == wanted_text]
        if alternatives:
            # alternative(s) found, take first one (the first two href characters are dropped)
            alternative_url = base_url + alternatives[0].attrs["href"][2:]
            operands = operands_from_page(requests.get(url=alternative_url).text)

    # an operand which is read and written keeps the instruction form suspicious
    read_and_written = any("r" in access and "w" in access for access in operands)
    suspicious = not operands or read_and_written
    return (suspicious, " ".join(operands))
def _get_src_dst_from_table(table, num_operands=2):
    """
    Prettify bs4 table object to string for user.

    The cells of the first table row are used as column names (lower-cased, whitespace
    removed). The first remaining row which has exactly `num_operands` operand columns is
    selected and the parenthesized part of each of its operand cells is collected.

    :param table: bs4 table object of the operand encoding
    :param int num_operands: number of operands the selected row must have, defaults to 2
    :return: list of access strings in reversed operand order, or an empty list if no row
        with `num_operands` operands exists
    """
    # Parse table
    column_names = ["".join(cell.string.lower().split()) for cell in table.find("tr").findAll("td")]
    rows = []
    for table_row in table.findAll("tr")[1:]:
        # cells containing "NA" are left out of the row
        rows.append(
            {
                column_names[col]: cell.string
                for col, cell in enumerate(table_row.findAll("td"))
                if cell.string != "NA"
            }
        )

    # Get only the first instruction form with the requested number of operands
    operand_counts = [_get_number_of_operands(parsed_row) for parsed_row in rows]
    if num_operands not in operand_counts:
        return []
    selected = rows[operand_counts.index(num_operands)]

    reads_writes = []
    for position in range(1, num_operands + 1):
        match = re.search(r"(\([^\(\)]+\))", selected["operand{}".format(position)])
        # no parentheses (probably immediate operand), assume READ
        reads_writes.append("".join(match.group(0).split()) if match else "(r)")
    # reverse reads_writes for AT&T syntax
    return reads_writes[::-1]
def _get_number_of_operands(data_dict_row):
    """
    Return the number of `Operand [X]` attributes in row.

    Keys are compared case-insensitively and without whitespace; only operands 1 to 4 are
    counted.

    :param data_dict_row: row whose keys are the column names
    :return: number of operand columns found
    """
    normalized_keys = {"".join(key.split()).lower() for key in data_dict_row}
    return sum(1 for idx in range(1, 5) if "operand{}".format(idx) in normalized_keys)
def _check_sanity_arch_db(arch_mm, isa_mm, internet_check=True):
    """
    Do sanity check for ArchDB by given ISA.

    :param arch_mm: machine model of the micro-architecture to check
    :param isa_mm: machine model of the corresponding ISA
    :param internet_check: if True, instruction forms with two operands which have no ISA DB
        entry are looked up online before they are marked as suspicious, defaults to True
    :type internet_check: bool, optional
    :return: tuple of lists of instruction forms: (missing throughput, missing latency,
        missing port pressure, suspicious instructions, duplicates, bad operands). The
        bad-operand list contains an instruction form once per malformed operand.
    """
    # prefixes of instruction forms which we assume to have non-default operands
    suspicious_prefixes_by_isa = {
        "x86": ["vfm", "fm"],
        "aarch64": ["fml", "ldp", "stp", "str"],
    }
    # an ISA without known prefixes gets an empty list instead of an unbound variable
    suspicious_prefixes = suspicious_prefixes_by_isa.get(arch_mm.get_ISA().lower(), [])

    # returned lists
    missing_throughput = []
    missing_latency = []
    missing_port_pressure = []
    suspicious_instructions = []
    duplicate_instr_arch = []
    bad_operand = []

    for instr_form in arch_mm["instruction_forms"]:
        # check value in DB entry
        if instr_form["throughput"] is None:
            missing_throughput.append(instr_form)
        if instr_form["latency"] is None:
            missing_latency.append(instr_form)
        if instr_form["port_pressure"] is None:
            missing_port_pressure.append(instr_form)

        name = instr_form["name"]
        operands = instr_form["operands"]
        name_lower = name.lower()

        # check entry against ISA DB: forms with a suspicious prefix need an ISA DB entry
        for prefix in suspicious_prefixes:
            if name_lower.startswith(prefix) and isa_mm.get_instruction(name, operands) is None:
                # if not in ISA DB, mark them as suspicious
                suspicious_instructions.append(instr_form)

        # instr forms with exactly 2 operands might need an ISA DB entry due to src_reg operands
        if (
            len(operands) == 2
            and "mov" not in name_lower
            and not name_lower.startswith("j")
            and instr_form not in suspicious_instructions
            and isa_mm.get_instruction(name, operands) is None
        ):
            # validate with data from internet if connected flag is set
            if internet_check:
                is_susp, info_string = _scrape_from_felixcloutier(name)
                if is_susp:
                    instr_form["note"] = info_string
                    suspicious_instructions.append(instr_form)
            else:
                suspicious_instructions.append(instr_form)

        # check for duplicates in DB
        if arch_mm._check_for_duplicate(name, operands):
            duplicate_instr_arch.append(instr_form)

        # Check operands
        for operand in operands:
            if operand["class"] == "register" and not ("name" in operand or "prefix" in operand):
                # Missing 'name' or 'prefix' key
                bad_operand.append(instr_form)
            elif operand["class"] == "memory" and (
                "base" not in operand
                or "offset" not in operand
                or "index" not in operand
                or "scale" not in operand
            ):
                # Missing at least one key necessary for memory operands
                bad_operand.append(instr_form)
            elif operand["class"] == "immediate" and "imd" not in operand:
                # Missing 'imd' key
                bad_operand.append(instr_form)

    # every duplicate shows up once per copy --> uniquify by full instruction name, going
    # through the list back to front
    seen_names = set()
    unique_duplicates = []
    for instr_form in reversed(duplicate_instr_arch):
        full_name = _get_full_instruction_name(instr_form).lower()
        if full_name not in seen_names:
            seen_names.add(full_name)
            unique_duplicates.append(instr_form)

    return (
        missing_throughput,
        missing_latency,
        missing_port_pressure,
        suspicious_instructions,
        unique_duplicates,
        bad_operand,
    )
def _check_sanity_isa_db(arch_mm, isa_mm):
    """
    Do sanity check for an ISA DB.

    :param arch_mm: machine model of the micro-architecture
    :param isa_mm: machine model of the ISA to check
    :return: tuple of (duplicate instruction forms in the ISA DB, instruction forms which
        exist in the ISA DB but not in the uarch DB)
    """
    only_in_isa = []
    duplicates = []
    for isa_form in isa_mm["instruction_forms"]:
        form_name, form_operands = isa_form["name"], isa_form["operands"]
        # check if instr is missing in arch DB
        if arch_mm.get_instruction(form_name, form_operands) is None:
            only_in_isa.append(isa_form)
        # check for duplicates
        if isa_mm._check_for_duplicate(form_name, form_operands):
            duplicates.append(isa_form)

    # every entry exists twice --> uniquify: take the forms from the back and keep one only
    # if no equal form is left in front of it
    unique_duplicates = []
    while duplicates:
        candidate = duplicates.pop()
        if candidate not in duplicates:
            unique_duplicates.append(candidate)
    return unique_duplicates, only_in_isa
def _get_sanity_report(
    total,
    m_tp,
    m_l,
    m_pp,
    suspic_instr,
    dup_arch,
    dup_isa,
    only_isa,
    bad_operands,
    verbose=False,
    colors=False,
):
    """
    Get sanity summary report.

    :param int total: total number of instruction forms in the uarch DB
    :param m_tp: instruction forms without throughput value
    :param m_l: instruction forms without latency value
    :param m_pp: instruction forms without port pressure assignment
    :param suspic_instr: instruction forms which might miss an ISA DB entry
    :param dup_arch: duplicate instruction forms in the uarch DB
    :param dup_isa: duplicate instruction forms in the ISA DB
    :param only_isa: instruction forms existing in the ISA DB but not in the uarch DB
    :param bad_operands: instruction forms with bad operands
    :param verbose: if True, append the verbose listing of all reported instruction forms
    :type verbose: bool, optional
    :param colors: if True, use colors in the verbose listing
    :type colors: bool, optional
    :return: report as string
    """

    def share(forms):
        """Return (percentage, count, total) of the given forms for the summary lines."""
        count = len(forms)
        # an empty DB has no shares to compute; report 0% instead of dividing by zero
        percentage = round(100 * count / total) if total else 0
        return percentage, count, total

    # non-verbose summary
    parts = [
        "SUMMARY\n----------------------\n",
        "{}% ({}/{}) of instruction forms have no throughput value.\n".format(*share(m_tp)),
        "{}% ({}/{}) of instruction forms have no latency value.\n".format(*share(m_l)),
        "{}% ({}/{}) of instruction forms have no port pressure assignment.\n".format(
            *share(m_pp)
        ),
        "{}% ({}/{}) of instruction forms might miss an ISA DB entry.\n".format(
            *share(suspic_instr)
        ),
        "{} duplicate instruction forms in uarch DB.\n".format(len(dup_arch)),
        "{} duplicate instruction forms in ISA DB.\n".format(len(dup_isa)),
        "{} instruction forms in ISA DB are not referenced by instruction ".format(len(only_isa))
        + "forms in uarch DB.\n",
        "{} bad operands found in uarch DB\n".format(len(bad_operands)),
        "----------------------\n",
    ]
    # verbose version
    if verbose:
        parts.append(
            _get_sanity_report_verbose(
                total,
                m_tp,
                m_l,
                m_pp,
                suspic_instr,
                dup_arch,
                dup_isa,
                only_isa,
                bad_operands,
                colors=colors,
            )
        )
    return "".join(parts)
def _get_sanity_report_verbose(
    total,
    m_tp,
    m_l,
    m_pp,
    suspic_instr,
    dup_arch,
    dup_isa,
    only_isa,
    bad_operands,
    colors=False,
):
    """
    Get the verbose part of the sanity report with all missing instruction forms.

    Every non-empty list gets a heading followed by one line per instruction form, sorted by
    instruction name. Suspicious instruction forms additionally show their ``note`` if they
    have one. `total` is not used here.

    :param colors: if True, wrap every instruction form line in ANSI color codes
    :type colors: bool, optional
    :return: verbose report as string, empty if all lists are empty
    """

    def ansi(sequence):
        """Return the escape sequence if colors are enabled, an empty string otherwise."""
        return sequence if colors else ""

    bright_cyan = ansi("\033[1;36;1m")
    bright_blue = ansi("\033[1;34;1m")
    bright_red = ansi("\033[1;31;1m")
    bright_magenta = ansi("\033[1;35;1m")
    bright_yellow = ansi("\033[1;33;1m")
    cyan = ansi("\033[36m")
    yellow = ansi("\033[33m")
    reset = ansi("\033[0m")

    bad_operands_heading = (
        "{} bad operands found in uarch DB:\n".format(len(bad_operands)) if bad_operands else ""
    )
    # (heading, instruction forms, color, show note) in the order of the report
    sections = (
        ("Instruction forms without throughput value:\n", m_tp, bright_blue, False),
        ("Instruction forms without latency value:\n", m_l, bright_red, False),
        ("Instruction forms without port pressure assignment:\n", m_pp, bright_magenta, False),
        (
            "Instruction forms which might miss an ISA DB entry:\n",
            suspic_instr,
            bright_cyan,
            True,
        ),
        ("Duplicate instruction forms in uarch DB:\n", dup_arch, yellow, False),
        ("Duplicate instruction forms in ISA DB:\n", dup_isa, bright_yellow, False),
        (
            "Instruction forms existing in ISA DB but not in uarch DB:\n",
            only_isa,
            cyan,
            False,
        ),
        (bad_operands_heading, bad_operands, bright_red, False),
    )

    chunks = []
    for heading, forms, color, show_note in sections:
        if forms:
            chunks.append(heading)
        for instr_form in sorted(forms, key=lambda form: form["name"]):
            note = ""
            if show_note and "note" in instr_form:
                note = " -- " + instr_form["note"]
            chunks.append(
                "{}{}{}{}\n".format(color, _get_full_instruction_name(instr_form), note, reset)
            )
    return "".join(chunks)
###################
# GENERIC HELPERS #
###################
def _get_full_instruction_name(instruction_form):
    """
    Get full instruction form name/identifier string out of given instruction form.

    The identifier is the instruction name followed by a comma separated list of operands,
    each written as ``class(key:value,...)`` with all operand attributes except ``class``.

    :param instruction_form: instruction form with ``name`` and ``operands``
    :return: identifier string
    """
    rendered_operands = []
    for operand in instruction_form["operands"]:
        attributes = ",".join(
            key + ":" + str(operand[key]) for key in operand if key != "class"
        )
        rendered_operands.append("{}({})".format(operand["class"], attributes))
    return "{} {}".format(instruction_form["name"], ",".join(rendered_operands))
def __represent_none(self, data):
    """
    Get YAML None representation.

    :param self: representer the function is registered on
    :param data: value to represent; not used, the result is always the null scalar ``~``
    :return: result of ``represent_scalar`` for the YAML null tag
    """
    null_tag = "tag:yaml.org,2002:null"
    null_text = "~"
    return self.represent_scalar(null_tag, null_text)
def _create_yaml_object():
    """
    Create YAML module with None representation.

    :return: ruamel YAML object which uses `__represent_none` for `None` values
    """
    none_type = type(None)
    yaml = ruamel.yaml.YAML()
    yaml.representer.add_representer(none_type, __represent_none)
    return yaml
def __dump_data_to_yaml(filepath, data):
    """
    Dump data to YAML file at given filepath.

    The file is written in three parts: all meta data first, then the port model scheme and
    finally the instruction forms.

    :param filepath: path of the YAML file to write; an existing file is overwritten
    :param data: machine model data containing at least ``instruction_forms`` and
        ``port_model_scheme``
    :raises KeyError: if one of the two keys is missing in `data`
    """
    # NOTE(review): `ruamel.yaml.dump` is the module-level ruamel API; confirm that the
    # installed ruamel.yaml version still provides it.
    # first add 'normal' meta data in the right order (no ordered dict yet)
    meta_data = dict(data)
    for separately_dumped in ("instruction_forms", "port_model_scheme"):
        del meta_data[separately_dumped]
    scheme_section = {"port_model_scheme": data["port_model_scheme"]}
    forms_section = {"instruction_forms": data["instruction_forms"]}

    with open(filepath, "w") as yaml_file:
        ruamel.yaml.dump(meta_data, yaml_file, allow_unicode=True)
    with open(filepath, "a") as yaml_file:
        # now add port model scheme in |-scheme for better readability
        ruamel.yaml.dump(scheme_section, yaml_file, allow_unicode=True, default_style="|")
        # finally, add instruction forms
        ruamel.yaml.dump(forms_section, yaml_file, allow_unicode=True)