OSACA/osaca/db_interface.py

#!/usr/bin/env python3

import math
import os
import re
import sys
import warnings
from collections import OrderedDict

import ruamel.yaml

from osaca.semantics import MachineModel


def sanity_check(arch: str, verbose=False, internet_check=False, output_file=sys.stdout):
    """
    Checks the database for missing TP/LT values, instructions might missing int the ISA DB and
    duplicate instructions.

    :param arch: micro-arch key to define DB to check
    :type arch: str
    :param verbose: verbose output flag, defaults to `False`
    :type verbose: bool, optional
    :param internet_check: indicates if OSACA should try to look up the src/dst distribution in the
                           internet, defaults to False
    :type internet_check: boolean, optional
    :param output_file: output stream specifying where to write output,
                        defaults to :class:`sys.stdout`
    :type output_file: stream, optional

    :return: True if everything checked out
    """
    # load arch machine model
    arch_mm = MachineModel(arch=arch)
    data = arch_mm["instruction_forms"]
    # load isa machine model
    isa = arch_mm.get_ISA()
    isa_mm = MachineModel(arch="isa/{}".format(isa))
    num_of_instr = len(data)

    # check arch DB entries
    (
        missing_throughput,
        missing_latency,
        missing_port_pressure,
        suspicious_instructions,
        duplicate_instr_arch,
        bad_operand,
    ) = _check_sanity_arch_db(arch_mm, isa_mm, internet_check=internet_check)
    # check ISA DB entries
    duplicate_instr_isa, only_in_isa = _check_sanity_isa_db(arch_mm, isa_mm)

    report = _get_sanity_report(
        num_of_instr,
        missing_throughput,
        missing_latency,
        missing_port_pressure,
        suspicious_instructions,
        duplicate_instr_arch,
        duplicate_instr_isa,
        only_in_isa,
        bad_operand,
        verbose=verbose,
        colors=True if output_file == sys.stdout else False,
    )
    print(report, file=output_file)

    return not any([missing_port_pressure, bad_operand])


def import_benchmark_output(arch, bench_type, filepath, output=sys.stdout):
    """
    Import benchmark results from micro-benchmarks.

    :param arch: target architecture key
    :type arch: str
    :param bench_type: key for defining type of benchmark output
    :type bench_type: str
    :param filepath: filepath to the output file
    :type filepath: str
    :param output: output stream to dump, defaults to sys.stdout
    :type output: stream
    """
    supported_bench_outputs = ["ibench", "asmbench"]
    assert os.path.exists(filepath)
    if bench_type not in supported_bench_outputs:
        raise ValueError("Benchmark type is not supported.")
    with open(filepath, "r") as f:
        input_data = f.readlines()
    db_entries = None
    mm = MachineModel(arch)
    if bench_type == "ibench":
        db_entries = _get_ibench_output(input_data, mm.get_ISA())
    elif bench_type == "asmbench":
        db_entries = _get_asmbench_output(input_data, mm.get_ISA())
    # write entries to DB
    for entry in db_entries:
        mm.set_instruction_entry(db_entries[entry])
    if output is None:
        print(mm.dump())
    else:
        mm.dump(stream=output)


##################
# HELPERS IBENCH #
##################


def _get_asmbench_output(input_data, isa):
    """
    Parse asmbench output in the format

    1 MNEMONIC[-OP1[_OP2][...]]
    2 Latency: X cycles
    3 Throughput: Y cycles
    4

    and creates per 4 lines in the input_data one entry in the database.

    :param str input_data: content of asmbench output file
    :param str isa: ISA of target architecture (x86, AArch64, ...)
    : return: dictionary with all new db_entries
    """
    db_entries = {}
    for i in range(0, len(input_data), 4):
        if input_data[i + 3].strip() != "":
            print(
                "asmbench output not in the correct format! Format must be: ",
                file=sys.stderr,
            )
            print(
                "-------------\nMNEMONIC[-OP1[_OP2][...]]\nLatency: X cycles\n"
                "Throughput: Y cycles\n\n-------------",
                file=sys.stderr,
            )
            print(
                "Entry {} and all further entries won't be added.".format((i / 4) + 1),
                file=sys.stderr,
            )
            break
        else:
            i_form = input_data[i].strip()
            mnemonic = i_form.split("-")[0]
            operands = i_form.split("-")[1].split("_")
            operands = [_create_db_operand(op, isa) for op in operands]
            entry = {
                "name": mnemonic,
                "operands": operands,
                "throughput": _validate_measurement(float(input_data[i + 2].split()[1]), "tp"),
                "latency": _validate_measurement(float(input_data[i + 1].split()[1]), "lt"),
                "port_pressure": None,
            }
            if not entry["throughput"] or not entry["latency"]:
                warnings.warn(
                    "Your measurement for {} looks suspicious".format(i_form)
                    + " and was not added. Please inspect your benchmark."
                )
            db_entries[i_form] = entry
    return db_entries


def _get_ibench_output(input_data, isa):
    """Parse the standard output of ibench and add instructions to DB."""
    db_entries = {}
    for line in input_data:
        if "Using frequency" in line or len(line) == 0:
            continue
        instruction = line.split(":")[0]
        key = "-".join(instruction.split("-")[:2])
        if key in db_entries:
            # add only TP/LT value
            entry = db_entries[key]
        else:
            mnemonic = instruction.split("-")[0]
            operands = instruction.split("-")[1].split("_")
            operands = [_create_db_operand(op, isa) for op in operands]
            entry = {
                "name": mnemonic,
                "operands": operands,
                "throughput": None,
                "latency": None,
                "port_pressure": None,
            }
        if "TP" in instruction:
            entry["throughput"] = _validate_measurement(float(line.split()[1]), "tp")
            if not entry["throughput"]:
                warnings.warn(
                    "Your THROUGHPUT measurement for {} looks suspicious".format(key)
                    + " and was not added. Please inspect your benchmark."
                )
        elif "LT" in instruction:
            entry["latency"] = _validate_measurement(float(line.split()[1]), "lt")
            if not entry["latency"]:
                warnings.warn(
                    "Your LATENCY measurement for {} looks suspicious".format(key)
                    + " and was not added. Please inspect your benchmark."
                )
        db_entries[key] = entry
    return db_entries


def _validate_measurement(measurement, mode):
    """
    Check if latency has a maximum deviation of 0.05% and throughput is a reciprocal of a
    an integer number.
    """
    if mode == "lt":
        if (
            math.floor(measurement) * 1.05 >= measurement
            or math.ceil(measurement) * 0.95 <= measurement
        ):
            # Value is probably correct, so round it to the estimated value
            return float(round(measurement))
        # Check reciprocal only if it is a throughput value
    elif mode == "tp":
        reciprocals = [1 / x for x in range(1, 11)]
        for reci in reciprocals:
            if reci * 0.95 <= measurement <= reci * 1.05:
                # Value is probably correct, so round it to the estimated value
                return round(reci, 5)
    # No value close to an integer or its reciprocal found, we assume the
    # measurement is incorrect
    return None


def _create_db_operand(operand, isa):
    """Get DB operand by input string and ISA."""
    if isa == "aarch64":
        return _create_db_operand_aarch64(operand)
    elif isa == "x86":
        return _create_db_operand_x86(operand)


def _create_db_operand_aarch64(operand):
    """Get DB operand for AArch64 by operand string."""
    if operand == "i":
        return {"class": "immediate", "imd": "int"}
    elif operand in "wxbhsdq":
        return {"class": "register", "prefix": operand}
    elif operand.startswith("v"):
        return {
            "class": "register",
            "prefix": "v",
            "shape": operand[1:2] if operand[1:2] != "" else "d",
        }
    elif operand.startswith("m"):
        return {
            "class": "memory",
            "base": "x" if "b" in operand else None,
            "offset": "imd" if "o" in operand else None,
            "index": "gpr" if "i" in operand else None,
            "scale": 8 if "s" in operand else 1,
            "pre-indexed": True if "r" in operand else False,
            "post-indexed": True if "p" in operand else False,
        }
    else:
        raise ValueError("Parameter {} is not a valid operand code".format(operand))


def _create_db_operand_x86(operand):
    """Get DB operand for AArch64 by operand string."""
    if operand == "r":
        return {"class": "register", "name": "gpr"}
    elif operand in "xyz":
        return {"class": "register", "name": operand + "mm"}
    elif operand == "i":
        return {"class": "immediate", "imd": "int"}
    elif operand.startswith("m"):
        return {
            "class": "memory",
            "base": "gpr" if "b" in operand else None,
            "offset": "imd" if "o" in operand else None,
            "index": "gpr" if "i" in operand else None,
            "scale": 8 if "s" in operand else 1,
        }
    else:
        raise ValueError("Parameter {} is not a valid operand code".format(operand))


########################
# HELPERS SANITY CHECK #
########################


def _scrape_from_felixcloutier(mnemonic):
    """Scrape src/dst information from felixcloutier website and return information for user."""
    import requests

    try:
        from bs4 import BeautifulSoup
    except ImportError:
        print(
            "Module BeautifulSoup not installed. Fetching instruction form information "
            "online requires BeautifulSoup.\nUse 'pip install bs4' for installation.",
            file=sys.stderr,
        )
        sys.exit(1)

    index = "https://www.felixcloutier.com/x86/index.html"
    base_url = "https://www.felixcloutier.com/x86/"
    url = base_url + mnemonic.lower()

    suspicious = True
    operands = []

    # GET website
    r = requests.get(url=url)
    if r.status_code == 200:
        # Found result
        operand_enc = BeautifulSoup(r.text, "html.parser").find(
            "h2", attrs={"id": "instruction-operand-encoding"}
        )
        if operand_enc:
            # operand encoding found, otherwise, no need to mark as suspicous
            table = operand_enc.findNextSibling()
            operands = _get_src_dst_from_table(table)
    elif r.status_code == 404:
        # Check for alternative href
        index = BeautifulSoup(requests.get(url=index).text, "html.parser")
        alternatives = [ref for ref in index.findAll("a") if ref.text == mnemonic.upper()]
        if len(alternatives) > 0:
            # alternative(s) found, take first one
            url = base_url + alternatives[0].attrs["href"][2:]
            operand_enc = BeautifulSoup(requests.get(url=url).text, "html.parser").find(
                "h2", attrs={"id": "instruction-operand-encoding"}
            )
            if operand_enc:
                # operand encoding found, otherwise, no need to mark as suspicous
                table = operand_enc.findNextSibling()
                operands = _get_src_dst_from_table(table)
    if operands:
        # Found src/dst assignment for NUM_OPERANDS
        if not any(["r" in x and "w" in x for x in operands]):
            suspicious = False
    return (suspicious, " ".join(operands))


def _get_src_dst_from_table(table, num_operands=2):
    """Prettify bs4 table object to string for user"""
    # Parse table
    header = ["".join(x.string.lower().split()) for x in table.find("tr").findAll("td")]
    data = table.findAll("tr")[1:]
    data_dict = OrderedDict()
    for i, row in enumerate(data):
        data_dict[i] = {}
        for j, col in enumerate(row.findAll("td")):
            if col.string != "NA":
                data_dict[i][header[j]] = col.string
    # Get only the instruction forms with 2 operands
    num_ops = [_get_number_of_operands(row) for _, row in data_dict.items()]
    if num_operands in num_ops:
        row = data_dict[num_ops.index(num_operands)]
        reads_writes = []
        for i in range(1, num_operands + 1):
            m = re.search(r"(\([^\(\)]+\))", row["operand{}".format(i)])
            if not m:
                # no parentheses (probably immediate operand), assume READ
                reads_writes.append("(r)")
                continue
            reads_writes.append("".join(m.group(0).split()))
        # reverse reads_writes for AT&T syntax
        reads_writes.reverse()
        return reads_writes
    return []


def _get_number_of_operands(data_dict_row):
    """Return the number of `Operand [X]` attributes in row"""
    num = 0
    for i in range(1, 5):
        if "operand{}".format(i) in ["".join(x.split()).lower() for x in data_dict_row]:
            num += 1
    return num


def _check_sanity_arch_db(arch_mm, isa_mm, internet_check=True):
    """Do sanity check for ArchDB by given ISA."""
    # prefixes of instruction forms which we assume to have non-default operands
    suspicious_prefixes_x86 = ["vfm", "fm"]
    suspicious_prefixes_arm = ["fml", "ldp", "stp", "str"]
    # already known to be default-operand instruction forms with 2 operands
    if arch_mm.get_ISA().lower() == "aarch64":
        suspicious_prefixes = suspicious_prefixes_arm
    if arch_mm.get_ISA().lower() == "x86":
        suspicious_prefixes = suspicious_prefixes_x86

    # returned lists
    missing_throughput = []
    missing_latency = []
    missing_port_pressure = []
    suspicious_instructions = []
    duplicate_instr_arch = []
    duplicate_strings = []
    bad_operand = []

    for instr_form in arch_mm["instruction_forms"]:
        # check value in DB entry
        if instr_form["throughput"] is None:
            missing_throughput.append(instr_form)
        if instr_form["latency"] is None:
            missing_latency.append(instr_form)
        if instr_form["port_pressure"] is None:
            missing_port_pressure.append(instr_form)
        # check entry against ISA DB
        for prefix in suspicious_prefixes:
            if instr_form["name"].lower().startswith(prefix):
                # check if instruction in ISA DB
                if isa_mm.get_instruction(instr_form["name"], instr_form["operands"]) is None:
                    # if not, mark them as suspicious and print it on the screen
                    suspicious_instructions.append(instr_form)
        # instr forms with less than 3 operands might need an ISA DB entry due to src_reg operands
        if (
            len(instr_form["operands"]) < 3
            and len(instr_form["operands"]) > 1
            and "mov" not in instr_form["name"].lower()
            and not instr_form["name"].lower().startswith("j")
            and instr_form not in suspicious_instructions
            and isa_mm.get_instruction(instr_form["name"], instr_form["operands"]) is None
        ):
            # validate with data from internet if connected flag is set
            if internet_check:
                is_susp, info_string = _scrape_from_felixcloutier(instr_form["name"])
                if is_susp:
                    instr_form["note"] = info_string
                    suspicious_instructions.append(instr_form)
            else:
                suspicious_instructions.append(instr_form)
        # check for duplicates in DB
        if arch_mm._check_for_duplicate(instr_form["name"], instr_form["operands"]):
            duplicate_instr_arch.append(instr_form)

        # Check operands
        for operand in instr_form["operands"]:
            if operand["class"] == "register" and not ("name" in operand or "prefix" in operand):
                # Missing 'name' key
                bad_operand.append(instr_form)
            elif operand["class"] == "memory" and (
                "base" not in operand
                or "offset" not in operand
                or "index" not in operand
                or "scale" not in operand
            ):
                # Missing at least one key necessary for memory operands
                bad_operand.append(instr_form)
            elif operand["class"] == "immediate" and "imd" not in operand:
                # Missing 'imd' key
                bad_operand.append(instr_form)
    # every entry exists twice --> uniquify
    tmp_list = []
    for _ in range(0, len(duplicate_instr_arch)):
        tmp = duplicate_instr_arch.pop()
        if _get_full_instruction_name(tmp).lower() not in duplicate_strings:
            duplicate_strings.append(_get_full_instruction_name(tmp).lower())
            tmp_list.append(tmp)
    duplicate_instr_arch = tmp_list
    return (
        missing_throughput,
        missing_latency,
        missing_port_pressure,
        suspicious_instructions,
        duplicate_instr_arch,
        bad_operand,
    )


def _check_sanity_isa_db(arch_mm, isa_mm):
    """Do sanity check for an ISA DB."""
    # returned lists
    duplicate_instr_isa = []
    only_in_isa = []

    for instr_form in isa_mm["instruction_forms"]:
        # check if instr is missing in arch DB
        if arch_mm.get_instruction(instr_form["name"], instr_form["operands"]) is None:
            only_in_isa.append(instr_form)
        # check for duplicates
        if isa_mm._check_for_duplicate(instr_form["name"], instr_form["operands"]):
            duplicate_instr_isa.append(instr_form)
    # every entry exists twice --> uniquify
    tmp_list = []
    for i in range(0, len(duplicate_instr_isa)):
        tmp = duplicate_instr_isa.pop()
        if tmp not in duplicate_instr_isa:
            tmp_list.append(tmp)
    duplicate_instr_isa = tmp_list

    return duplicate_instr_isa, only_in_isa


def _get_sanity_report(
    total,
    m_tp,
    m_l,
    m_pp,
    suspic_instr,
    dup_arch,
    dup_isa,
    only_isa,
    bad_operands,
    verbose=False,
    colors=False,
):
    """Get sanity summary report."""
    s = ""
    # non-verbose summary
    s += "SUMMARY\n----------------------\n"
    s += "{}% ({}/{}) of instruction forms have no throughput value.\n".format(
        round(100 * len(m_tp) / total), len(m_tp), total
    )
    s += "{}% ({}/{}) of instruction forms have no latency value.\n".format(
        round(100 * len(m_l) / total), len(m_l), total
    )
    s += "{}% ({}/{}) of instruction forms have no port pressure assignment.\n".format(
        round(100 * len(m_pp) / total), len(m_pp), total
    )
    s += "{}% ({}/{}) of instruction forms might miss an ISA DB entry.\n".format(
        round(100 * len(suspic_instr) / total), len(suspic_instr), total
    )
    s += "{} duplicate instruction forms in uarch DB.\n".format(len(dup_arch))
    s += "{} duplicate instruction forms in ISA DB.\n".format(len(dup_isa))
    s += (
        "{} instruction forms in ISA DB are not referenced by instruction ".format(len(only_isa))
        + "forms in uarch DB.\n"
    )
    s += "{} bad operands found in uarch DB\n".format(len(bad_operands))
    s += "----------------------\n"
    # verbose version
    if verbose:
        s += _get_sanity_report_verbose(
            total,
            m_tp,
            m_l,
            m_pp,
            suspic_instr,
            dup_arch,
            dup_isa,
            only_isa,
            bad_operands,
            colors=colors,
        )
    return s


def _get_sanity_report_verbose(
    total,
    m_tp,
    m_l,
    m_pp,
    suspic_instr,
    dup_arch,
    dup_isa,
    only_isa,
    bad_operands,
    colors=False,
):
    """Get the verbose part of the sanity report with all missing instruction forms."""
    BRIGHT_CYAN = "\033[1;36;1m" if colors else ""
    BRIGHT_BLUE = "\033[1;34;1m" if colors else ""
    BRIGHT_RED = "\033[1;31;1m" if colors else ""
    BRIGHT_MAGENTA = "\033[1;35;1m" if colors else ""
    BRIGHT_YELLOW = "\033[1;33;1m" if colors else ""
    CYAN = "\033[36m" if colors else ""
    YELLOW = "\033[33m" if colors else ""
    WHITE = "\033[0m" if colors else ""

    s = "Instruction forms without throughput value:\n" if m_tp else ""
    for instr_form in sorted(m_tp, key=lambda i: i["name"]):
        s += "{}{}{}\n".format(BRIGHT_BLUE, _get_full_instruction_name(instr_form), WHITE)
    s += "Instruction forms without latency value:\n" if m_l else ""
    for instr_form in sorted(m_l, key=lambda i: i["name"]):
        s += "{}{}{}\n".format(BRIGHT_RED, _get_full_instruction_name(instr_form), WHITE)
    s += "Instruction forms without port pressure assignment:\n" if m_pp else ""
    for instr_form in sorted(m_pp, key=lambda i: i["name"]):
        s += "{}{}{}\n".format(BRIGHT_MAGENTA, _get_full_instruction_name(instr_form), WHITE)
    s += "Instruction forms which might miss an ISA DB entry:\n" if suspic_instr else ""
    for instr_form in sorted(suspic_instr, key=lambda i: i["name"]):
        s += "{}{}{}{}\n".format(
            BRIGHT_CYAN,
            _get_full_instruction_name(instr_form),
            " -- " + instr_form["note"] if "note" in instr_form else "",
            WHITE,
        )
    s += "Duplicate instruction forms in uarch DB:\n" if dup_arch else ""
    for instr_form in sorted(dup_arch, key=lambda i: i["name"]):
        s += "{}{}{}\n".format(YELLOW, _get_full_instruction_name(instr_form), WHITE)
    s += "Duplicate instruction forms in ISA DB:\n" if dup_isa else ""
    for instr_form in sorted(dup_isa, key=lambda i: i["name"]):
        s += "{}{}{}\n".format(BRIGHT_YELLOW, _get_full_instruction_name(instr_form), WHITE)
    s += "Instruction forms existing in ISA DB but not in uarch DB:\n" if only_isa else ""
    for instr_form in sorted(only_isa, key=lambda i: i["name"]):
        s += "{}{}{}\n".format(CYAN, _get_full_instruction_name(instr_form), WHITE)
    s += "{} bad operands found in uarch DB:\n".format(len(bad_operands)) if bad_operands else ""
    for instr_form in sorted(bad_operands, key=lambda i: i["name"]):
        s += "{}{}{}\n".format(BRIGHT_RED, _get_full_instruction_name(instr_form), WHITE)
    return s


###################
# GENERIC HELPERS #
###################


def _get_full_instruction_name(instruction_form):
    """Get full instruction form name/identifier string out of given instruction form."""
    operands = []
    for op in instruction_form["operands"]:
        op_attrs = [
            y + ":" + str(op[y])
            for y in list(filter(lambda x: True if x != "class" else False, op))
        ]
        operands.append("{}({})".format(op["class"], ",".join(op_attrs)))
    return "{}  {}".format(instruction_form["name"], ",".join(operands))


def __represent_none(self, data):
    """Get YAML None representation."""
    return self.represent_scalar(u"tag:yaml.org,2002:null", u"~")


def _create_yaml_object():
    """Create YAML module with None representation."""
    yaml_obj = ruamel.yaml.YAML()
    yaml_obj.representer.add_representer(type(None), __represent_none)
    return yaml_obj


def __dump_data_to_yaml(filepath, data):
    """Dump data to YAML file at given filepath."""
    # first add 'normal' meta data in the right order (no ordered dict yet)
    meta_data = dict(data)
    del meta_data["instruction_forms"]
    del meta_data["port_model_scheme"]
    with open(filepath, "w") as f:
        ruamel.yaml.dump(meta_data, f, allow_unicode=True)
    with open(filepath, "a") as f:
        # now add port model scheme in |-scheme for better readability
        ruamel.yaml.dump(
            {"port_model_scheme": data["port_model_scheme"]},
            f,
            allow_unicode=True,
            default_style="|",
        )
        # finally, add instruction forms
        ruamel.yaml.dump({"instruction_forms": data["instruction_forms"]}, f, allow_unicode=True)