OSACA/osaca/semantics/arch_semantics.py

#!/usr/bin/env python3
"""Semantics opbject responsible for architecture specific semantic operations"""
import sys
import warnings
from itertools import chain
from operator import itemgetter
from copy import deepcopy

from .hw_model import MachineModel
from .isa_semantics import INSTR_FLAGS, ISASemantics
from osaca.parser.memory import MemoryOperand
from osaca.parser.register import RegisterOperand


class ArchSemantics(ISASemantics):
    def __init__(self, parser, machine_model: MachineModel, path_to_yaml=None):
        """
        Create the architecture-specific semantics layer.

        :param parser: assembly parser, forwarded to ``ISASemantics``
        :param machine_model: machine model that is queried by the methods of this class
        :param path_to_yaml: forwarded unchanged to ``ISASemantics``
            (NOTE(review): presumably the path of the ISA description file -- confirm)
        """
        super().__init__(parser, path_to_yaml=path_to_yaml)
        # kept for all later lookups (ports, instruction data, load/store data)
        self._machine_model = machine_model

    def normalize_instruction_form(self, instruction_form):
        self.parser.normalize_instruction_form(
            instruction_form, self.isa_model, self._machine_model
        )

    def normalize_instruction_forms(self, instruction_forms):
        for instruction_form in instruction_forms:
            self.normalize_instruction_form(instruction_form)

    def _check_normalized(self, instruction_forms):
        for instruction_form in instruction_forms:
            instruction_form.check_normalized()

    # SUMMARY FUNCTION
    def add_semantics(self, kernel):
        """
        Applies performance data (throughput, latency, port pressure) and source/destination
        distribution to each instruction of a given kernel.

        :param list kernel: kernel to apply semantics
        """
        self._check_normalized(kernel)
        for instruction_form in kernel:
            self.assign_src_dst(instruction_form)
            self.assign_tp_lt(instruction_form)
        if self._machine_model.has_hidden_loads():
            self.set_hidden_loads(kernel)

    def assign_optimal_throughput(self, kernel, start=0):
        """
        Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.

        The kernel is processed back to front: the list is reversed in place at the
        beginning and reversed again before returning. The ``port_pressure`` (and, for
        instructions with alternative port assignments, ``port_uops``) of the instruction
        forms are modified in place.

        :param list kernel: kernel to apply optimal port utilization
        :param int start: index, counted in the reversed kernel, of the first instruction
            form to balance; used by the recursive calls of this method
        """
        self._check_normalized(kernel)
        # step size in cycles by which pressure is moved between two ports
        INC = 0.01
        kernel.reverse()
        port_list = self._machine_model.get_ports()
        # NOTE(review): this flag is reset for every instruction, so after the loop it only
        # reflects the last processed instruction form -- confirm this is intended
        multiple_assignments = False
        for idx, instruction_form in enumerate(kernel[start:], start):
            multiple_assignments = False
            # if iform has multiple possible port assignments, check all in a DFS manner and take the best
            if isinstance(instruction_form.port_uops, dict):
                best_kernel = None
                best_kernel_tp = sys.maxsize
                # every alternative but the first one is evaluated on a deep copy of the kernel
                for port_util_alt in list(instruction_form.port_uops.values())[1:]:
                    k_tmp = deepcopy(kernel)
                    k_tmp[idx].port_uops = deepcopy(port_util_alt)
                    k_tmp[idx].port_pressure = self._machine_model.average_port_pressure(
                        k_tmp[idx].port_uops
                    )
                    # the recursive call reverses its argument itself, so undo our reversal first
                    k_tmp.reverse()
                    self.assign_optimal_throughput(k_tmp, idx)
                    # keep the copy with the lowest maximum port sum
                    if max(self.get_throughput_sum(k_tmp)) < best_kernel_tp:
                        best_kernel = k_tmp
                        best_kernel_tp = max(self.get_throughput_sum(best_kernel))
                # check the first option in the main branch and compare against the best option later
                multiple_assignments = True
                kernel[idx].port_uops = list(instruction_form.port_uops.values())[0]
            for uop in instruction_form.port_uops:
                # a uop is read as (cycles, ports): uop[0] cycles spread over the ports in uop[1]
                cycles = uop[0]
                ports = list(uop[1])
                indices = [port_list.index(p) for p in ports]
                # check if port sum of used ports for uop are unbalanced
                # port_sums: kernel-wide sums on the ports of this uop
                # instr_ports: pressure of this instruction form on the same ports
                port_sums = self._to_list(itemgetter(*indices)(self.get_throughput_sum(kernel)))
                instr_ports = self._to_list(itemgetter(*indices)(instruction_form.port_pressure))
                if len(set(port_sums)) > 1:
                    # balance ports
                    # init list for keeping track of the current change
                    differences = [cycles / len(ports) for p in ports]
                    # at most cycles / INC balancing steps per uop
                    for _ in range(int(cycles * (1 / INC))):
                        if len(instr_ports) == 1:
                            # no balancing possible anymore
                            break
                        # move INC from the port with the highest sum to the one with the lowest
                        max_port_idx = port_sums.index(max(port_sums))
                        min_port_idx = port_sums.index(min(port_sums))
                        instr_ports[max_port_idx] -= INC
                        instr_ports[min_port_idx] += INC
                        differences[max_port_idx] -= INC
                        differences[min_port_idx] += INC
                        # instr_ports = [round(p, 2) for p in instr_ports]
                        # write the shifted values back into the instruction form
                        self._itemsetter(*indices)(instruction_form.port_pressure, *instr_ports)
                        # check if min port is zero
                        if round(min(instr_ports), 2) <= 0:
                            # if port_pressure is not exactly 0.00, add the residual to
                            # the former port
                            if min(instr_ports) != 0.0:
                                min_port_idx = port_sums.index(min(port_sums))
                                instr_ports[min_port_idx] += min(instr_ports)
                                differences[min_port_idx] += min(instr_ports)
                                # we don't need to decrease difference for other port, just
                                # delete it
                                del differences[instr_ports.index(min(instr_ports))]
                                self._itemsetter(*indices)(
                                    instruction_form.port_pressure, *instr_ports
                                )
                                # first used port whose pressure rounds to zero or is negative
                                zero_index = [
                                    p
                                    for p in indices
                                    if round(instruction_form.port_pressure[p], 2) == 0
                                    or instruction_form.port_pressure[p] < 0.00
                                ][0]
                                instruction_form.port_pressure[zero_index] = 0.0
                            # Remove from further balancing
                            indices = [p for p in indices if instruction_form.port_pressure[p] > 0]
                            instr_ports = self._to_list(
                                itemgetter(*indices)(instruction_form.port_pressure)
                            )
                        # never remove more than the fixed utilization per uop and port, i.e.,
                        # cycles/len(ports)
                        if round(min(differences), 2) <= 0:
                            # don't worry if port_pressure isn't exactly 0 and just
                            # remove from further balancing by deleting index since
                            # pressure is not 0
                            del indices[differences.index(min(differences))]
                            instr_ports = self._to_list(
                                itemgetter(*indices)(instruction_form.port_pressure)
                            )
                            del differences[differences.index(min(differences))]
                        # refresh the kernel-wide sums for the ports that are still balanced
                        port_sums = self._to_list(
                            itemgetter(*indices)(self.get_throughput_sum(kernel))
                        )
        # restore the original instruction order
        kernel.reverse()
        if multiple_assignments:
            # take over the best alternative if it beats the first option balanced above
            if max(self.get_throughput_sum(kernel)) > best_kernel_tp:
                for i, instr in enumerate(best_kernel):
                    kernel[i].port_uops = best_kernel[i].port_uops
                    kernel[i].port_pressure = best_kernel[i].port_pressure

    def set_hidden_loads(self, kernel):
        """
        Hide loads behind stores if the architecture supports hidden loads (deprecated).

        Instructions that both load and store are ignored. If there are at most as many
        loads as stores, every load is hidden; otherwise each store hides the not yet hidden
        load with the smallest line-number distance. A hidden load gets the ``HIDDEN_LD``
        flag and its data-port pressure is set to zero.

        :param list kernel: kernel whose instruction forms are modified in place
        """
        self._check_normalized(kernel)
        load_forms = [form for form in kernel if INSTR_FLAGS.HAS_LD in form.flags]
        store_forms = [form for form in kernel if INSTR_FLAGS.HAS_ST in form.flags]
        # lines that load *and* store take no part in the pairing
        load_lines = {form.line_number for form in load_forms}
        store_lines = {form.line_number for form in store_forms}
        mixed_lines = load_lines & store_lines
        load_forms = [form for form in load_forms if form.line_number not in mixed_lines]
        store_forms = [form for form in store_forms if form.line_number not in mixed_lines]

        if not store_forms or not load_forms:
            # nothing to do
            return

        if len(load_forms) <= len(store_forms):
            # enough stores for every load --> hide all of them
            for hidden in load_forms:
                hidden.flags += [INSTR_FLAGS.HIDDEN_LD]
                hidden.port_pressure = self._nullify_data_ports(hidden.port_pressure)
            return

        for store_form in store_forms:
            # (distance, line number) of every load that is still visible
            candidates = [
                (abs(cand.line_number - store_form.line_number), cand.line_number)
                for cand in load_forms
                if INSTR_FLAGS.HIDDEN_LD not in cand.flags
            ]
            _, closest_line = min(candidates)
            hidden = [form for form in kernel if form.line_number == closest_line][0]
            hidden.flags += [INSTR_FLAGS.HIDDEN_LD]
            hidden.port_pressure = self._nullify_data_ports(hidden.port_pressure)

    # get parser result and assign throughput and latency value to instruction form
    # mark instruction form with semantic flags
    def assign_tp_lt(self, instruction_form):
        """Assign throughput and latency to an instruction form."""
        instruction_form.check_normalized()
        flags = []
        port_number = len(self._machine_model["ports"])
        if instruction_form.mnemonic is None:
            # No instruction (label, comment, ...) --> ignore
            throughput = 0.0
            latency = 0.0
            latency_wo_load = latency
            instruction_form.port_pressure = [0.0 for i in range(port_number)]
            instruction_form.port_uops = []
        else:
            instruction_data = self._machine_model.get_instruction(
                instruction_form.mnemonic, instruction_form.operands
            )
            if instruction_data:
                # instruction form in DB
                (
                    throughput,
                    port_pressure,
                    latency,
                    latency_wo_load,
                ) = self._handle_instruction_found(
                    instruction_data, port_number, instruction_form, flags
                )
            else:
                # instruction could not be found in DB
                assign_unknown = True
                # check for equivalent register-operands DB entry if LD
                if (
                    INSTR_FLAGS.HAS_LD in instruction_form.flags
                    or INSTR_FLAGS.HAS_ST in instruction_form.flags
                ):
                    # dynamically combine LD/ST and reg form of instruction form
                    # substitute mem and look for reg-only variant
                    operands = self.substitute_mem_address(instruction_form.operands)
                    instruction_data_reg = self._machine_model.get_instruction(
                        instruction_form.mnemonic, operands
                    )
                    if instruction_data_reg:
                        assign_unknown = False
                        reg_type = self._parser.get_reg_type(
                            instruction_data_reg.operands[
                                operands.index(self._create_reg_wildcard())
                            ]
                        )
                        # dummy_reg = {"class": "register", "name": reg_type}
                        dummy_reg = RegisterOperand(name=reg_type)
                        data_port_pressure = [0.0 for _ in range(port_number)]
                        data_port_uops = []
                        if INSTR_FLAGS.HAS_LD in instruction_form.flags:
                            # LOAD performance data
                            load_perf_data = self._machine_model.get_load_throughput(
                                [
                                    x
                                    for x in instruction_form.semantic_operands["source"]
                                    + instruction_form.semantic_operands["src_dst"]
                                    if isinstance(x, MemoryOperand)
                                ][0]
                            )
                            # if multiple options, choose based on reg type
                            data_port_uops = [
                                ldp[1]
                                for ldp in load_perf_data
                                if ldp[0].dst is not None
                                and self._machine_model._check_operands(
                                    dummy_reg, RegisterOperand(name=ldp[0].dst)
                                )
                            ]
                            if len(data_port_uops) < 1:
                                data_port_uops = load_perf_data[0][1]
                            else:
                                data_port_uops = data_port_uops[0]
                            data_port_pressure = self._machine_model.average_port_pressure(
                                data_port_uops
                            )
                            if "load_throughput_multiplier" in self._machine_model:
                                multiplier = self._machine_model["load_throughput_multiplier"][
                                    reg_type
                                ]
                                data_port_pressure = [pp * multiplier for pp in data_port_pressure]
                        if INSTR_FLAGS.HAS_ST in instruction_form.flags:
                            # STORE performance data
                            destinations = (
                                instruction_form.semantic_operands["destination"]
                                + instruction_form.semantic_operands["src_dst"]
                            )
                            store_perf_data = self._machine_model.get_store_throughput(
                                [x for x in destinations if isinstance(x, MemoryOperand)][0],
                                dummy_reg,
                            )
                            st_data_port_uops = store_perf_data[0][1]

                            # zero data port pressure and remove HAS_ST flag if
                            #   - no mem operand in dst &&
                            #   - all mem operands in src_dst are pre-/post_indexed
                            # since it is no mem store
                            if (
                                self._parser.isa() == "aarch64"
                                and not isinstance(
                                    instruction_form.semantic_operands["destination"],
                                    MemoryOperand,
                                )
                                and all(
                                    [
                                        op.post_indexed or op.pre_indexed
                                        for op in instruction_form.semantic_operands["src_dst"]
                                        if isinstance(op, MemoryOperand)
                                    ]
                                )
                            ):
                                st_data_port_uops = []
                                instruction_form.flags.remove(INSTR_FLAGS.HAS_ST)

                            # sum up all data ports in case for LOAD and STORE
                            st_data_port_pressure = self._machine_model.average_port_pressure(
                                st_data_port_uops
                            )
                            if "store_throughput_multiplier" in self._machine_model:
                                multiplier = self._machine_model["store_throughput_multiplier"][
                                    reg_type
                                ]
                                st_data_port_pressure = [
                                    pp * multiplier for pp in st_data_port_pressure
                                ]
                            data_port_pressure = [
                                sum(x) for x in zip(data_port_pressure, st_data_port_pressure)
                            ]
                            data_port_uops += st_data_port_uops
                        throughput = max(max(data_port_pressure), instruction_data_reg.throughput)
                        latency = instruction_data_reg.latency
                        # Add LD and ST latency
                        latency += (
                            self._machine_model.get_load_latency(reg_type)
                            if INSTR_FLAGS.HAS_LD in instruction_form.flags
                            else 0
                        )
                        latency += (
                            self._machine_model.get_store_latency(reg_type)
                            if INSTR_FLAGS.HAS_ST in instruction_form.flags
                            else 0
                        )
                        latency_wo_load = instruction_data_reg.latency
                        # add latency of ADD if post- or pre_indexed load
                        # TODO more investigation: check dot-graph, wrong latency distribution!
                        # if (
                        #     latency_wo_load == 0
                        #     and self._isa == 'aarch64'
                        #     and any(
                        #         [
                        #             'post_indexed' in op['memory'] or
                        #             'pre_indexed' in op['memory']
                        #             for op in instruction_form.operands
                        #             if 'memory' in op
                        #         ]
                        #     )
                        # ):
                        #     latency_wo_load = 1.0
                        instruction_form.port_pressure = [
                            sum(x)
                            for x in zip(
                                data_port_pressure,
                                self._machine_model.average_port_pressure(
                                    instruction_data_reg.port_pressure
                                ),
                            )
                        ]
                        instruction_form.port_uops = list(
                            chain(instruction_data_reg.port_pressure, data_port_uops)
                        )

                if assign_unknown:
                    # --> mark as unknown and assume 0 cy for latency/throughput
                    throughput = 0.0
                    latency = 0.0
                    latency_wo_load = latency
                    instruction_form.port_pressure = [0.0 for i in range(port_number)]
                    # instruction_formport_uops = []
                    flags += [INSTR_FLAGS.TP_UNKWN, INSTR_FLAGS.LT_UNKWN]
        # flatten flag list
        flags = list(set(flags))
        if instruction_form.flags == []:
            instruction_form.flags = flags
        else:
            instruction_form.flags += flags
        instruction_form.throughput = throughput
        instruction_form.latency = latency
        instruction_form.latency_wo_load = latency_wo_load
        # for later CP and loop-carried dependency analysis
        instruction_form.latency_cp = 0
        instruction_form.latency_lcd = 0

    def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags):
        """
        Apply performance data to instruction if it was found in the archDB.

        ``port_uops`` and ``port_pressure`` are written to ``instruction_form`` and the
        given ``flags`` list is extended in place (``NOT_BOUND``, ``TP_UNKWN``, ``LT_UNKWN``,
        ``LD``).

        :param instruction_data: machine model entry of the instruction
        :param int port_number: number of ports of the machine model
        :param instruction_form: normalized instruction form, modified in place
        :param list flags: flag list to append to
        :returns: tuple ``(throughput, port_pressure, latency, latency_wo_load)``; an
            unknown throughput or latency is returned as ``0.0``
        """
        instruction_form.check_normalized()
        throughput = instruction_data.throughput
        port_pressure = self._machine_model.average_port_pressure(instruction_data.port_pressure)
        instruction_form.port_uops = instruction_data.port_pressure
        # Explicit validation instead of ``assert``: assertions are removed when Python
        # runs with -O, which would let a malformed DB entry pass unnoticed.
        if isinstance(port_pressure, list) and len(port_pressure) == port_number:
            instruction_form.port_pressure = port_pressure
            if sum(port_pressure) == 0 and throughput is not None:
                # port pressure on all ports 0 --> not bound to a port
                flags.append(INSTR_FLAGS.NOT_BOUND)
        else:
            warnings.warn(
                "Port pressure could not be imported correctly from database. "
                + "Please check entry for:\n {}".format(instruction_form)
            )
            instruction_form.port_pressure = [0.0 for i in range(port_number)]
            instruction_form.port_uops = []
            flags.append(INSTR_FLAGS.TP_UNKWN)
        if throughput is None:
            # assume 0 cy and mark as unknown
            throughput = 0.0
            flags.append(INSTR_FLAGS.TP_UNKWN)
        latency = instruction_data.latency
        latency_wo_load = latency
        if latency is None:
            # assume 0 cy and mark as unknown
            latency = 0.0
            latency_wo_load = latency
            flags.append(INSTR_FLAGS.LT_UNKWN)
        if INSTR_FLAGS.HAS_LD in instruction_form.flags:
            flags.append(INSTR_FLAGS.LD)
        return throughput, port_pressure, latency, latency_wo_load

    def convert_op_to_reg(self, reg_type, regtype="0"):
        """Create register operand for a memory addressing operand"""
        if self._parser.isa() == "x86":
            if reg_type == "gpr":
                register = RegisterOperand(name="r" + str(int(regtype) + 9))
            else:
                register = RegisterOperand(name=reg_type + regtype)
        elif self._parser.isa() == "aarch64":
            register = RegisterOperand(name=regtype, prefix=reg_type)
        return register

    def _nullify_data_ports(self, port_pressure):
        """Set all ports to 0.0 for the ports of a machine model"""
        data_ports = self._machine_model.get_data_ports()
        for port in data_ports:
            index = self._machine_model.get_ports().index(port)
            port_pressure[index] = 0.0
        return port_pressure

    def _itemsetter(self, *items):
        if len(items) == 1:
            item = items[0]

            def g(obj, value):
                obj[item] = value

        else:

            def g(obj, *values):
                for item, value in zip(items, values):
                    obj[item] = value

        return g

    def _to_list(self, obj):
        if isinstance(obj, tuple):
            return list(obj)
        else:
            return [obj]

    @staticmethod
    def get_throughput_sum(kernel):
        """Get the overall throughput sum separated by port of all instructions of a kernel."""
        # ignoring all lines with throughput == 0.0, because there won't be anything to sum up
        # typically comment, label and non-instruction lines
        port_pressures = [instr.port_pressure for instr in kernel if instr.throughput != 0.0]
        # Essentially summing up each columns of port_pressures, where each column is one port
        # and each row is one line of the kernel
        # round is necessary to ensure termination of ArchsSemantics.assign_optimal_throughput
        tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
        return tp_sum