Files
OSACA/osaca/semantics/arch_semantics.py
2021-08-26 16:58:19 +02:00

434 lines
21 KiB
Python
Executable File

#!/usr/bin/env python3
"""Semantics object responsible for architecture-specific semantic operations"""
import warnings
from itertools import chain
from operator import itemgetter
from .hw_model import MachineModel
from .isa_semantics import INSTR_FLAGS, ISASemantics
class ArchSemantics(ISASemantics):
GAS_SUFFIXES = "bswlqt"
def __init__(self, machine_model: MachineModel, path_to_yaml=None):
    """
    Initialize the architecture semantics for the ISA of a machine model.

    The lower-cased ISA name reported by the machine model is handed to the parent
    initializer and also kept on the instance, together with the machine model itself.

    :param machine_model: machine model providing the ISA name via ``get_ISA()``
    :param path_to_yaml: forwarded unchanged to the parent initializer
    """
    isa_name = machine_model.get_ISA().lower()
    super().__init__(isa_name, path_to_yaml=path_to_yaml)
    self._machine_model = machine_model
    self._isa = isa_name
# SUMMARY FUNCTION
def add_semantics(self, kernel):
    """
    Annotate every instruction form of a kernel with semantic information.

    Each instruction form first gets its source/destination distribution and then its
    performance data (throughput, latency, port pressure). Afterwards, if the machine
    model reports hidden loads, the hidden-load handling is run once on the whole kernel.

    :param list kernel: kernel to apply semantics
    """
    for form in kernel:
        self.assign_src_dst(form)
        self.assign_tp_lt(form)
    if not self._machine_model.has_hidden_loads():
        return
    self.set_hidden_loads(kernel)
def assign_optimal_throughput(self, kernel):
    """
    Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.

    For every uop of every instruction form, pressure is moved in ``INC`` steps from the
    port with the highest kernel-wide pressure sum to the port with the lowest one, among
    the ports this uop may use. The ``port_pressure`` lists of the instruction forms are
    modified in place.

    The kernel list itself is reversed in place while balancing and reversed back before
    returning.

    :param list kernel: kernel to apply optimal port utilization
    """
    INC = 0.01
    # iterate the kernel back to front; undone by the second reverse() at the end
    kernel.reverse()
    port_list = self._machine_model.get_ports()
    for instruction_form in kernel:
        for uop in instruction_form["port_uops"]:
            # a uop is indexed as (cycles, ports)
            cycles = uop[0]
            ports = list(uop[1])
            # positions of the uop's ports within the machine model's port list
            indices = [port_list.index(p) for p in ports]
            # check if port sum of used ports for uop are unbalanced
            port_sums = self._to_list(itemgetter(*indices)(self.get_throughput_sum(kernel)))
            instr_ports = self._to_list(
                itemgetter(*indices)(instruction_form["port_pressure"])
            )
            if len(set(port_sums)) > 1:
                # balance ports
                # init list for keeping track of the current change
                differences = [cycles / len(ports) for p in ports]
                # at most one INC step per 0.01 cy of this uop
                for _ in range(int(cycles * (1 / INC))):
                    if len(instr_ports) == 1:
                        # no balancing possible anymore
                        break
                    max_port_idx = port_sums.index(max(port_sums))
                    min_port_idx = port_sums.index(min(port_sums))
                    # shift one step from the most loaded to the least loaded port
                    instr_ports[max_port_idx] -= INC
                    instr_ports[min_port_idx] += INC
                    differences[max_port_idx] -= INC
                    differences[min_port_idx] += INC
                    # instr_ports = [round(p, 2) for p in instr_ports]
                    # write the updated values back into the instruction form
                    self._itemsetter(*indices)(instruction_form["port_pressure"], *instr_ports)
                    # check if min port is zero
                    if round(min(instr_ports), 2) <= 0:
                        # if port_pressure is not exactly 0.00, add the residual to
                        # the former port
                        if min(instr_ports) != 0.0:
                            min_port_idx = port_sums.index(min(port_sums))
                            instr_ports[min_port_idx] += min(instr_ports)
                            differences[min_port_idx] += min(instr_ports)
                            # we don't need to decrease difference for other port, just
                            # delete it
                            del differences[instr_ports.index(min(instr_ports))]
                            self._itemsetter(*indices)(
                                instruction_form["port_pressure"], *instr_ports
                            )
                        # first port of this uop whose pressure rounds to zero
                        zero_index = [
                            p
                            for p in indices
                            if round(instruction_form["port_pressure"][p], 2) == 0
                        ][0]
                        instruction_form["port_pressure"][zero_index] = 0.0
                        # Remove from further balancing
                        indices = [
                            p for p in indices if instruction_form["port_pressure"][p] > 0
                        ]
                        instr_ports = self._to_list(
                            itemgetter(*indices)(instruction_form["port_pressure"])
                        )
                    # never remove more than the fixed utilization per uop and port, i.e.,
                    # cycles/len(ports)
                    if round(min(differences), 2) <= 0:
                        # don't worry if port_pressure isn't exactly 0 and just
                        # remove from further balancing by deleting index since
                        # pressure is not 0
                        del indices[differences.index(min(differences))]
                        instr_ports = self._to_list(
                            itemgetter(*indices)(instruction_form["port_pressure"])
                        )
                        del differences[differences.index(min(differences))]
                    # refresh the kernel-wide sums for the ports still being balanced
                    port_sums = self._to_list(
                        itemgetter(*indices)(self.get_throughput_sum(kernel))
                    )
    # restore the original kernel order
    kernel.reverse()
def set_hidden_loads(self, kernel):
    """
    Hide loads behind stores if architecture supports hidden loads (deprecated).

    Instruction forms that both load and store are ignored. If there are at most as many
    remaining loads as stores, every load is hidden. Otherwise, for each store the not yet
    hidden load with the smallest line-number distance is hidden. Hiding a load adds the
    ``HIDDEN_LD`` flag and sets its pressure on the data ports to zero.

    :param list kernel: kernel whose instruction forms are updated in place
    """
    load_forms = [form for form in kernel if INSTR_FLAGS.HAS_LD in form["flags"]]
    store_forms = [form for form in kernel if INSTR_FLAGS.HAS_ST in form["flags"]]
    # lines that both load and store take no part in the hiding
    load_lines = {form["line_number"] for form in load_forms}
    store_lines = {form["line_number"] for form in store_forms}
    mixed_lines = load_lines & store_lines
    pure_loads = [form for form in load_forms if form["line_number"] not in mixed_lines]
    pure_stores = [form for form in store_forms if form["line_number"] not in mixed_lines]
    if not pure_stores or not pure_loads:
        # nothing to hide
        return

    def hide(load_form):
        load_form["flags"] += [INSTR_FLAGS.HIDDEN_LD]
        load_form["port_pressure"] = self._nullify_data_ports(load_form["port_pressure"])

    if len(pure_loads) <= len(pure_stores):
        # enough stores to cover every load
        for load_form in pure_loads:
            hide(load_form)
        return

    for store_form in pure_stores:
        # closest load that is still visible; ties go to the smaller line number
        _, nearest_line = min(
            (
                abs(candidate["line_number"] - store_form["line_number"]),
                candidate["line_number"],
            )
            for candidate in pure_loads
            if INSTR_FLAGS.HIDDEN_LD not in candidate["flags"]
        )
        hide(next(form for form in kernel if form["line_number"] == nearest_line))
# get parser result and assign throughput and latency value to instruction form
# mark instruction form with semantic flags
def assign_tp_lt(self, instruction_form):
    """
    Assign throughput and latency to an instruction form.

    The instruction form is updated in place. After the call it carries the keys
    ``port_pressure``, ``port_uops``, ``flags``, ``throughput``, ``latency``,
    ``latency_wo_load``, ``latency_cp`` and ``latency_lcd``.

    Lookup order:

    1. Lines without an instruction (label, comment, ...) get zero values.
    2. The instruction is looked up in the machine model; on x86 a second lookup without
       a trailing GAS suffix is tried.
    3. If it is not found but the form has a load or store flag, the memory operand is
       substituted and the register-only variant is looked up; its data is then combined
       with the load/store data of the machine model.
    4. Otherwise the form is marked with the unknown-throughput and unknown-latency flags
       and gets zero values.

    :param instruction_form: instruction form to annotate
    """
    flags = []
    port_number = len(self._machine_model["ports"])
    if instruction_form["instruction"] is None:
        # No instruction (label, comment, ...) --> ignore
        throughput = 0.0
        latency = 0.0
        latency_wo_load = latency
        instruction_form["port_pressure"] = [0.0 for i in range(port_number)]
        instruction_form["port_uops"] = []
    else:
        instruction_data = self._machine_model.get_instruction(
            instruction_form["instruction"], instruction_form["operands"]
        )
        if (
            not instruction_data
            and self._isa == "x86"
            and instruction_form["instruction"][-1] in self.GAS_SUFFIXES
        ):
            # check for instruction without GAS suffix
            instruction_data = self._machine_model.get_instruction(
                instruction_form["instruction"][:-1], instruction_form["operands"]
            )
        if instruction_data:
            # instruction form in DB
            (
                throughput,
                port_pressure,
                latency,
                latency_wo_load,
            ) = self._handle_instruction_found(
                instruction_data, port_number, instruction_form, flags
            )
        else:
            # instruction could not be found in DB
            assign_unknown = True
            # check for equivalent register-operands DB entry if LD
            if (
                INSTR_FLAGS.HAS_LD in instruction_form["flags"]
                or INSTR_FLAGS.HAS_ST in instruction_form["flags"]
            ):
                # dynamically combine LD/ST and reg form of instruction form
                # substitute mem and look for reg-only variant
                operands = self.substitute_mem_address(instruction_form["operands"])
                instruction_data_reg = self._machine_model.get_instruction(
                    instruction_form["instruction"], operands
                )
                if (
                    not instruction_data_reg
                    and self._isa == "x86"
                    and instruction_form["instruction"][-1] in self.GAS_SUFFIXES
                ):
                    # check for instruction without GAS suffix
                    instruction_data_reg = self._machine_model.get_instruction(
                        instruction_form["instruction"][:-1], operands
                    )
                if instruction_data_reg:
                    assign_unknown = False
                    # register type of the operand that replaced the memory operand
                    reg_type = self._parser.get_reg_type(
                        instruction_data_reg["operands"][
                            operands.index(self._create_reg_wildcard())
                        ]
                    )
                    data_port_pressure = [0.0 for _ in range(port_number)]
                    data_port_uops = []
                    if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
                        # LOAD performance data
                        data_port_uops = self._machine_model.get_load_throughput(
                            [
                                x["memory"]
                                for x in instruction_form["semantic_operands"]["source"]
                                + instruction_form["semantic_operands"]["src_dst"]
                                if "memory" in x
                            ][0]
                        )
                        data_port_pressure = self._machine_model.average_port_pressure(
                            data_port_uops
                        )
                        if "load_throughput_multiplier" in self._machine_model:
                            multiplier = self._machine_model["load_throughput_multiplier"][
                                reg_type
                            ]
                            data_port_pressure = [pp * multiplier for pp in data_port_pressure]
                    if INSTR_FLAGS.HAS_ST in instruction_form["flags"]:
                        # STORE performance data
                        destinations = (
                            instruction_form["semantic_operands"]["destination"]
                            + instruction_form["semantic_operands"]["src_dst"]
                        )
                        st_data_port_uops = self._machine_model.get_store_throughput(
                            [x["memory"] for x in destinations if "memory" in x][0]
                        )
                        # zero data port pressure and remove HAS_ST flag if
                        #  - no mem operand in dst &&
                        #  - all mem operands in src_dst are pre-/post-indexed
                        # since it is no mem store
                        # NOTE: the destination list holds operand dicts, so every
                        # operand has to be inspected; testing the string against the
                        # list itself would always report "no memory operand".
                        if (
                            self._isa == "aarch64"
                            and not any(
                                "memory" in op
                                for op in instruction_form["semantic_operands"]["destination"]
                            )
                            and all(
                                "post_indexed" in op["memory"]
                                or "pre_indexed" in op["memory"]
                                for op in instruction_form["semantic_operands"]["src_dst"]
                                if "memory" in op
                            )
                        ):
                            st_data_port_uops = []
                            instruction_form["flags"].remove(INSTR_FLAGS.HAS_ST)
                        # sum up all data ports in case for LOAD and STORE
                        st_data_port_pressure = self._machine_model.average_port_pressure(
                            st_data_port_uops
                        )
                        if "store_throughput_multiplier" in self._machine_model:
                            multiplier = self._machine_model["store_throughput_multiplier"][
                                reg_type
                            ]
                            st_data_port_pressure = [
                                pp * multiplier for pp in st_data_port_pressure
                            ]
                        data_port_pressure = [
                            sum(x) for x in zip(data_port_pressure, st_data_port_pressure)
                        ]
                        data_port_uops += st_data_port_uops
                    throughput = max(
                        max(data_port_pressure), instruction_data_reg["throughput"]
                    )
                    latency = instruction_data_reg["latency"]
                    # Add LD and ST latency
                    latency += (
                        self._machine_model.get_load_latency(reg_type)
                        if INSTR_FLAGS.HAS_LD in instruction_form["flags"]
                        else 0
                    )
                    latency += (
                        self._machine_model.get_store_latency(reg_type)
                        if INSTR_FLAGS.HAS_ST in instruction_form["flags"]
                        else 0
                    )
                    latency_wo_load = instruction_data_reg["latency"]
                    # add latency of ADD if post- or pre-indexed load
                    # TODO more investigation: check dot-graph, wrong latency distribution!
                    # if (
                    #     latency_wo_load == 0
                    #     and self._isa == 'aarch64'
                    #     and any(
                    #         [
                    #             'post_indexed' in op['memory'] or
                    #             'pre_indexed' in op['memory']
                    #             for op in instruction_form['operands']
                    #             if 'memory' in op
                    #         ]
                    #     )
                    # ):
                    #     latency_wo_load = 1.0
                    # total pressure = data (load/store) part + register-only variant
                    instruction_form["port_pressure"] = [
                        sum(x)
                        for x in zip(
                            data_port_pressure,
                            self._machine_model.average_port_pressure(
                                instruction_data_reg["port_pressure"]
                            ),
                        )
                    ]
                    instruction_form["port_uops"] = list(
                        chain(instruction_data_reg["port_pressure"], data_port_uops)
                    )
            if assign_unknown:
                # --> mark as unknown and assume 0 cy for latency/throughput
                throughput = 0.0
                latency = 0.0
                latency_wo_load = latency
                instruction_form["port_pressure"] = [0.0 for i in range(port_number)]
                instruction_form["port_uops"] = []
                flags += [INSTR_FLAGS.TP_UNKWN, INSTR_FLAGS.LT_UNKWN]
    # flatten flag list
    flags = list(set(flags))
    if "flags" not in instruction_form:
        instruction_form["flags"] = flags
    else:
        instruction_form["flags"] += flags
    instruction_form["throughput"] = throughput
    instruction_form["latency"] = latency
    instruction_form["latency_wo_load"] = latency_wo_load
    # for later CP and loop-carried dependency analysis
    instruction_form["latency_cp"] = 0
    instruction_form["latency_lcd"] = 0
def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags):
    """
    Apply performance data to instruction if it was found in the archDB.

    Sets ``port_pressure`` and ``port_uops`` on the instruction form and appends semantic
    flags to ``flags`` in place. If the averaged port pressure is not a list with one entry
    per port, a warning is issued, the pressure is zeroed, the uops are cleared and the
    throughput is flagged as unknown. Missing (``None``) throughput or latency values are
    replaced by ``0.0`` and flagged as unknown.

    The port pressure is validated with an explicit check rather than ``assert`` so that
    it still takes effect when Python runs with assertions disabled (``-O``).

    :param instruction_data: database entry of the instruction
    :param int port_number: number of ports of the machine model
    :param instruction_form: instruction form to update in place
    :param list flags: flag list to extend in place
    :returns: tuple ``(throughput, port_pressure, latency, latency_wo_load)``; the port
        pressure is the list stored on the instruction form
    """
    throughput = instruction_data["throughput"]
    port_pressure = self._machine_model.average_port_pressure(
        instruction_data["port_pressure"]
    )
    instruction_form["port_uops"] = instruction_data["port_pressure"]
    if isinstance(port_pressure, list) and len(port_pressure) == port_number:
        instruction_form["port_pressure"] = port_pressure
        if sum(port_pressure) == 0 and throughput is not None:
            # port pressure on all ports 0 --> not bound to a port
            flags.append(INSTR_FLAGS.NOT_BOUND)
    else:
        warnings.warn(
            "Port pressure could not be imported correctly from database. "
            + "Please check entry for:\n {}".format(instruction_form)
        )
        # fall back to zero pressure and return the same list that is stored on the form
        port_pressure = [0.0 for _ in range(port_number)]
        instruction_form["port_pressure"] = port_pressure
        instruction_form["port_uops"] = []
        flags.append(INSTR_FLAGS.TP_UNKWN)
    if throughput is None:
        # assume 0 cy and mark as unknown
        throughput = 0.0
        flags.append(INSTR_FLAGS.TP_UNKWN)
    latency = instruction_data["latency"]
    latency_wo_load = latency
    if latency is None:
        # assume 0 cy and mark as unknown
        latency = 0.0
        latency_wo_load = latency
        flags.append(INSTR_FLAGS.LT_UNKWN)
    if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
        flags.append(INSTR_FLAGS.LD)
    return throughput, port_pressure, latency, latency_wo_load
def convert_op_to_reg(self, reg_type, reg_id="0"):
    """
    Create register operand for a memory addressing operand.

    On x86, a ``gpr`` register is named ``r<reg_id + 9>``; any other type is named by
    concatenating type and id. On aarch64, type and id are stored as ``prefix`` and
    ``name``.

    :param str reg_type: register type (x86) or register prefix (aarch64)
    :param reg_id: register id, defaults to ``"0"``
    :returns: operand dict with a single ``register`` entry
    :raises ValueError: if the ISA of this object is neither ``x86`` nor ``aarch64``
    """
    if self._isa == "x86":
        if reg_type == "gpr":
            return {"register": {"name": "r" + str(int(reg_id) + 9)}}
        # str() keeps the concatenation working for integer ids as well
        return {"register": {"name": reg_type + str(reg_id)}}
    if self._isa == "aarch64":
        return {"register": {"prefix": reg_type, "name": reg_id}}
    # fail with a clear message instead of an unbound local variable
    raise ValueError("Unsupported ISA for register conversion: {!r}".format(self._isa))
def _nullify_data_ports(self, port_pressure):
"""Set all ports to 0.0 for the ports of a machine model"""
data_ports = self._machine_model.get_data_ports()
for port in data_ports:
index = self._machine_model.get_ports().index(port)
port_pressure[index] = 0.0
return port_pressure
def _itemsetter(self, *items):
if len(items) == 1:
item = items[0]
def g(obj, value):
obj[item] = value
else:
def g(obj, *values):
for item, value in zip(items, values):
obj[item] = value
return g
def _to_list(self, obj):
if isinstance(obj, tuple):
return list(obj)
else:
return [obj]
@staticmethod
def get_throughput_sum(kernel):
"""Get the overall throughput sum separated by port of all instructions of a kernel."""
# ignoring all lines with throughput == 0.0, because there won't be anything to sum up
# typically comment, label and non-instruction lines
port_pressures = [instr["port_pressure"] for instr in kernel if instr["throughput"] != 0.0]
# Essentially summing up each columns of port_pressures, where each column is one port
# and each row is one line of the kernel
# round is necessary to ensure termination of ArchsSemantics.assign_optimal_throughput
tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
return tp_sum