This commit is contained in:
Andreas Abel
2023-03-12 13:48:05 +01:00
parent 6fa0df0469
commit faf75236ca
9 changed files with 111 additions and 38 deletions

View File

@@ -149,6 +149,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
| `-median` | Selects the median as the aggregate function. |
| `-min` | Selects the minimum as the aggregate function. |
| `-max` | Selects the maximum as the aggregate function. |
| `-range` | Outputs the range of the measured values (i.e., the minimum and the maximum). |
| `-basic_mode` | The effect of this option is described in the [Generated Code](#generated-code) section. |
| `-no_mem` | If this option is enabled, the code for `read_perf_ctrs` does not make any memory accesses and stores all performance counter values in registers. This can, for example, be useful for benchmarks that require that the state of the data caches does not change after the execution of `code_init`. *If this option is used, the code to be benchmarked must not modify registers* ***R8-R11 (Intel)*** *and* ***R8-R13 (AMD).*** *Furthermore, `read_perf_ctrs` will modify* ***RAX, RCX, and RDX***. |
| `-no_normalization` | If this option is enabled, the measurement results are not divided by the number of repetitions. |

View File

@@ -23,6 +23,7 @@ int no_normalization = NO_NORMALIZATION_DEFAULT;
int basic_mode = BASIC_MODE_DEFAULT;
int use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT;
int aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
int output_range = OUTPUT_RANGE_DEFAULT;
int verbose = VERBOSE_DEFAULT;
int debug = DEBUG_DEFAULT;
@@ -786,24 +787,37 @@ void run_experiment(char* measurement_template, int64_t* results[], int n_counte
}
// Formats the result line for one counter into buf (at most buf_len bytes) and returns buf.
// The reported value is the aggregate of measurement_results[counter] minus the aggregate of
// measurement_results_base[counter]; if output_range is set, a " [low;high]" suffix is appended.
// desc is the label that is printed in front of the value.
char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter) {
// NOTE(review): this listing looks like a diff rendering whose +/- markers were lost. The three-argument
// get_aggregate_value calls, the snprintf calls without range_buf, and the inline n_rep computation further
// below appear to be the superseded lines, each followed by its replacement -- confirm against the real file.
int64_t agg = get_aggregate_value(measurement_results[counter], n_measurements, no_normalization?1:100);
int64_t agg_base = get_aggregate_value(measurement_results_base[counter], n_measurements, no_normalization?1:100);
// Unless no_normalization is set, the aggregates are scaled by 100 so that two decimal places survive the integer division in normalize.
int64_t agg = get_aggregate_value(measurement_results[counter], n_measurements, no_normalization?1:100, aggregate_function);
int64_t agg_base = get_aggregate_value(measurement_results_base[counter], n_measurements, no_normalization?1:100, aggregate_function);
// Suffix with the range; stays empty unless output_range is set.
char range_buf[50] = "";
if (output_range) {
int64_t min = get_aggregate_value(measurement_results[counter], n_measurements, no_normalization?1:100, MIN);
int64_t min_base = get_aggregate_value(measurement_results_base[counter], n_measurements, no_normalization?1:100, MIN);
int64_t max = get_aggregate_value(measurement_results[counter], n_measurements, no_normalization?1:100, MAX);
int64_t max_base = get_aggregate_value(measurement_results_base[counter], n_measurements, no_normalization?1:100, MAX);
// The lower bound pairs the smallest measurement with the largest base value, the upper bound the largest measurement with the smallest base value.
if (no_normalization) {
snprintf(range_buf, sizeof(range_buf), " [%lld;%lld]", (long long)(min-max_base), (long long)(max-min_base));
} else {
int64_t min_n = normalize(min-max_base);
int64_t max_n = normalize(max-min_base);
// The values are in hundredths; the sign is printed separately so that values between -1 and 0 keep their minus sign.
snprintf(range_buf, sizeof(range_buf), " [%s%lld.%.2lld;%s%lld.%.2lld]", (min_n<0?"-":""), ll_abs(min_n/100), ll_abs(min_n)%100,
(max_n<0?"-":""), ll_abs(max_n/100), ll_abs(max_n)%100);
}
}
if (no_normalization) {
snprintf(buf, buf_len, "%s: %lld\n", desc, (long long)(agg-agg_base));
snprintf(buf, buf_len, "%s: %lld%s\n", desc, (long long)(agg-agg_base), range_buf);
} else {
int64_t n_rep = loop_count * unroll_count;
if (loop_count == 0) {
n_rep = unroll_count;
}
int64_t result = ((agg-agg_base) + n_rep/2)/n_rep;
snprintf(buf, buf_len, "%s: %s%lld.%.2lld\n", desc, (result<0?"-":""), ll_abs(result/100), ll_abs(result)%100);
// result is in hundredths; it is printed as "<integer part>.<two decimal places>" with a separately emitted sign.
int64_t result = normalize(agg-agg_base);
snprintf(buf, buf_len, "%s: %s%lld.%.2lld%s\n", desc, (result<0?"-":""), ll_abs(result/100), ll_abs(result)%100, range_buf);
}
return buf;
}
int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale) {
if (aggregate_function == MIN) {
int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale, int agg_func) {
if (agg_func == MIN) {
int64_t min = values[0];
for (int i=0; i<length; i++) {
if (values[i] < min) {
@@ -811,7 +825,7 @@ int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale) {
}
}
return min * scale;
} else if (aggregate_function == MAX) {
} else if (agg_func == MAX) {
int64_t max = values[0];
for (int i=0; i<length; i++) {
if (values[i] > max) {
@@ -822,7 +836,7 @@ int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale) {
} else {
qsort(values, length, sizeof(int64_t), cmpInt64);
if (aggregate_function == AVG_20_80) {
if (agg_func == AVG_20_80) {
// computes the average of the values between the 20 and 80 percentile
int64_t sum = 0;
int count = 0;
@@ -836,6 +850,14 @@ int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale) {
}
}
// Divides value by the number of repetitions (loop_count * unroll_count, or just unroll_count if loop_count
// is 0) and rounds the quotient to the nearest integer (ties are rounded away from zero).
// Integer division truncates toward zero. Therefore, half of the divisor is added for non-negative values,
// but subtracted for negative values; adding it unconditionally would round negative values toward zero
// instead of to the nearest integer (e.g., -7/4 would yield -1 instead of -2).
// Assumes that the number of repetitions is positive.
int64_t normalize(int64_t value) {
    int64_t n_rep = loop_count * unroll_count;
    if (loop_count == 0) {
        n_rep = unroll_count;
    }
    int64_t half = n_rep / 2;
    if (value < 0) {
        return (value - half) / n_rep;
    }
    return (value + half) / n_rep;
}
// Comparison callback for qsort on arrays of int64_t (ascending order).
// Returns a negative value, zero, or a positive value if *a is less than, equal to, or greater than *b.
// The two values are compared directly instead of being subtracted: an int64_t difference does not fit
// into the int return type, so it could be truncated to 0 or to a value with the wrong sign, and the
// subtraction itself could overflow.
int cmpInt64(const void *a, const void *b) {
    const int64_t lhs = *(const int64_t*)a;
    const int64_t rhs = *(const int64_t*)b;
    return (lhs > rhs) - (lhs < rhs);
}

View File

@@ -138,6 +138,10 @@ enum agg_enum {AVG_20_80, MIN, MAX, MED};
extern int aggregate_function;
#define AGGREGATE_FUNCTION_DEFAULT AVG_20_80;
// If enabled, the range of the measured values (i.e., the minimum and the maximum) is included in the output.
extern int output_range;
#define OUTPUT_RANGE_DEFAULT false;
extern int verbose;
#define VERBOSE_DEFAULT false;
@@ -271,7 +275,8 @@ void run_experiment(char* measurement_template, int64_t* results[], int n_counte
void create_and_run_one_time_init_code(void);
char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter);
int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale);
int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale, int agg_func);
int64_t normalize(int64_t value);
int cmpInt64(const void *a, const void *b);
long long ll_abs(long long val);

View File

@@ -47,6 +47,7 @@ def main():
parser.add_argument('-median', action='store_const', const='med', help='Selects the median as the aggregate function.')
parser.add_argument('-min', action='store_const', const='min', help='Selects the minimum as the aggregate function.')
parser.add_argument('-max', action='store_const', const='max', help='Selects the maximum as the aggregate function.')
parser.add_argument('-range', action='store_true', help='Outputs the range of the measured values (i.e., the minimum and the maximum).')
parser.add_argument('-no_mem', action='store_true', help='The code for reading the perf. ctrs. does not make memory accesses.')
parser.add_argument('-remove_empty_events', action='store_true', help='Removes events from the output that did not occur.')
parser.add_argument('-verbose', action='store_true', help='Outputs the results of all performance counter readings.')
@@ -65,6 +66,7 @@ def main():
initialWarmUpCount=args.initial_warm_up_count,
alignmentOffset=args.alignment_offset,
aggregateFunction=(args.avg or args.median or args.min or args.max or 'med'),
range=args.range,
noMem=args.no_mem,
verbose=args.verbose,
endToEnd=args.end_to_end)
@@ -87,11 +89,14 @@ def main():
if args.remove_empty_events:
for k in list(nbDict.keys()):
if max(nbDict[k]) == 0:
if max(nbDict[k][0]) == 0:
del nbDict[k]
if args.csv is not None:
csvString = '\n'.join(k + ',' + ','.join(map(str, v)) for k, v in nbDict.items())
if args.range:
csvString = '\n'.join(k + ',' + ','.join(map(str, sum(zip(v, vMin, vMax), ()))) for k, (v, vMin, vMax) in nbDict.items())
else:
csvString = '\n'.join(k + ',' + ','.join(map(str, v)) for k, (v, _, _) in nbDict.items())
if args.csv:
with open(args.csv, 'w') as f:
f.write(csvString + '\n')
@@ -106,8 +111,13 @@ def main():
fig = go.Figure()
fig.update_xaxes(title_text='Cycle')
for name, values in nbDict.items():
fig.add_trace(go.Scatter(y=values, mode='lines+markers', line_shape='linear', name=name, marker_size=5, hoverlabel = dict(namelength = -1)))
for name, (values, minValues, maxValues) in nbDict.items():
e = None
if args.range:
array = [(m-v) for (v, m) in zip(values, maxValues)]
arrayminus = [(v-m) for (v, m) in zip(values, minValues)]
e = dict(type='data', symmetric=False, array=array, arrayminus=arrayminus)
fig.add_trace(go.Scatter(y=values, error_y=e, mode='lines+markers', line_shape='linear', name=name, marker_size=5, hoverlabel = dict(namelength = -1)))
config = {'displayModeBar': True,
'modeBarButtonsToRemove': ['autoScale2d', 'select2d', 'lasso2d'],
@@ -123,7 +133,6 @@ def main():
htmlFilename = args.html or 'graph.html'
writeHtmlFile(htmlFilename, 'Graph', '', body, includeDOCTYPE=False) # if DOCTYPE is included, scaling doesn't work properly
os.chown(htmlFilename, int(os.environ['SUDO_UID']), int(os.environ['SUDO_GID']))
print('Output written to ' + htmlFilename)

View File

@@ -113,7 +113,10 @@ while [ "$1" ]; do
elif [[ "$1" == -avg* ]]; then
echo "avg" > /sys/nb/agg
shift
elif [[ "$1" == -r* ]]; then
elif [[ "$1" == -ra* ]]; then
echo "1" > /sys/nb/output_range
shift
elif [[ "$1" == -re* ]]; then
filter_output="grep -v 0.00"
shift
elif [[ "$1" == -h* ]]; then
@@ -140,6 +143,7 @@ while [ "$1" ]; do
echo " -median: Selects the median as the aggregate function."
echo " -min: Selects the minimum as the aggregate function."
echo " -max: Selects the maximum as the aggregate function."
echo " -range: Outputs the range of the measured values (i.e., the minimum and the maximum)."
echo " -basic_mode: Enables basic mode."
echo " -no_mem: The code for reading the perf. ctrs. does not make memory accesses."
echo " -no_normalization: The measurement results are not divided by the number of repetitions."

View File

@@ -348,6 +348,15 @@ static ssize_t no_normalization_store(struct kobject *kobj, struct kobj_attribut
}
static struct kobj_attribute no_normalization_attribute =__ATTR(no_normalization, 0660, no_normalization_show, no_normalization_store);
// Show handler of the output_range attribute: writes the current value of output_range as a decimal
// number followed by a newline to buf, and returns the number of characters reported by sprintf.
static ssize_t output_range_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    ssize_t n_written = sprintf(buf, "%d\n", output_range);
    return n_written;
}
// Store handler of the output_range attribute: parses a decimal integer from buf into output_range.
// The result of sscanf is not checked; count is returned in any case, i.e., the whole input is
// reported as consumed even if it could not be parsed.
static ssize_t output_range_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    ssize_t n_consumed = count;
    sscanf(buf, "%d", &output_range);
    return n_consumed;
}
// Attribute "output_range" (mode 0660); reads are handled by output_range_show, writes by output_range_store.
static struct kobj_attribute output_range_attribute =__ATTR(output_range, 0660, output_range_show, output_range_store);
// Show handler for the aggregate function setting: writes the numeric value of aggregate_function
// as a decimal number followed by a newline to buf, and returns the number of characters reported by sprintf.
static ssize_t agg_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    ssize_t n_written = sprintf(buf, "%d\n", aggregate_function);
    return n_written;
}
@@ -495,6 +504,7 @@ static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, cha
basic_mode = BASIC_MODE_DEFAULT;
use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT;
aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
output_range = OUTPUT_RANGE_DEFAULT;
verbose = VERBOSE_DEFAULT;
alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
drain_frontend = DRAIN_FRONTEND_DEFAULT;
@@ -798,14 +808,14 @@ static uint64_t get_max_programmable_ctr_value(void) {
// Runs the experiment via run_experiment_with_freeze_on_PMI and returns the aggregate (scale 1, i.e., unscaled)
// of the measurements stored in measurement_results[FIXED_CTR_CORE_CYCLES]; the value is also passed to print_verbose.
// NOTE(review): this listing looks like a diff rendering whose +/- markers were lost; the three-argument
// get_aggregate_value call appears to be the superseded line and the four-argument call its replacement -- confirm against the real file.
static uint64_t get_end_to_end_cycles(void) {
run_experiment_with_freeze_on_PMI(measurement_results, 0, 0, 0);
uint64_t cycles = get_aggregate_value(measurement_results[FIXED_CTR_CORE_CYCLES], n_measurements, 1);
uint64_t cycles = get_aggregate_value(measurement_results[FIXED_CTR_CORE_CYCLES], n_measurements, 1, aggregate_function);
print_verbose("End-to-end cycles: %llu\n", cycles);
return cycles;
}
// Runs the experiment via run_experiment_with_freeze_on_PMI and returns the aggregate (scale 1, i.e., unscaled)
// of the measurements stored in measurement_results[FIXED_CTR_INST_RETIRED]; the value is also passed to print_verbose.
// NOTE(review): this listing looks like a diff rendering whose +/- markers were lost; the three-argument
// get_aggregate_value call appears to be the superseded line and the four-argument call its replacement -- confirm against the real file.
static uint64_t get_end_to_end_retired(void) {
run_experiment_with_freeze_on_PMI(measurement_results, 0, 0, 0);
uint64_t retired = get_aggregate_value(measurement_results[FIXED_CTR_INST_RETIRED], n_measurements, 1);
uint64_t retired = get_aggregate_value(measurement_results[FIXED_CTR_INST_RETIRED], n_measurements, 1, aggregate_function);
print_verbose("End-to-end retired instructions: %llu\n", retired);
return retired;
}
@@ -820,7 +830,7 @@ static uint64_t get_cycle_last_retired(bool include_lfence) {
uint64_t last_applicable_instr = get_end_to_end_retired() - 258 + include_lfence;
run_experiment_with_freeze_on_PMI(measurement_results, 0, 3 + 2, get_max_programmable_ctr_value() - last_applicable_instr);
uint64_t time_to_last_retired = get_aggregate_value(measurement_results[1], n_measurements, 1);
uint64_t time_to_last_retired = get_aggregate_value(measurement_results[1], n_measurements, 1, aggregate_function);
// The counters freeze a few cycles after an overflow happens; additionally the programmable and fixed counters do not freeze (or do not start) at exactly
// the same time. In the following, we search for the value that we have to write to the fixed counter such that the programmable counters stop immediately
@@ -828,7 +838,7 @@ static uint64_t get_cycle_last_retired(bool include_lfence) {
uint64_t cycle_last_retired = 0;
for (int64_t cycle=time_to_last_retired; cycle>=0; cycle--) {
run_experiment_with_freeze_on_PMI(measurement_results, 3, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
if (get_aggregate_value(measurement_results[2], n_measurements, 1) < last_applicable_instr) {
if (get_aggregate_value(measurement_results[2], n_measurements, 1, aggregate_function) < last_applicable_instr) {
cycle_last_retired = cycle+1;
break;
}
@@ -847,7 +857,7 @@ static uint64_t get_cycle_first_added_to_IDQ(uint64_t cycle_last_retired_empty)
uint64_t prev_uops = 0;
for (int64_t cycle=cycle_last_retired_empty-3; cycle>=0; cycle--) {
run_experiment_with_freeze_on_PMI(measurement_results, 3, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
uint64_t uops = get_aggregate_value(measurement_results[2], n_measurements, 1);
uint64_t uops = get_aggregate_value(measurement_results[2], n_measurements, 1, aggregate_function);
if ((prev_uops != 0) && (prev_uops - uops > 1)) {
cycle_first_added_to_IDQ = cycle + 1;
@@ -861,7 +871,7 @@ static uint64_t get_cycle_first_added_to_IDQ(uint64_t cycle_last_retired_empty)
// Programs the fixed cycle counter such that it overflows in the specified cycle, runs the benchmark,
// and stores the measurements of the programmable counters in results.
static void perform_measurements_for_cycle(uint64_t cycle, uint64_t* results) {
static void perform_measurements_for_cycle(uint64_t cycle, uint64_t* results, uint64_t* results_min, uint64_t* results_max) {
// on several microarchitectures, the counters 0 or 1 do not freeze at the same time as the other counters
int avoid_counters = 0;
if (displ_model == 0x97) { // Alder Lake
@@ -883,7 +893,9 @@ static void perform_measurements_for_cycle(uint64_t cycle, uint64_t* results) {
for (size_t c=0; c<n_used_counters; c++) {
if (pfc_descriptions[c]) {
results[cur_pfc_config] = get_aggregate_value(measurement_results[c], n_measurements, 1);
results[cur_pfc_config] = get_aggregate_value(measurement_results[c], n_measurements, 1, aggregate_function);
if (results_min) results_min[cur_pfc_config] = get_aggregate_value(measurement_results[c], n_measurements, 1, MIN);
if (results_max) results_max[cur_pfc_config] = get_aggregate_value(measurement_results[c], n_measurements, 1, MAX);
cur_pfc_config++;
}
}
@@ -921,12 +933,12 @@ static int run_nanoBench_cycle_by_cycle(struct seq_file *m, void *v) {
uint64_t cycle_last_retired_empty = get_cycle_last_retired(false);
uint64_t* results_empty = vmalloc(sizeof(uint64_t[n_pfc_configs]));
perform_measurements_for_cycle(cycle_last_retired_empty, results_empty);
perform_measurements_for_cycle(cycle_last_retired_empty, results_empty, NULL, NULL);
uint64_t cycle_last_retired_empty_with_lfence = get_cycle_last_retired(true);
uint64_t* results_empty_with_lfence = vmalloc(sizeof(uint64_t[n_pfc_configs]));
perform_measurements_for_cycle(cycle_last_retired_empty_with_lfence, results_empty_with_lfence);
perform_measurements_for_cycle(cycle_last_retired_empty_with_lfence, results_empty_with_lfence, NULL, NULL);
uint64_t first_cycle = 0;
uint64_t last_cycle = 0;
@@ -946,9 +958,11 @@ static int run_nanoBench_cycle_by_cycle(struct seq_file *m, void *v) {
}
uint64_t (*results)[n_pfc_configs] = vmalloc(sizeof(uint64_t[last_cycle+1][n_pfc_configs]));
uint64_t (*results_min)[n_pfc_configs] = output_range?vmalloc(sizeof(uint64_t[last_cycle+1][n_pfc_configs])):NULL;
uint64_t (*results_max)[n_pfc_configs] = output_range?vmalloc(sizeof(uint64_t[last_cycle+1][n_pfc_configs])):NULL;
for (uint64_t cycle=first_cycle; cycle<=last_cycle; cycle++) {
perform_measurements_for_cycle(cycle, results[cycle]);
perform_measurements_for_cycle(cycle, results[cycle], output_range?results_min[cycle]:NULL, output_range?results_max[cycle]:NULL);
}
disable_perf_ctrs_globally();
@@ -965,6 +979,7 @@ static int run_nanoBench_cycle_by_cycle(struct seq_file *m, void *v) {
seq_printf(m, ",%lld", results_empty_with_lfence[i]);
for (long cycle=first_cycle; cycle<=last_cycle; cycle++) {
seq_printf(m, ",%lld", results[cycle][i]);
if (output_range) seq_printf(m, ",%lld,%lld", results_min[cycle][i], results_max[cycle][i]);
}
seq_printf(m, "\n");
}
@@ -1094,6 +1109,7 @@ static int __init nb_init(void) {
error |= sysfs_create_file(nb_kobject, &end_to_end_attribute.attr);
error |= sysfs_create_file(nb_kobject, &drain_frontend_attribute.attr);
error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
error |= sysfs_create_file(nb_kobject, &output_range_attribute.attr);
error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
error |= sysfs_create_file(nb_kobject, &no_normalization_attribute.attr);

View File

@@ -95,7 +95,7 @@ paramDict = dict()
# Otherwise, reset() needs to be called first.
def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, fixedCounters=None, nMeasurements=None, unrollCount=None,
loopCount=None, warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None,
aggregateFunction=None, basicMode=None, noMem=None, noNormalization=None, verbose=None, endToEnd=None):
aggregateFunction=None, range=None, basicMode=None, noMem=None, noNormalization=None, verbose=None, endToEnd=None):
if config is not None:
if paramDict.get('config', None) != config:
configFile = '/tmp/ramdisk/config'
@@ -162,6 +162,11 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
writeFile('/sys/nb/agg', aggregateFunction)
paramDict['aggregateFunction'] = aggregateFunction
if range is not None:
if paramDict.get('range', None) != range:
writeFile('/sys/nb/output_range', str(int(range)))
paramDict['range'] = range
if basicMode is not None:
if paramDict.get('basicMode', None) != basicMode:
writeFile('/sys/nb/basic_mode', str(int(basicMode)))
@@ -254,7 +259,10 @@ def runNanoBench(code='', codeObjFile=None, codeBinFile=None,
if not ':' in line: continue
lineSplit = line.split(':')
counter = lineSplit[0].strip()
value = float(lineSplit[1].strip())
if paramDict.get('range'):
value = tuple(map(float, re.match(r' (.*) \[(.*);(.*)\]', lineSplit[1]).groups()))
else:
value = float(lineSplit[1].strip())
ret[counter] = value
return ret
@@ -286,14 +294,21 @@ def runNanoBenchCycleByCycle(code='', codeObjFile=None, codeBinFile=None,
counter = lineSplit[0].strip()
valueEmpty = int(lineSplit[1])
valueEmptyWithLfence = int(lineSplit[2])
values = [int(v) for v in lineSplit[3:]]
nbDict[counter] = (valueEmpty, valueEmptyWithLfence, values)
minValues = []
maxValues = []
if paramDict.get('range'):
values = list(map(int, lineSplit[3::3]))
minValues = list(map(int, lineSplit[4::3]))
maxValues = list(map(int, lineSplit[5::3]))
else:
values = list(map(int, lineSplit[3:]))
nbDict[counter] = (valueEmpty, valueEmptyWithLfence, values, minValues, maxValues)
if paramDict.get('verbose'):
print('\n'.join((k + ': ' + str(v)) for k, v in nbDict.items()))
if paramDict.get('endToEnd'):
return OrderedDict((k, v) for k, (_, _, v) in nbDict.items() if "_internal" not in k)
return OrderedDict((k, (v, vMin, vMax)) for k, (_, _, v, vMin, vMax) in nbDict.items() if "_internal" not in k)
else:
instRetired = nbDict['INST_RETIRED_internal'][2]
if len(instRetired) < 3:
@@ -308,7 +323,7 @@ def runNanoBenchCycleByCycle(code='', codeObjFile=None, codeBinFile=None,
return None
result = OrderedDict()
for k, (valueEmpty, valueEmptyWithLfence, values) in nbDict.items():
for k, (valueEmpty, valueEmptyWithLfence, values, minValues, maxValues) in nbDict.items():
if "_internal" in k: continue
leftMin = values[0]
@@ -325,7 +340,7 @@ def runNanoBenchCycleByCycle(code='', codeObjFile=None, codeBinFile=None,
elif 'IDQ' in k:
rightMax = values[cycleOfLfenceUop - 1]
result[k] = [max(0, min(v, rightMax) - leftMin) for v in values[:cycleLastInstrRetired + 1]]
result[k] = tuple([max(0, min(v, rightMax) - leftMin) for v in vx[:cycleLastInstrRetired + 1]] for vx in [values, minValues, maxValues])
return result

View File

@@ -43,7 +43,7 @@ while [ "$1" ]; do
debug="gdb -ex=run --args"
args="$args $1"
shift
elif [[ "$1" == -r* ]]; then
elif [[ "$1" == -re* ]]; then
filter_output="grep -v 0.00"
shift
else

View File

@@ -88,6 +88,7 @@ int main(int argc, char **argv) {
{"median", no_argument, &aggregate_function, MED},
{"min", no_argument, &aggregate_function, MIN},
{"max", no_argument, &aggregate_function, MAX},
{"range", no_argument, &output_range, true},
{"basic_mode", no_argument, &basic_mode, true},
{"no_mem", no_argument, &no_mem, true},
{"no_normalization", no_argument, &no_normalization, true},