ranges

2025-07-21 07:01:04 +02:00 · 2023-03-12 13:48:05 +01:00
parent 6fa0df0469
commit faf75236ca
9 changed files with 111 additions and 38 deletions
--- a/README.md
+++ b/README.md
@@ -149,6 +149,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
 | `-median`                    | Selects the median as the aggregate function. |
 | `-min`                       | Selects the minimum as the aggregate function. |
 | `-max`                       | Selects the maximum as the aggregate function. |
+| `-range`                     | Outputs the range of the measured values (i.e., the minimum and the maximum). |
 | `-basic_mode`                | The effect of this option is described in the [Generated Code](#generated-code) section. |
 | `-no_mem`                    | If this option is enabled, the code for `read_perf_ctrs` does not make any memory accesses and stores all performance counter values in registers. This can, for example, be useful for benchmarks that require that the state of the data caches does not change after the execution of `code_init`. *If this option is used, the code to be benchmarked must not modify registers* ***R8-R11 (Intel)*** *and* ***R8-R13 (AMD).*** *Furthermore, `read_perf_ctrs` will modify* ***RAX, RCX, and RDX***. |
 | `-no_normalization`          | If this option is enabled, the measurement results are not divided by the number of repetitions. |
--- a/common/nanoBench.c
+++ b/common/nanoBench.c
@@ -23,6 +23,7 @@ int no_normalization = NO_NORMALIZATION_DEFAULT;
 int basic_mode = BASIC_MODE_DEFAULT;
 int use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT;
 int aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
+int output_range = OUTPUT_RANGE_DEFAULT;
 int verbose = VERBOSE_DEFAULT;
 int debug = DEBUG_DEFAULT;

@@ -786,24 +787,37 @@ void run_experiment(char* measurement_template, int64_t* results[], int n_counte
 }

 char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter) {
-    int64_t agg = get_aggregate_value(measurement_results[counter], n_measurements, no_normalization?1:100);
-    int64_t agg_base = get_aggregate_value(measurement_results_base[counter], n_measurements, no_normalization?1:100);
+    int64_t agg = get_aggregate_value(measurement_results[counter], n_measurements, no_normalization?1:100, aggregate_function); // aggregate of measurement_results; scaled by 100 unless no_normalization is set
+    int64_t agg_base = get_aggregate_value(measurement_results_base[counter], n_measurements, no_normalization?1:100, aggregate_function); // same aggregate and scale for measurement_results_base
+
+    char range_buf[50] = ""; // stays "" when output_range is off, so the final "%s" then appends nothing
+    if (output_range) {
+        int64_t min = get_aggregate_value(measurement_results[counter], n_measurements, no_normalization?1:100, MIN); // extremes of both series, scaled like agg and agg_base
+        int64_t min_base =  get_aggregate_value(measurement_results_base[counter], n_measurements, no_normalization?1:100, MIN);
+        int64_t max = get_aggregate_value(measurement_results[counter], n_measurements, no_normalization?1:100, MAX);
+        int64_t max_base =  get_aggregate_value(measurement_results_base[counter], n_measurements, no_normalization?1:100, MAX);
+
+        if (no_normalization) {
+            snprintf(range_buf, sizeof(range_buf), " [%lld;%lld]", (long long)(min-max_base), (long long)(max-min_base)); // lower bound: smallest value minus largest base; upper bound: largest value minus smallest base
+        } else { // normalized values are in hundredths: printed as integer part '.' two digits, with the sign emitted separately
+            int64_t min_n = normalize(min-max_base);
+            int64_t max_n = normalize(max-min_base);
+            snprintf(range_buf, sizeof(range_buf), " [%s%lld.%.2lld;%s%lld.%.2lld]", (min_n<0?"-":""), ll_abs(min_n/100), ll_abs(min_n)%100,
+                                                                                 (max_n<0?"-":""), ll_abs(max_n/100), ll_abs(max_n)%100);
+        }
+    }

    if (no_normalization) {
-        snprintf(buf, buf_len, "%s: %lld\n", desc, (long long)(agg-agg_base));
+        snprintf(buf, buf_len, "%s: %lld%s\n", desc, (long long)(agg-agg_base), range_buf); // raw difference, followed by the optional range
    } else {
-        int64_t n_rep = loop_count * unroll_count;
-        if (loop_count == 0) {
-            n_rep = unroll_count;
-        }
-        int64_t result = ((agg-agg_base) + n_rep/2)/n_rep;
-        snprintf(buf, buf_len, "%s: %s%lld.%.2lld\n", desc, (result<0?"-":""), ll_abs(result/100), ll_abs(result)%100);
+        int64_t result = normalize(agg-agg_base); // difference divided by the repetition count (see normalize)
+        snprintf(buf, buf_len, "%s: %s%lld.%.2lld%s\n", desc, (result<0?"-":""), ll_abs(result/100), ll_abs(result)%100, range_buf);
    }
    return buf;
 }

-int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale) {
-    if (aggregate_function == MIN) {
+int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale, int agg_func) {
+    if (agg_func == MIN) {
        int64_t min = values[0];
        for (int i=0; i<length; i++) {
            if (values[i] < min) {
@@ -811,7 +825,7 @@ int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale) {
            }
        }
        return min * scale;
-    } else if (aggregate_function == MAX)  {
+    } else if (agg_func == MAX)  {
        int64_t max = values[0];
        for (int i=0; i<length; i++) {
            if (values[i] > max) {
@@ -822,7 +836,7 @@ int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale) {
    } else {
        qsort(values, length, sizeof(int64_t), cmpInt64);

-        if (aggregate_function == AVG_20_80) {
+        if (agg_func == AVG_20_80) {
            // computes the average of the values between the 20 and 80 percentile
            int64_t sum = 0;
            int count = 0;
@@ -836,6 +850,14 @@ int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale) {
    }
 }

+int64_t normalize(int64_t value) { // divides value by the number of repetitions, rounding to the nearest integer (halves away from zero)
+    int64_t n_rep = loop_count * unroll_count; // NOTE(review): assumes the repetition count is positive (unroll_count > 0) - confirm
+    if (loop_count == 0) { // with a loop count of 0 only the unroll count determines the repetitions
+        n_rep = unroll_count;
+    }
+    return (value + (value<0 ? -(n_rep/2) : n_rep/2))/n_rep; // integer division truncates toward zero, so the rounding offset must carry the sign of value
+}
+
 int cmpInt64(const void *a, const void *b) { // three-way comparison of two int64_t values, in the form qsort expects
    return (*(const int64_t*)a > *(const int64_t*)b) - (*(const int64_t*)a < *(const int64_t*)b); // not a-b: that difference can exceed the range of int (or overflow int64_t) and then reports the wrong order
 }
--- a/common/nanoBench.h
+++ b/common/nanoBench.h
@@ -138,6 +138,10 @@ enum agg_enum {AVG_20_80, MIN, MAX, MED};
 extern int aggregate_function;
 #define AGGREGATE_FUNCTION_DEFAULT AVG_20_80;

+// If enabled, the range of the measured values (i.e., the minimum and the maximum) is included in the output.
+extern int output_range;
+#define OUTPUT_RANGE_DEFAULT false;
+
 extern int verbose;
 #define VERBOSE_DEFAULT false;

@@ -271,7 +275,8 @@ void run_experiment(char* measurement_template, int64_t* results[], int n_counte
 void create_and_run_one_time_init_code(void);

 char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter);
-int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale);
+int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale, int agg_func);
+int64_t normalize(int64_t value);
 int cmpInt64(const void *a, const void *b);
 long long ll_abs(long long val);

--- a/cycleByCycle.py
+++ b/cycleByCycle.py
@@ -47,6 +47,7 @@ def main():
   parser.add_argument('-median', action='store_const', const='med', help='Selects the median as the aggregate function.')
   parser.add_argument('-min', action='store_const', const='min', help='Selects the minimum as the aggregate function.')
   parser.add_argument('-max', action='store_const', const='max', help='Selects the maximum as the aggregate function.')
+   parser.add_argument('-range', action='store_true', help='Outputs the range of the measured values (i.e., the minimum and the maximum).')
   parser.add_argument('-no_mem', action='store_true', help='The code for reading the perf. ctrs. does not make memory accesses.')
   parser.add_argument('-remove_empty_events', action='store_true', help='Removes events from the output that did not occur.')
   parser.add_argument('-verbose', action='store_true', help='Outputs the results of all performance counter readings.')
@@ -65,6 +66,7 @@ def main():
                          initialWarmUpCount=args.initial_warm_up_count,
                          alignmentOffset=args.alignment_offset,
                          aggregateFunction=(args.avg or args.median or args.min or args.max or 'med'),
+                          range=args.range,
                          noMem=args.no_mem,
                          verbose=args.verbose,
                          endToEnd=args.end_to_end)
@@ -87,11 +89,14 @@ def main():

   if args.remove_empty_events:
      for k in list(nbDict.keys()):
-         if max(nbDict[k]) == 0:
+         if max(nbDict[k][0]) == 0:
            del nbDict[k]

   if args.csv is not None:
-      csvString = '\n'.join(k + ',' + ','.join(map(str, v)) for k, v in nbDict.items())
+      if args.range:
+         csvString = '\n'.join(k + ',' + ','.join(map(str, sum(zip(v, vMin, vMax), ()))) for k, (v, vMin, vMax) in nbDict.items())
+      else:
+         csvString = '\n'.join(k + ',' + ','.join(map(str, v)) for k, (v, _, _) in nbDict.items())
      if args.csv:
         with open(args.csv, 'w') as f:
            f.write(csvString + '\n')
@@ -106,8 +111,13 @@ def main():
      fig = go.Figure()
      fig.update_xaxes(title_text='Cycle')

-      for name, values in nbDict.items():
-         fig.add_trace(go.Scatter(y=values, mode='lines+markers', line_shape='linear', name=name, marker_size=5, hoverlabel = dict(namelength = -1)))
+      for name, (values, minValues, maxValues) in nbDict.items():
+         e = None
+         if args.range:
+            array = [(m-v) for (v, m) in zip(values, maxValues)]
+            arrayminus = [(v-m) for (v, m) in zip(values, minValues)]
+            e = dict(type='data', symmetric=False, array=array, arrayminus=arrayminus)
+         fig.add_trace(go.Scatter(y=values, error_y=e, mode='lines+markers', line_shape='linear', name=name, marker_size=5, hoverlabel = dict(namelength = -1)))

      config = {'displayModeBar': True,
                'modeBarButtonsToRemove': ['autoScale2d', 'select2d', 'lasso2d'],
@@ -123,7 +133,6 @@ def main():

      htmlFilename = args.html or 'graph.html'
      writeHtmlFile(htmlFilename, 'Graph', '', body, includeDOCTYPE=False) # if DOCTYPE is included, scaling doesn't work properly
-      os.chown(htmlFilename, int(os.environ['SUDO_UID']), int(os.environ['SUDO_GID']))
      print('Output written to ' + htmlFilename)


--- a/kernel-nanoBench.sh
+++ b/kernel-nanoBench.sh
@@ -113,7 +113,10 @@ while [ "$1" ]; do
    elif [[ "$1" == -avg* ]]; then
        echo "avg" > /sys/nb/agg
        shift
-    elif [[ "$1" == -r* ]]; then
+    elif [[ "$1" == -ra* ]]; then
+        echo "1" > /sys/nb/output_range
+        shift
+    elif [[ "$1" == -re* ]]; then
        filter_output="grep -v 0.00"
        shift
    elif [[ "$1" == -h* ]]; then
@@ -140,6 +143,7 @@ while [ "$1" ]; do
        echo "  -median:                        Selects the median as the aggregate function."
        echo "  -min:                           Selects the minimum as the aggregate function."
        echo "  -max:                           Selects the maximum as the aggregate function."
+        echo "  -range:                         Outputs the range of the measured values (i.e., the minimum and the maximum)."
        echo "  -basic_mode:                    Enables basic mode."
        echo "  -no_mem:                        The code for reading the perf. ctrs. does not make memory accesses."
        echo "  -no_normalization:              The measurement results are not divided by the number of repetitions."
--- a/kernel/nb_km.c
+++ b/kernel/nb_km.c
@@ -348,6 +348,15 @@ static ssize_t no_normalization_store(struct kobject *kobj, struct kobj_attribut
 }
 static struct kobj_attribute no_normalization_attribute =__ATTR(no_normalization, 0660, no_normalization_show, no_normalization_store);

+static ssize_t output_range_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { // read handler for the output_range attribute
+    return sprintf(buf, "%d\n", output_range); // writes the current flag as a decimal line into buf and returns sprintf's character count
+}
+static ssize_t output_range_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { // write handler for the output_range attribute
+    sscanf(buf, "%d", &output_range); // NOTE(review): the sscanf result is ignored, so input that does not parse as an integer presumably leaves output_range unchanged without reporting an error
+    return count; // always reports the whole input as consumed
+}
+static struct kobj_attribute output_range_attribute =__ATTR(output_range, 0660, output_range_show, output_range_store); // attribute named output_range with mode 0660, served by output_range_show and output_range_store
+
 static ssize_t agg_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    return sprintf(buf, "%d\n", aggregate_function);
 }
@@ -495,6 +504,7 @@ static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, cha
    basic_mode = BASIC_MODE_DEFAULT;
    use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT;
    aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
+    output_range = OUTPUT_RANGE_DEFAULT;
    verbose = VERBOSE_DEFAULT;
    alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
    drain_frontend = DRAIN_FRONTEND_DEFAULT;
@@ -798,14 +808,14 @@ static uint64_t get_max_programmable_ctr_value(void) {

 static uint64_t get_end_to_end_cycles(void) {
    run_experiment_with_freeze_on_PMI(measurement_results, 0, 0, 0);
-    uint64_t cycles = get_aggregate_value(measurement_results[FIXED_CTR_CORE_CYCLES], n_measurements, 1);
+    uint64_t cycles = get_aggregate_value(measurement_results[FIXED_CTR_CORE_CYCLES], n_measurements, 1, aggregate_function); // unscaled (scale 1) aggregate of the FIXED_CTR_CORE_CYCLES readings, using the globally selected aggregate function
    print_verbose("End-to-end cycles: %llu\n", cycles);
    return cycles;
 }

 static uint64_t get_end_to_end_retired(void) {
    run_experiment_with_freeze_on_PMI(measurement_results, 0, 0, 0);
-    uint64_t retired = get_aggregate_value(measurement_results[FIXED_CTR_INST_RETIRED], n_measurements, 1);
+    uint64_t retired = get_aggregate_value(measurement_results[FIXED_CTR_INST_RETIRED], n_measurements, 1, aggregate_function); // unscaled (scale 1) aggregate of the FIXED_CTR_INST_RETIRED readings, using the globally selected aggregate function
    print_verbose("End-to-end retired instructions: %llu\n", retired);
    return retired;
 }
@@ -820,7 +830,7 @@ static uint64_t get_cycle_last_retired(bool include_lfence) {
    uint64_t last_applicable_instr = get_end_to_end_retired() - 258 + include_lfence;

    run_experiment_with_freeze_on_PMI(measurement_results, 0, 3 + 2, get_max_programmable_ctr_value() - last_applicable_instr);
-    uint64_t time_to_last_retired = get_aggregate_value(measurement_results[1], n_measurements, 1);
+    uint64_t time_to_last_retired = get_aggregate_value(measurement_results[1], n_measurements, 1, aggregate_function);

    // The counters freeze a few cycles after an overflow happens; additionally the programmable and fixed counters do not freeze (or do not start) at exactly
    // the same time. In the following, we search for the value that we have to write to the fixed counter such that the programmable counters stop immediately
@@ -828,7 +838,7 @@ static uint64_t get_cycle_last_retired(bool include_lfence) {
    uint64_t cycle_last_retired = 0;
    for (int64_t cycle=time_to_last_retired; cycle>=0; cycle--) {
        run_experiment_with_freeze_on_PMI(measurement_results, 3, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
-        if (get_aggregate_value(measurement_results[2], n_measurements, 1) < last_applicable_instr) {
+        if (get_aggregate_value(measurement_results[2], n_measurements, 1, aggregate_function) < last_applicable_instr) {
            cycle_last_retired = cycle+1;
            break;
        }
@@ -847,7 +857,7 @@ static uint64_t get_cycle_first_added_to_IDQ(uint64_t cycle_last_retired_empty)
    uint64_t prev_uops = 0;
    for (int64_t cycle=cycle_last_retired_empty-3; cycle>=0; cycle--) {
        run_experiment_with_freeze_on_PMI(measurement_results, 3, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
-        uint64_t uops = get_aggregate_value(measurement_results[2], n_measurements, 1);
+        uint64_t uops = get_aggregate_value(measurement_results[2], n_measurements, 1, aggregate_function);

        if ((prev_uops != 0) && (prev_uops - uops > 1)) {
            cycle_first_added_to_IDQ = cycle + 1;
@@ -861,7 +871,7 @@ static uint64_t get_cycle_first_added_to_IDQ(uint64_t cycle_last_retired_empty)

 // Programs the fixed cycle counter such that it overflows in the specified cycle, runs the benchmark,
 // and stores the measurements of the programmable counters in results.
-static void perform_measurements_for_cycle(uint64_t cycle, uint64_t* results) {
+static void perform_measurements_for_cycle(uint64_t cycle, uint64_t* results, uint64_t* results_min, uint64_t* results_max) {
    // on several microarchitectures, the counters 0 or 1 do not freeze at the same time as the other counters
    int avoid_counters = 0;
    if (displ_model == 0x97) { // Alder Lake
@@ -883,7 +893,9 @@ static void perform_measurements_for_cycle(uint64_t cycle, uint64_t* results) {

        for (size_t c=0; c<n_used_counters; c++) {
            if (pfc_descriptions[c]) {
-                results[cur_pfc_config] = get_aggregate_value(measurement_results[c], n_measurements, 1);
+                results[cur_pfc_config] = get_aggregate_value(measurement_results[c], n_measurements, 1, aggregate_function);
+                if (results_min) results_min[cur_pfc_config] = get_aggregate_value(measurement_results[c], n_measurements, 1, MIN);
+                if (results_max) results_max[cur_pfc_config] = get_aggregate_value(measurement_results[c], n_measurements, 1, MAX);
                cur_pfc_config++;
            }
        }
@@ -921,12 +933,12 @@ static int run_nanoBench_cycle_by_cycle(struct seq_file *m, void *v) {

    uint64_t cycle_last_retired_empty = get_cycle_last_retired(false);
    uint64_t* results_empty = vmalloc(sizeof(uint64_t[n_pfc_configs]));
-    perform_measurements_for_cycle(cycle_last_retired_empty, results_empty);
+    perform_measurements_for_cycle(cycle_last_retired_empty, results_empty, NULL, NULL);


    uint64_t cycle_last_retired_empty_with_lfence = get_cycle_last_retired(true);
    uint64_t* results_empty_with_lfence = vmalloc(sizeof(uint64_t[n_pfc_configs]));
-    perform_measurements_for_cycle(cycle_last_retired_empty_with_lfence, results_empty_with_lfence);
+    perform_measurements_for_cycle(cycle_last_retired_empty_with_lfence, results_empty_with_lfence, NULL, NULL);

    uint64_t first_cycle = 0;
    uint64_t last_cycle = 0;
@@ -946,9 +958,11 @@ static int run_nanoBench_cycle_by_cycle(struct seq_file *m, void *v) {
    }

    uint64_t (*results)[n_pfc_configs] = vmalloc(sizeof(uint64_t[last_cycle+1][n_pfc_configs]));
+    uint64_t (*results_min)[n_pfc_configs] = output_range?vmalloc(sizeof(uint64_t[last_cycle+1][n_pfc_configs])):NULL;
+    uint64_t (*results_max)[n_pfc_configs] = output_range?vmalloc(sizeof(uint64_t[last_cycle+1][n_pfc_configs])):NULL;

    for (uint64_t cycle=first_cycle; cycle<=last_cycle; cycle++) {
-        perform_measurements_for_cycle(cycle, results[cycle]);
+        perform_measurements_for_cycle(cycle, results[cycle], output_range?results_min[cycle]:NULL, output_range?results_max[cycle]:NULL);
    }

    disable_perf_ctrs_globally();
@@ -965,6 +979,7 @@ static int run_nanoBench_cycle_by_cycle(struct seq_file *m, void *v) {
        seq_printf(m, ",%lld", results_empty_with_lfence[i]);
        for (long cycle=first_cycle; cycle<=last_cycle; cycle++) {
            seq_printf(m, ",%lld", results[cycle][i]);
+            if (output_range) seq_printf(m, ",%lld,%lld", results_min[cycle][i], results_max[cycle][i]);
        }
        seq_printf(m, "\n");
    }
@@ -1094,6 +1109,7 @@ static int __init nb_init(void) {
    error |= sysfs_create_file(nb_kobject, &end_to_end_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &drain_frontend_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
+    error |= sysfs_create_file(nb_kobject, &output_range_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &no_normalization_attribute.attr);
--- a/kernelNanoBench.py
+++ b/kernelNanoBench.py
@@ -95,7 +95,7 @@ paramDict = dict()
 # Otherwise, reset() needs to be called first.
 def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, fixedCounters=None, nMeasurements=None, unrollCount=None,
                           loopCount=None, warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None,
-                           aggregateFunction=None, basicMode=None, noMem=None, noNormalization=None, verbose=None, endToEnd=None):
+                           aggregateFunction=None, range=None, basicMode=None, noMem=None, noNormalization=None, verbose=None, endToEnd=None):
   if config is not None:
      if paramDict.get('config', None) != config:
         configFile = '/tmp/ramdisk/config'
@@ -162,6 +162,11 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
         writeFile('/sys/nb/agg', aggregateFunction)
         paramDict['aggregateFunction'] = aggregateFunction

+   if range is not None:
+      if paramDict.get('range', None) != range:
+         writeFile('/sys/nb/output_range', str(int(range)))
+         paramDict['range'] = range
+
   if basicMode is not None:
      if paramDict.get('basicMode', None) != basicMode:
         writeFile('/sys/nb/basic_mode', str(int(basicMode)))
@@ -254,7 +259,10 @@ def runNanoBench(code='', codeObjFile=None, codeBinFile=None,
      if not ':' in line: continue
      lineSplit = line.split(':')
      counter = lineSplit[0].strip()
-      value = float(lineSplit[1].strip())
+      if paramDict.get('range'):
+         value = tuple(map(float, re.match(r' (.*) \[(.*);(.*)\]', lineSplit[1]).groups()))
+      else:
+         value = float(lineSplit[1].strip())
      ret[counter] = value
   return ret

@@ -286,14 +294,21 @@ def runNanoBenchCycleByCycle(code='', codeObjFile=None, codeBinFile=None,
      counter = lineSplit[0].strip()
      valueEmpty = int(lineSplit[1])
      valueEmptyWithLfence = int(lineSplit[2])
-      values = [int(v) for v in lineSplit[3:]]
-      nbDict[counter] = (valueEmpty, valueEmptyWithLfence, values)
+      minValues = []
+      maxValues = []
+      if paramDict.get('range'):
+         values = list(map(int, lineSplit[3::3]))
+         minValues = list(map(int, lineSplit[4::3]))
+         maxValues = list(map(int, lineSplit[5::3]))
+      else:
+         values = list(map(int, lineSplit[3:]))
+      nbDict[counter] = (valueEmpty, valueEmptyWithLfence, values, minValues, maxValues)

   if paramDict.get('verbose'):
      print('\n'.join((k + ': ' + str(v)) for k, v in nbDict.items()))

   if paramDict.get('endToEnd'):
-      return OrderedDict((k, v) for k, (_, _, v) in nbDict.items() if "_internal" not in k)
+      return OrderedDict((k, (v, vMin, vMax)) for k, (_, _, v, vMin, vMax) in nbDict.items() if "_internal" not in k)
   else:
      instRetired = nbDict['INST_RETIRED_internal'][2]
      if len(instRetired) < 3:
@@ -308,7 +323,7 @@ def runNanoBenchCycleByCycle(code='', codeObjFile=None, codeBinFile=None,
         return None

      result = OrderedDict()
-      for k, (valueEmpty, valueEmptyWithLfence, values) in nbDict.items():
+      for k, (valueEmpty, valueEmptyWithLfence, values, minValues, maxValues) in nbDict.items():
         if "_internal" in k: continue

         leftMin = values[0]
@@ -325,7 +340,7 @@ def runNanoBenchCycleByCycle(code='', codeObjFile=None, codeBinFile=None,
         elif 'IDQ' in k:
            rightMax = values[cycleOfLfenceUop - 1]

-         result[k] = [max(0, min(v, rightMax) - leftMin) for v in values[:cycleLastInstrRetired + 1]]
+         result[k] = tuple([max(0, min(v, rightMax) - leftMin) for v in vx[:cycleLastInstrRetired + 1]] for vx in [values, minValues, maxValues])

      return result

--- a/nanoBench.sh
+++ b/nanoBench.sh
@@ -43,7 +43,7 @@ while [ "$1" ]; do
        debug="gdb -ex=run --args"
        args="$args $1"
        shift
-    elif [[ "$1" == -r* ]]; then
+    elif [[ "$1" == -re* ]]; then
        filter_output="grep -v 0.00"
        shift
    else
--- a/user/nanoBench_main.c
+++ b/user/nanoBench_main.c
@@ -88,6 +88,7 @@ int main(int argc, char **argv) {
        {"median", no_argument, &aggregate_function, MED},
        {"min", no_argument, &aggregate_function, MIN},
        {"max", no_argument, &aggregate_function, MAX},
+        {"range", no_argument, &output_range, true},
        {"basic_mode", no_argument, &basic_mode, true},
        {"no_mem", no_argument, &no_mem, true},
        {"no_normalization", no_argument, &no_normalization, true},