android13/packages/modules/NeuralNetworks/tools/systrace_parser/parser/aggregate.py

""" NNAPI systrace parser - aggregation of timing from multiple threads """

# TODO:
# - phase and layer totals
import math

from parser.naming import layers, phases, subphases
from parser.naming import (PHASE_OVERALL, PHASE_TERMINATION, PHASE_WARMUP,
                           PHASE_BENCHMARK, PHASE_EXECUTION, PHASE_INITIALIZATION,
                           PHASE_INPUTS_AND_OUTPUTS, PHASE_RESULTS)
from parser.naming import LAYER_APPLICATION, LAYER_IPC, LAYER_DRIVER
# Key of the extra per-phase entry that aggregate_times stores next to the
# real layers from parser.naming.
LAYER_TOTAL = "LT"  # Total across layers

def aggregate_times(tracker_map, special_case_lr_pe=True):
  """ Takes the trackers for each thread and produces timing statistics for
      all layers and phases.

      Args:
        tracker_map: dictionary whose values are the per-thread trackers. Each
          tracker is queried through get_execution_count, get_stat,
          get_ld_pe_begins and get_ld_pe_ends.
        special_case_lr_pe: passed through unchanged to tracker.get_stat.

      Returns (times, self_times, has_warmup and has_benchmark, execution_counts),
      where:
        - times and self_times are nested dictionaries of the form
          phase -> layer -> time with the following notes:
          - phase is flattened over all phases, except PHASE_WARMUP and
            PHASE_BENCHMARK, where the structure is phase -> phase -> layer -> time
          - PHASE_WARMUP and PHASE_BENCHMARK only nest execution and its
            subphases
          - PHASE_WARMUP and PHASE_BENCHMARK are only present if the trace
            contains both of them
          - the first level phase contains total over PHASE_WARMUP and
            PHASE_BENCHMARK if present
          - time may be math.nan if the data is not present in the trace
          - in addition to the layers from parser.naming, LAYER_TOTAL holds
            the largest time of any layer in that phase (NaN counted as zero)
          - the self time of a layer is its time minus the time of the
            closest later layer (in the order of parser.naming.layers) that
            has a non-zero, non-NaN time
        - execution_counts contains a dictionary of the form
          {PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK} -> no of executions

      Raises:
        AssertionError: if the trackers report a different number of driver
          execution begins and ends.
  """
  all_application_phases = [PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK]
  # Calculate execution counts: the largest count reported by any tracker
  execution_counts = dict()
  for app_phase in all_application_phases:
    execution_count = 0
    for pid in tracker_map:
      execution_count = max(execution_count, tracker_map[pid].get_execution_count(app_phase))
    execution_counts[app_phase] = execution_count
  has_warmup = bool(execution_counts[PHASE_WARMUP])
  has_benchmark = bool(execution_counts[PHASE_BENCHMARK])
  # The per-application-phase breakdown is only produced when the trace has
  # both warmup and benchmark executions. Every place below that touches
  # times[PHASE_WARMUP] / times[PHASE_BENCHMARK] must use this same condition,
  # since those dictionaries do not exist otherwise.
  has_warmup_and_benchmark = has_warmup and has_benchmark
  if not has_warmup_and_benchmark:
    all_application_phases = [PHASE_OVERALL]

  # Create dicts
  times = {}
  self_times = {}
  if has_warmup_and_benchmark:
    for app_phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
      times[app_phase] = {}
      self_times[app_phase] = {}
      for phase in _phase_and_subphases(PHASE_EXECUTION):
        times[app_phase][phase] = {}
        self_times[app_phase][phase] = {}
  for phase in phases + [PHASE_OVERALL] + subphases[PHASE_EXECUTION]:
    times[phase] = {}
    self_times[phase] = {}

  # Gather total times from all threads, calculate layer and phase totals
  for layer in layers:
    for phase0 in [PHASE_OVERALL] + phases:
      for phase in _phase_and_subphases(phase0):
        t = 0.0
        tag = layer + "_" + phase
        for app_phase in all_application_phases:
          t0 = 0.0
          if layer == LAYER_DRIVER and phase == PHASE_EXECUTION:
            # Calculate driver execution times from begins and ends
            begins = []
            ends = []
            for pid in tracker_map:
              begins.extend(tracker_map[pid].get_ld_pe_begins(app_phase))
              ends.extend(tracker_map[pid].get_ld_pe_ends(app_phase))
            assert len(begins) == len(ends)
            # Pair the i-th earliest begin with the i-th earliest end
            begins.sort()
            ends.sort()
            for begin, end in zip(begins, ends):
              t0 += (end - begin)
          else:
            for pid in tracker_map:
              t0 += tracker_map[pid].get_stat(tag, app_phase, special_case_lr_pe)
          if phase0 == PHASE_EXECUTION and (app_phase != PHASE_OVERALL):
            times[app_phase][phase][layer] = zero_to_nan_if_missing(t0, phase, layer)
          t += t0
        times[phase][layer] = zero_to_nan_if_missing(t, phase, layer)
    # Fall back to the sum over the phases when the overall time is zero.
    # NaN is truthy, so an overall entry marked as missing is left as NaN.
    if not times[PHASE_OVERALL][layer]:
      times[PHASE_OVERALL][layer] = sum(nan_to_zero(times[phase][layer]) for phase in phases)
  for phase0 in [PHASE_OVERALL] + phases:
    for phase in _phase_and_subphases(phase0):
      times[phase][LAYER_TOTAL] = max_ignoring_nans(times[phase].values())
      if phase0 == PHASE_EXECUTION and has_warmup_and_benchmark:
        for app_phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
          times[app_phase][phase][LAYER_TOTAL] = max_ignoring_nans(times[app_phase][phase].values())

  # Calculate self-times for each layer
  for phase0 in [PHASE_OVERALL] + phases:
    for phase in _phase_and_subphases(phase0):
      self_times[phase][LAYER_TOTAL] = times[phase][LAYER_TOTAL]
      if phase0 == PHASE_EXECUTION and has_warmup_and_benchmark:
        for app_phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
          self_times[app_phase][phase][LAYER_TOTAL] = times[app_phase][phase][LAYER_TOTAL]
      # t is the time of the most recently visited layer that had data; it is
      # subtracted from the next layer with data to get that layer's self time
      t = 0.0
      for layer in reversed(layers):
        if math.isnan(times[phase][layer]):
          self_times[phase][layer] = math.nan
        elif times[phase][layer] == 0.0:
          self_times[phase][layer] = 0.0
        elif (phase == PHASE_OVERALL and
              (layer == LAYER_DRIVER or layer == LAYER_IPC) and
              times[PHASE_EXECUTION][LAYER_DRIVER] == 0.0):
          # Driver was only used for initialization phase, did not support
          # execution of the model
          if layer == LAYER_DRIVER:
            self_times[phase][layer] = times[phase][layer]
          else:
            self_times[phase][layer] = times[phase][layer] - times[phase][LAYER_DRIVER]
        else:
          self_times[phase][layer] = times[phase][layer] - t
          t = times[phase][layer]
      # Must match the condition under which the per-application-phase
      # dictionaries were created; with only one of warmup/benchmark present
      # they do not exist and indexing them would raise KeyError.
      if phase0 == PHASE_EXECUTION and has_warmup_and_benchmark:
        for app_phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
          t = 0.0
          for layer in reversed(layers):
            if math.isnan(times[app_phase][phase][layer]):
              self_times[app_phase][phase][layer] = math.nan
            elif times[app_phase][phase][layer] == 0.0:
              self_times[app_phase][phase][layer] = 0.0
            else:
              self_times[app_phase][phase][layer] = times[app_phase][phase][layer] - t
              t = times[app_phase][phase][layer]

  return (times, self_times, has_warmup_and_benchmark, execution_counts)

def zero_to_nan_if_missing(f, phase, layer):
  """ Return f, or NaN when f is zero and a zero most likely means that the
      trace has no data rather than a real measurement.

      Zero is only treated as missing for the Application layer (applications
      may not have any tracing) and for the subphases of Execution in the
      Driver layer (other phases are discernible from the automatic HIDL
      tracepoints). Everything else is passed through unchanged."""
  if f != 0.0:
    return f
  data_may_be_missing = (
      layer == LAYER_APPLICATION or
      (layer == LAYER_DRIVER and phase in subphases[PHASE_EXECUTION]))
  return math.nan if data_may_be_missing else f

def nan_to_zero(f):
  """ Return 0.0 in place of a NaN; any other value is returned as is. """
  return 0.0 if math.isnan(f) else f

def _phase_and_subphases(phase):
  """ Return the list of phases to iterate over for the given phase.

      PHASE_OVERALL yields only itself, PHASE_WARMUP and PHASE_BENCHMARK
      yield an empty list, and any other phase yields itself followed by
      its entries in the subphases table (none if it has no entry)."""
  if phase == PHASE_OVERALL:
    return [phase]
  is_application_phase = phase == PHASE_WARMUP or phase == PHASE_BENCHMARK
  if is_application_phase:
    return []
  nested = subphases.get(phase, [])
  return [phase] + nested

def max_ignoring_nans(xs):
  """ Return the largest value in xs, with every NaN counted as zero. """
  return max(nan_to_zero(x) for x in xs)