259 lines
11 KiB
Python
259 lines
11 KiB
Python
""" NNAPI Systrace parser - tracking of call tree based on trace lines
|
|
|
|
See contract-between-code-and-parser.txt for the
|
|
specification (cases in the specification are referred to with SPEC).
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
from parser.naming import (subphases, translate_hidl_mark_to_nn_and_tag,
|
|
get_function_name_from_mark, make_tag)
|
|
from parser.naming import LAYER_CPU, LAYER_DRIVER, LAYER_RUNTIME, LAYER_APPLICATION
|
|
from parser.naming import MARKER_SWITCH, MARKER_SUBTRACT
|
|
from parser.naming import PHASE_EXECUTION, PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK
|
|
from parser.tree import SingleThreadCallTree
|
|
|
|
class AppPhase(object):
|
|
""" Class to track the overall phase of the program. Used to split up warmup and benchmark.
|
|
|
|
Needs to be separate from the call trees to propagate the difference to driver.
|
|
"""
|
|
def __init__(self):
|
|
self.reset()
|
|
|
|
def current(self):
|
|
if self.stack:
|
|
return self.stack[-1]
|
|
else:
|
|
return PHASE_OVERALL
|
|
|
|
def push(self, phase):
|
|
self.stack.append(phase)
|
|
|
|
def pop(self):
|
|
self.stack.pop()
|
|
|
|
def reset(self):
|
|
self.stack = []
|
|
|
|
class Tracker(object):
|
|
""" Class to track the stack trace of a single thread and feed it into a SingleThreadCallTree
|
|
as well as keeping track of entry and exit times for functions.
|
|
|
|
Exposes statistics for a single thread, transforming the call tree as needed.
|
|
All statistics are in milliseconds.
|
|
|
|
Layer Runtime, Phase Execution (LR_PE) is special-cased, see comment in get_stat().
|
|
|
|
Subphases of Execution are aggregated towards the overall Execution phase as needed.
|
|
"""
|
|
def __init__(self, tgid, is_driver, app_phase):
|
|
self.tgid = tgid
|
|
self.is_driver = is_driver
|
|
self.app_phase = app_phase
|
|
|
|
# Match the trace string
|
|
# "[NN_LA_PP]funcE1" in "B|<thread1>|[NN_LA_PP]funcE1"
|
|
# "[NN_LC_PCO]funcC1" in "B|<thread1>|[SW][NN_LC_PCO]funcC1"
|
|
self.matcher = re.compile(r"B\|\d+\|.*\[([^]]+)\]\[?([^]])\]?")
|
|
|
|
self.reset()
|
|
|
|
def reset(self):
|
|
self.stats = {}
|
|
self.items = {}
|
|
self.mytree = SingleThreadCallTree()
|
|
self.begins_and_ends_ms = {}
|
|
self.la_pe_counts = {}
|
|
self.debugstring = "\n"
|
|
|
|
def handle_mark(self, time, mark):
|
|
""" Handle a single trace item (scoped entry and exit).
|
|
Translates:
|
|
- Automatically generated HIDL traces into NNTRACE layers and phases
|
|
- SPEC:Switch phase during function into dummy items
|
|
- SPEC:Subtracting time when nesting is violated into "subtract"
|
|
markers
|
|
- CPU/Driver layer distinction based on whether the process is the
|
|
driver or an application
|
|
This function is called multiple times for a single application run,
|
|
afterwards the statistics can be calculated.
|
|
"""
|
|
if mark[0] == "B":
|
|
switch = False
|
|
subtract = False
|
|
# Workarounds for wrong tracepoints in early versions
|
|
# TODO(mikie): remove later
|
|
if ("ANeuralNetworksEvent_free" in mark) or ("ANeuralNetworksExecution_free" in mark):
|
|
mark = mark.replace("_PT", "_PE")
|
|
# Workarounds for trace marker for getSupportedExtensions (fixed in ag/9484333)
|
|
if ("getSupportedExtensions" in mark):
|
|
mark = mark.replace("_PC", "_PI")
|
|
elif ("[SW][NN_LA_PR]executeWithCompilation" in mark):
|
|
mark = mark.replace("[SW]", "")
|
|
if MARKER_SWITCH in mark:
|
|
switch = True
|
|
if MARKER_SUBTRACT in mark:
|
|
subtract = True
|
|
if switch:
|
|
# End previous item
|
|
self.handle_mark(time, "E")
|
|
# Push a placeholder item that will get popped by the 'real' end of the
|
|
# previous item.
|
|
self.mytree.push_dummy(time)
|
|
m = self.matcher.search(mark)
|
|
if m is None:
|
|
tag = translate_hidl_mark_to_nn_and_tag(mark)
|
|
if tag is None:
|
|
raise Exception("Couldn't parse mark " + mark)
|
|
else:
|
|
tag = m.group(1)
|
|
[_, layer, phase] = tag.split("_")
|
|
if layer == LAYER_APPLICATION and phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
|
|
self.app_phase.push(phase)
|
|
if not self.is_driver:
|
|
layer = layer.replace(LAYER_DRIVER, LAYER_CPU)
|
|
else:
|
|
layer = layer.replace(LAYER_CPU, LAYER_DRIVER)
|
|
if layer == LAYER_APPLICATION and phase == PHASE_EXECUTION:
|
|
self.la_pe_counts[self.app_phase.current()] = (
|
|
self.la_pe_counts.get(self.app_phase.current(), 0) + 1)
|
|
self.mytree.push(time, mark, layer, phase, self.app_phase.current(), subtract)
|
|
elif mark[0] == "E":
|
|
try:
|
|
node = self.mytree.pop(time)
|
|
if node.is_dummy(): # Placeholder item
|
|
pass
|
|
else:
|
|
if node.layer == LAYER_APPLICATION and node.phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
|
|
self.app_phase.pop()
|
|
function = node.app_phase + "::" + get_function_name_from_mark(node.mark)
|
|
self.begins_and_ends_ms[function] = (self.begins_and_ends_ms.get(function, []) +
|
|
[[float(node.start_time_s) * 1000.0,
|
|
float(node.end_time_s) * 1000.0]])
|
|
except IndexError as e:
|
|
raise Exception("Unable to process a trace termination mark, please check that the collected trace are including full application lifecycles.\n") from e
|
|
|
|
def is_complete(self):
|
|
""" Checks if we've seen all end tracepoints for the begin tracepoints.
|
|
"""
|
|
return self.mytree.current.is_root()
|
|
|
|
def calculate_stats(self):
|
|
assert self.is_complete()
|
|
self.mytree.remove_ignored()
|
|
self.mytree.remove_dummies()
|
|
self.mytree.copy_subtracted_init_and_wrong_la()
|
|
self.mytree.add_missing_la_nodes()
|
|
# self.mytree.print()
|
|
self.mytree.validate_nesting()
|
|
|
|
def recurse(node, prev_layer, prev_phase, indent, in_pe_layers):
|
|
[begun, mark, layer, phase] = [
|
|
node.start_time_s, node.mark, node.layer, node.phase()]
|
|
time = node.end_time_s
|
|
tag = None
|
|
elapsed0 = "DETAIL"
|
|
elapsed1 = node.elapsed_less_subtracted_ms()
|
|
if elapsed1 is None:
|
|
raise Exception("Elapsed for {} returned None".format(node.to_str()))
|
|
|
|
if not node.is_added_detail() and not node.subtract:
|
|
tag = node.app_phase + "_" + layer + "_" + phase
|
|
elapsed0 = elapsed1
|
|
self.stats[tag] = self.stats.get(tag, 0.0) + elapsed0
|
|
self.items[tag] = self.items.get(tag, []) + [
|
|
mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + tag]
|
|
if phase in subphases[PHASE_EXECUTION]:
|
|
if not in_pe_layers.get(layer):
|
|
pe_tag = node.app_phase + "_" + make_tag(layer, PHASE_EXECUTION)
|
|
self.stats[pe_tag] = self.stats.get(pe_tag, 0.0) + elapsed0
|
|
self.items[pe_tag] = self.items.get(pe_tag, []) + [
|
|
mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + pe_tag]
|
|
if phase == PHASE_EXECUTION:
|
|
in_pe_layers[layer] = in_pe_layers.get(layer, 0) + 1
|
|
for c in node.children:
|
|
recurse(c, layer or prev_layer, phase or prev_phase,
|
|
indent + " ", in_pe_layers)
|
|
if phase == PHASE_EXECUTION:
|
|
in_pe_layers[layer] = in_pe_layers[layer] - 1
|
|
return
|
|
|
|
for top in self.mytree.root.children:
|
|
recurse(top, None, None, "", {})
|
|
self.debugstring = self.mytree.to_str()
|
|
|
|
# We need to special case the driver execution time because:
|
|
# - The existing drivers don't have tracing, so we rely on HIDL traces
|
|
# - Best we can do is to take the start of the HIDL server side call as
|
|
# the starting point (which includes a bit of overhead, but not much) and
|
|
# the start of the callback as the end point (which should be pretty
|
|
# accurate)
|
|
# Note that the begin and end may be on different threads, hence the
|
|
# calculation needs to happen in aggregation rather than here.
|
|
def get_ld_pe_begins(self, app_phase):
|
|
return self.get_begins(app_phase, "HIDL::IPreparedModel::execute::server")
|
|
|
|
def get_ld_pe_ends(self, app_phase):
|
|
return self.get_begins(app_phase, "HIDL::IExecutionCallback::notify::client")
|
|
|
|
def get_stat(self, tag, app_phase, special_case_pe=True):
|
|
if not self.stats and not self.mytree.is_empty():
|
|
self.calculate_stats()
|
|
if tag == make_tag(LAYER_RUNTIME, PHASE_EXECUTION) and special_case_pe:
|
|
# Execution is exposed as an asynchronous event from the runtime, we
|
|
# calculate the runtime time as starting from when the async operation is
|
|
# kicked off until wait finishes + synchronous setup and teardown calls.
|
|
# This has two limitations:
|
|
# - multithreaded usage will not work correctly
|
|
# - should the application spend so much time before calling wait that
|
|
# execution has already finished, the time would get allocated to the
|
|
# runtime incorrectly
|
|
async_starts = self.get_begins(app_phase, "ANeuralNetworksExecution_startCompute")
|
|
async_ends = self.get_ends(app_phase, "ANeuralNetworksEvent_wait")
|
|
elapsed = 0.0
|
|
for i in range(0, len(async_starts)):
|
|
elapsed = elapsed + (async_ends[i] - async_starts[i])
|
|
for sync in ["ANeuralNetworksExecution_create", "ANeuralNetworksExecution_free",
|
|
"ANeuralNetworksEvent_create", "ANeuralNetworksEvent_free",
|
|
"ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution_setOutput",
|
|
"ANeuralNetworksExecution_setInputFromMemory",
|
|
"ANeuralNetworksExecution_setOutputFromMemory"]:
|
|
sync_starts = self.get_begins(app_phase, sync)
|
|
sync_ends = self.get_ends(app_phase, sync)
|
|
for i in range(0, len(sync_starts)):
|
|
elapsed = elapsed + (sync_ends[i] - sync_starts[i])
|
|
return elapsed
|
|
return self.stats.get(app_phase + "_" + tag, 0.0)
|
|
|
|
def get_execution_count(self, app_phase):
|
|
# ANeuralNetworksExecution_create is reliable and comes from the runtime,
|
|
# but not available pre-P
|
|
count = len(self.get_begins(app_phase, "ANeuralNetworksExecution_create"))
|
|
if count > 0:
|
|
return count
|
|
# Application may have added tracepoints
|
|
return self.la_pe_counts.get(app_phase, 0)
|
|
|
|
def get_begins(self, app_phase, function):
|
|
name = app_phase + "::" + function
|
|
return [begin_and_end[0] for begin_and_end in self.begins_and_ends_ms.get(name, [])]
|
|
def get_ends(self, app_phase, function):
|
|
name = app_phase + "::" + function
|
|
return [begin_and_end[1] for begin_and_end in self.begins_and_ends_ms.get(name, [])]
|
|
|
|
def print_stats(self):
|
|
if not self.stats:
|
|
self.calculate_stats()
|
|
print(self.tgid, "Driver" if self.is_driver else "App")
|
|
for tag in self.stats:
|
|
print(tag, self.stats[tag])
|
|
if self.items.get(tag):
|
|
for item in self.items[tag]:
|
|
print(" ", item)
|
|
else:
|
|
print(" ", "calculated only")
|
|
|
|
def print(self):
|
|
self.mytree.print()
|