/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBuilder"

#include "ExecutionBuilder.h"

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <Tracing.h>
#include <android-base/logging.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>

#include <algorithm>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "BurstBuilder.h"
#include "CompilationBuilder.h"
#include "Manager.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "Telemetry.h"
#include "TypeManager.h"

namespace android {
namespace nn {

// Partial validation of output shapes returned from driver, to ensure they
// conform to a very specific set of rules.
static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    // Enforces the following rules (some of which are from b/154054474):
    // - The shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
    //   If the vector is not empty, it must have as many entries as the step model has outputs.
    // - If NONE, then either the shapes vector is empty, or every shape is
    //   marked isSufficient and, if a tensor, has known rank.
    // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty. At least one entry
    //   is marked !isSufficient.
    switch (executionStatus) {
        case ErrorStatus::NONE: {
            NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty or of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape is unexpectedly marked !isSufficient";

            const TypeManager* tm = TypeManager::get();
            for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
                 ++outputIndex) {
                const Operand& outputOperand = model->getOutputOperand(outputIndex);
                NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
                             (shapes[outputIndex].dimensions.size() != 0))
                        << "With execution ErrorStatus " << executionStatus << " output#"
                        << outputIndex << " shape unexpectedly has zero rank";
            }

            break;
        }
        case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
            NN_RET_CHECK(shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return !shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape must have been marked !isSufficient";
            break;
        }
        default: {
            NN_RET_CHECK(shapes.size() == 0)
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty but has length " << shapes.size();
            break;
        }
    }
    return true;
}

static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
                                          model, shapes);
}
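
// Editorial illustration (not part of the runtime): the (status, shapes)
// combinations that the checks above accept, for a step model assumed to have
// two tensor outputs:
//
//   NONE                     + {}                                           -> accepted
//   NONE                     + {{.dimensions={2,3}, .isSufficient=true},
//                               {.dimensions={4},   .isSufficient=true}}    -> accepted
//   NONE                     + any shape with .isSufficient=false or rank 0 -> rejected
//   OUTPUT_INSUFFICIENT_SIZE + {{.dimensions={2,3}, .isSufficient=true},
//                               {.dimensions={4},   .isSufficient=false}}   -> accepted
//   OUTPUT_INSUFFICIENT_SIZE + {}                                           -> rejected
//   any other status         + anything non-empty                           -> rejected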

static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
    return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
}

static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
                               const char* tag, bool allowUnspecified) {
    if (newType != nullptr) {
        const Extension::OperandTypeInformation* info = nullptr;
        if (isExtension(operand.type)) {
            NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
        }
        if (validateOperandType(*newType, info, tag, allowUnspecified) !=
            ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << tag << ": Invalid newType";
            return false;
        }
        if (operand.dimensions.size() == 0) {
            return true;
        }
        if (operand.dimensions.size() != newType->dimensionCount) {
            LOG(ERROR) << tag << ": Setting with incompatible dimension count (existing = "
                       << operand.dimensions.size() << ", new = " << newType->dimensionCount << ")";
            return false;
        }
        for (uint32_t i = 0; i < newType->dimensionCount; i++) {
            if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
                LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
                return false;
            }
        }
    } else {
        if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
            tensorHasUnspecifiedDimensions(operand)) {
            LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
            return false;
        }
    }
    return true;
}
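
// Editorial example of the override rules enforced above (not part of the
// runtime). For an operand declared as a 2-D tensor with dimensions {0, 4},
// i.e. the first dimension unspecified:
//
//   newType->dimensions = {3, 4} -> allowed: only the unspecified dimension is filled in
//   newType->dimensions = {3, 5} -> rejected: overrides the fully specified dimension 4
//   newType->dimensions = {3}    -> rejected: dimension count mismatch (1 vs. 2)
//   newType == nullptr           -> allowed only when allowUnspecified, because the
//                                   operand still has an unspecified dimension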

ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
    : mCompilation(compilation),
      mModel(compilation->mModel),
      mPlan(&compilation->mPlan),
      mAllowCpuFallback(DeviceManager::partitioningAllowsFallback(compilation->mPartitioning)),
      mInputs(mModel->inputCount()),
      mOutputs(mModel->outputCount()) {
    VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
                    << " inputs and " << mOutputs.size() << " outputs";
}

SimpleExecutionBuilder::SimpleExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isSimple());
}

CompoundExecutionBuilder::CompoundExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isCompound());
}

const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
    return mPlan->getSourceModels().getModel(index);
}

int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
                               const void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getInputOperand(index), type, const_cast<void*>(buffer), l,
            mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                         const RuntimeMemory* memory, size_t offset,
                                         size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInputFromMemory", false)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from
    // ANeuralNetworksMemory_createFromDesc, we allow the client to specify offset == 0 &&
    // length == 0 indicating that the entire memory region is used. We update the length here
    // because the drivers are still expecting a real length. For other memories that do not
    // allow this semantic, it is checked in MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR)
                << "ANeuralNetworksExecution_setInputFromMemory called when an input has already "
                   "been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) =
            ModelArgumentInfo::createFromMemory(mModel->getInputOperand(index), type, poolIndex,
                                                offset, length, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}
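
// A minimal client-side sketch of the "whole memory" convention handled above
// (editorial, with error handling omitted; `desc` and `execution` are assumed
// to be set up elsewhere). Passing offset == 0 and length == 0 asks the
// runtime to substitute the full size of a memory object created from a
// descriptor:
//
//   ANeuralNetworksMemory* memory = nullptr;
//   ANeuralNetworksMemory_createFromDesc(desc, &memory);
//   // offset == 0, length == 0: use the entire memory region as input 0.
//   ANeuralNetworksExecution_setInputFromMemory(execution, 0, /*type=*/nullptr,
//                                               memory, /*offset=*/0, /*length=*/0);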

int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
                                void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutput", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getOutputOperand(index), type, buffer, l, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                          const RuntimeMemory* memory, size_t offset,
                                          size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutputFromMemory", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from
    // ANeuralNetworksMemory_createFromDesc, we allow the client to specify offset == 0 &&
    // length == 0 indicating that the entire memory region is used. We update the length here
    // because the drivers are still expecting a real length. For other memories that do not
    // allow this semantic, it is checked in MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
                      "already been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) =
            ModelArgumentInfo::createFromMemory(mModel->getOutputOperand(index), type, poolIndex,
                                                offset, length, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setMeasureTiming(bool measure) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
                   << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
                   << "that was not created by ANeuralNetworksCompilation_createForDevices "
                   << "with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mMeasureTiming = measure;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
                      "execution has finished.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() != Completion::NO_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
                      "that has encountered an error.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    if (!mMeasureTiming) {
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
    Timing timingFenced = timingLaunched;
    if (mFencedExecutionCallback != nullptr) {
        auto result = mFencedExecutionCallback();
        if (!result.has_value()) {
            LOG(ERROR) << "Fenced execution callback failed: " << result.error().message;
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_BAD_STATE;
        }
        std::tie(timingLaunched, timingFenced) = std::move(result).value();
    }
    const OptionalDuration selectedDuration = [durationCode, &timingLaunched,
                                               &timingFenced]() -> OptionalDuration {
        switch (durationCode) {
            case ANEURALNETWORKS_DURATION_ON_HARDWARE:
                return timingLaunched.timeOnDevice;
            case ANEURALNETWORKS_DURATION_IN_DRIVER:
                return timingLaunched.timeInDriver;
            case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
                return timingFenced.timeOnDevice;
            case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
                return timingFenced.timeInDriver;
            default:
                LOG(FATAL) << "unexpected durationCode " << durationCode;
                return std::nullopt;
        }
    }();
    if (selectedDuration.has_value()) {
        constexpr uint64_t kMaxTiming = std::numeric_limits<uint64_t>::max() - 1;
        using CommonType = std::common_type_t<Duration::rep, uint64_t>;
        const auto count = std::min<CommonType>(selectedDuration.value().count(), kMaxTiming);
        *duration = static_cast<uint64_t>(count);
    } else {
        constexpr uint64_t kNoTiming = std::numeric_limits<uint64_t>::max();
        *duration = kNoTiming;
    }

    VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
    return ANEURALNETWORKS_NO_ERROR;
}
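
// A minimal client-side sketch of the timing flow implemented above
// (editorial, with error handling omitted; `execution` is assumed to come
// from a compilation created by ANeuralNetworksCompilation_createForDevices
// with numDevices = 1):
//
//   ANeuralNetworksExecution_setMeasureTiming(execution, true);
//   ANeuralNetworksExecution_compute(execution);
//   uint64_t onHardware = 0, inDriver = 0;
//   ANeuralNetworksExecution_getDuration(
//           execution, ANEURALNETWORKS_DURATION_ON_HARDWARE, &onHardware);
//   ANeuralNetworksExecution_getDuration(
//           execution, ANEURALNETWORKS_DURATION_IN_DRIVER, &inDriver);
//   // A value of UINT64_MAX means the duration is not available.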

int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
                      "created from an ANeuralNetworksCompilation that was not created by "
                      "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > 0) {
        mTimeoutDuration = duration;
    } else {
        mTimeoutDuration.reset();
    }
    return ANEURALNETWORKS_NO_ERROR;
}

std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
    return mTimeoutDuration;
}

TimePoint ExecutionBuilder::getComputeStartTimePoint() const {
    CHECK(computationStarted()) << "getComputeStartTimePoint called before "
                                << "execution has started.";
    return mComputeStartTimePoint;
}

int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > operation_while::kTimeoutNsMaximum) {
        LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
                     << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
        duration = operation_while::kTimeoutNsMaximum;
    }
    mLoopTimeoutDuration = duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::enableInputAndOutputPadding(bool enable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (mHasCalledSetInputOutput) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after an input "
                      "or output is set.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mInputAndOutputPaddingEnabled = enable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setReusable(bool reusable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setReusable called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mReusable = reusable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::addExtensionAttribute(const char* extensionName,
                                            uint16_t attributeCodeWithinExtension, const void* data,
                                            size_t length) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called on an "
                      "ANeuralNetworksExecution created from an ANeuralNetworksCompilation that "
                      "was not created by ANeuralNetworksCompilation_createForDevices with "
                      "numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called after the execution "
                      "has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int32_t attributeToken = 0;
    if (!TypeManager::get()->getExtensionType(extensionName, attributeCodeWithinExtension,
                                              &attributeToken)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (std::find_if(mMetadata.begin(), mMetadata.end(), [attributeToken](const auto& entry) {
            return attributeToken == entry.token;
        }) != mMetadata.end()) {
        LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called more than once for "
                      "the same attribute";
        return ANEURALNETWORKS_BAD_DATA;
    }
    const uint8_t* dataPtr = reinterpret_cast<const uint8_t*>(data);
    mMetadata.push_back({attributeToken, std::vector<uint8_t>(dataPtr, dataPtr + length)});
    return ANEURALNETWORKS_NO_ERROR;
}
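
// A minimal client-side sketch for the path above (editorial; the extension
// name, attribute code, and payload are hypothetical, and error handling is
// omitted):
//
//   const char* kExtensionName = "com.example.my_extension";  // hypothetical
//   const uint16_t kAttributeCode = 0;                        // hypothetical
//   const int32_t payload = 42;
//   ANeuralNetworksExecution_addExtensionAttribute(
//           execution, kExtensionName, kAttributeCode, &payload, sizeof(payload));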

int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }

    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
                   << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    const auto& dims = mOutputs[index].dimensions();
    if (dims.empty()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions cannot query "
                      "dimensions of a scalar";
        return ANEURALNETWORKS_BAD_DATA;
    }
    std::copy(dims.begin(), dims.end(), dimensions);
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}
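
// A minimal client-side sketch of recovering from insufficiently sized
// outputs using the two queries above (editorial; error handling abbreviated):
//
//   if (ANeuralNetworksExecution_compute(execution) ==
//       ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
//       uint32_t rank = 0;
//       ANeuralNetworksExecution_getOutputOperandRank(execution, 0, &rank);
//       std::vector<uint32_t> dims(rank);
//       ANeuralNetworksExecution_getOutputOperandDimensions(execution, 0, dims.data());
//       // Reallocate the output buffer to match `dims`, then retry with a
//       // fresh (or reusable) execution.
//   }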

bool ExecutionBuilder::checkAndSetComputationState(const char* name) {
    std::lock_guard<std::mutex> lock(mStateMutex);
    if (!mReusable && mState == State::COMPLETED) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on a non-reusable execution that has already completed";
        return false;
    }
    if (mState == State::COMPUTATION) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on an execution that has already started";
        return false;
    }
    mState = State::COMPUTATION;
    return true;
}

// TODO(b/132321855): validate that we have full types for all inputs and outputs,
// and that the graph is not cyclic.
static int validateRequest(const std::vector<ModelArgumentInfo>& inputs,
                           const std::vector<ModelArgumentInfo>& outputs) {
    for (auto& p : inputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (auto& p : outputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getValidationResultCode() {
    if (!mValidationResultCode.has_value()) {
        mValidationResultCode = validateRequest(mInputs, mOutputs);
    }
    return mValidationResultCode.value();
}

bool ExecutionBuilder::areOutputsFullySpecified() {
    if (!mOutputsFullySpecified.has_value()) {
        mOutputsFullySpecified = true;
        for (uint32_t i = 0; i < mOutputs.size(); i++) {
            if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
                TypeManager::get()->isTensorType(mModel->getOutputOperand(i).type) &&
                tensorHasUnspecifiedDimensions(mModel->getOutputOperand(i).type,
                                               mOutputs[i].initialDimensions())) {
                mOutputsFullySpecified = false;
                break;
            }
        }
    }
    return mOutputsFullySpecified.value();
}

int ExecutionBuilder::prepareForCompute(const char* name, ExecutionMode mode) {
    if (!checkAndSetComputationState(name)) {
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (int n = getValidationResultCode(); n != ANEURALNETWORKS_NO_ERROR) {
        return finishComputation(n, {}, mode);
    }
    return ANEURALNETWORKS_NO_ERROR;
}

// Attempt synchronous execution of the full model on CPU.
// TODO: How should we handle timing in this case?
// For Q this is irrelevant: We only support timing in conjunction
// with an explicit device list; and we do not support CPU fallback
// with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
        ExecutionBuilder* executionBuilder) {
    CHECK(executionBuilder != nullptr);
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
    VLOG(EXECUTION) << "cpuFallbackFull";

    // Get fallback executor.
    StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                          DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr,
                          /*reusable=*/false);
    executor.mapInputsAndOutputsTrivially();

    // Attempt fallback execution.
    return executor.computeOnCpuFallback();
}

// Attempt synchronous execution on CPU.
// TODO: How should we handle timing in this case?
// For Q this is irrelevant: We only support timing in conjunction
// with an explicit device list; and we do not support CPU fallback
// with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>>
cpuFallbackPartial(const ExecutionPlan& plan,
                   std::shared_ptr<ExecutionPlan::Controller> controller) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
    VLOG(EXECUTION) << "cpuFallbackPartial";

    // Get fallback executor.
    std::shared_ptr<StepExecutor> executor;
    int n1 = plan.fallback(controller, &executor, nullptr, nullptr);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, {}, nullptr};
    }
    CHECK(executor != nullptr);

    // Attempt fallback execution.
    auto [n2, outputShapes, timing] = executor->computeOnCpuFallback();
    return {n2, std::move(outputShapes), timing, executor};
}

std::tuple<int, std::vector<OutputShape>, Timing> SimpleExecutionBuilder::computeInternal(
        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeInternal");
    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeInternal";

    if (mExecutor == nullptr) {
        mExecutor = mPlan->makeStepExecutor(mReusable, this);
    }

    auto burstController = burstBuilder ? burstBuilder->getControllerAt(0) : nullptr;
    auto [n, outputShapes, timing] = mExecutor->compute(deadline, burstController);

    if (n == ANEURALNETWORKS_NO_ERROR) {
        return {n, std::move(outputShapes), timing};
    }

    // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE is not recoverable.
    if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
        return {n, std::move(outputShapes), {}};
    }

    // If CPU fallback is not allowed and there was an error, end execution.
    if (!mAllowCpuFallback) {
        return {n, {}, {}};
    }

    // If CPU execution was already attempted, do not perform CPU fallback.
    if (mExecutor->isCpu()) {
        return {n, {}, {}};
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the execution. Do an execution fallback on the CPU.
    return cpuFallbackFull(this);
}

std::tuple<int, std::vector<OutputShape>, Timing> CompoundExecutionBuilder::computeInternal(
        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeInternal");
    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeInternal (from plan, iteratively)";

    auto controller = mPlan->makeController(this, burstBuilder);
    std::vector<OutputShape> outputShapes = getInitialOutputShapes();

    // On this iteration, do I need to repeat the previous step because it
    // reported insufficient size?
    bool doInsufficientSizeFallback = false;

    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        SharedBurst burstController;
        int n = doInsufficientSizeFallback
                        ? mPlan->fallback(controller, &executor, &burstController, &outputShapes)
                        : mPlan->next(controller, &executor, &burstController, &outputShapes);
        doInsufficientSizeFallback = false;
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (mAllowCpuFallback && !missedDeadline) break;
            return {n, {}, {}};
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            return {ANEURALNETWORKS_NO_ERROR, outputShapes, {}};
        }
        const bool executorIsCpu = executor->isCpu();

        // Attempt to execute a single step of the execution.
        auto [stepN, stepOutputShapes, _] = executor->compute(deadline, burstController);

        // Update global outputs and dynamic temporaries.
        StepExecutor::UpdateOutputShapes updateOutputShapes = {};
        if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes,
                                          &updateOutputShapes)) {
            stepN = ANEURALNETWORKS_OP_FAILED;
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            if (updateOutputShapes.zeroSizedInput) {
                // We'll need to do full model CPU fallback
                VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput";
                stepN = ANEURALNETWORKS_OP_FAILED;
            } else {
                CHECK(executor->areDynamicTemporariesAllocated());
                continue;
            }
        }

        if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes);
            if (updateOutputShapes.mainOutputInsufficient ||
                !updateOutputShapes.updatedDynamicTemporary) {
                // Either:
                // - At least one main model output is not of sufficient size; or
                // - we didn't learn anything new about dynamic temporaries.
                // Neither of these is recoverable, so end execution.
                return {stepN, outputShapes, {}};
            }
            // Every main model output is of sufficient size. This implies that
            // at least one dynamic temporary is not of sufficient size. This
            // is recoverable.
            doInsufficientSizeFallback = true;
            continue;
        }

        // If CPU fallback is not allowed and there was an error, end execution.
        if (!mAllowCpuFallback) {
            return {stepN, {}, {}};
        }

        // If CPU execution was already attempted, perform a full CPU fallback.
        if (executorIsCpu) {
            break;
        }

        // If the code reaches this point, attempt a partial fallback to CPU.
        CHECK(mAllowCpuFallback);
        if (updateOutputShapes.zeroSizedInput) {
            // Do not attempt a partial fallback.
            break;
        }
        while (true) {
            auto [fallbackN, fallbackOutputShapes, _, fallbackExecutor] =
                    cpuFallbackPartial(*mPlan, controller);

            // Update global outputs and dynamic temporaries.
            StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
            if (fallbackExecutor != nullptr &&
                !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes,
                                                      &outputShapes, &fallbackUpdateOutputShapes)) {
                fallbackN = ANEURALNETWORKS_OP_FAILED;
            }

            // If execution was successful, continue to next step.
            if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
                if (fallbackUpdateOutputShapes.zeroSizedInput) {
                    // We'll need to do full model CPU fallback
                    VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput";
                    fallbackN = ANEURALNETWORKS_OP_FAILED;
                    break;
                }
                CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
                goto nextStep;
            }

            if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
                VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: "
                                << toString(fallbackUpdateOutputShapes);
                if (fallbackUpdateOutputShapes.mainOutputInsufficient ||
                    !fallbackUpdateOutputShapes.updatedDynamicTemporary) {
                    // Either:
                    // - At least one main model output is not of sufficient size; or
                    // - we didn't learn anything new about dynamic temporaries.
                    // Neither of these is recoverable, so end execution.
                    return {fallbackN, outputShapes, {}};
                }
                // Every main model output is of sufficient size. This implies
                // that at least one dynamic temporary is not of sufficient
                // size. This is recoverable.
                continue;
            }

            // If the code reaches this point, then there was an error with the
            // fallback. In this case, attempt full fallback.
            break;
        }

        // If the code reaches this point, then there was an error with the
        // fallback. In this case, attempt full fallback.
        break;

    nextStep:
        // Bottom of the outer loop
        continue;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    return cpuFallbackFull(this);
}
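
// The loop above, condensed (editorial sketch of the decision tree, not
// additional runtime logic). For each step:
//
//   run the step (or re-run the previous step after growing dynamic temporaries)
//   NO_ERROR, no zero-sized input                     -> advance to the next step
//   OUTPUT_INSUFFICIENT_SIZE, main output too small
//       or nothing new learned about temporaries     -> fail (not recoverable)
//   OUTPUT_INSUFFICIENT_SIZE, only dynamic
//       temporaries too small                         -> re-run the same step
//   other error, CPU fallback disallowed              -> fail
//   other error, CPU fallback allowed                 -> partial CPU fallback of
//                                                        the step; if that also
//                                                        fails, full CPU fallback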

static bool waitForSyncFences(const std::vector<int>& waitFor) {
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
                return false;
            }
        }
    }
    return true;
}

std::tuple<int, int, ExecuteFencedInfoCallback> SimpleExecutionBuilder::computeFencedInternal(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeFencedInternal");
    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeFencedInternal";

    if (mExecutor == nullptr) {
        mExecutor = mPlan->makeStepExecutor(mReusable, this);
    }

    auto [n, syncFd, callback] =
            mExecutor->computeFenced(waitFor, timeoutDurationAfterFence, deadline);

    if (n == ANEURALNETWORKS_NO_ERROR) {
        return {ANEURALNETWORKS_NO_ERROR, syncFd, callback};
    }

    // If CPU fallback is not allowed and there was an error, end execution.
    if (!mAllowCpuFallback) {
        return {n, -1, nullptr};
    }

    // If CPU execution was already attempted, return from the function with an error.
    if (mExecutor->isCpu()) {
        return {n, -1, nullptr};
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the execution. Instead, do a full execution fallback
    // on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    if (!waitForSyncFences(waitFor)) {
        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
    }
    auto [fallbackN, fallbackOutputShapes, fallbackTiming] = cpuFallbackFull(this);
    reportTimingWithoutFencedExecutionCallback(fallbackTiming);
    return {fallbackN, -1, nullptr};
}

// In the case of partitioned execution, computeFencedInternal returns the sync
// fence and the fenced compute callback returned from the last partition.
// Any failed partition will result in the whole execution falling back to CPU
// if mAllowCpuFallback is set to true.
std::tuple<int, int, ExecuteFencedInfoCallback> CompoundExecutionBuilder::computeFencedInternal(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeFencedInternal");
    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeFencedInternal (from plan, iteratively)";

    // We should have detected this earlier in the call chain and fallen back to
    // non-fenced execution. This is an implementation limitation: In order to
    // support dynamic temporaries in this code, we'd need to implement
    // something like the following:
    // - If a partition has outputs of unknown size, compute that partition in a
    //   non-fenced fashion, just as if it were scheduled on a driver that does
    //   not support fenced execution.
    // - Implement something similar to the code in CompoundExecutionBuilder::computeInternal()
    //   that handles a step execution that fails with
    //   ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
    CHECK(!mCompilation->hasDynamicTemporaries());

    // Initiate waitForFds, syncFence for the first step.
    std::vector<int> waitForFds = waitFor;
    base::unique_fd syncFence;
    ExecuteFencedInfoCallback executeFencedInfoCallback;

    std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        int n = mPlan->next(controller, &executor, nullptr, nullptr, syncFence.get());
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (mAllowCpuFallback && !missedDeadline) break;
            // Return -1 for the sync fence fd, and nullptr for the callback.
            return {n, -1, nullptr};
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            return {ANEURALNETWORKS_NO_ERROR, syncFence.release(), executeFencedInfoCallback};
        }

        // Attempt to compute a single step of the execution.
        auto [stepN, syncFd, callback] =
                executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline);

        // Update waitForFds, syncFence for the next step.
        syncFence.reset(syncFd);
        executeFencedInfoCallback = callback;
        waitForFds.clear();
        if (syncFd >= 0) {
            waitForFds = {syncFd};
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            continue;
        }
        // If CPU fallback is not allowed and there was an error, end execution.
        if (!mAllowCpuFallback) {
            return {stepN, -1, nullptr};
        }

        // If the code reaches this point, a step execution failed and CPU
        // fallback is allowed. Break out of the loop and attempt a full
        // CPU fallback.
        break;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    if (!waitForSyncFences(waitFor)) {
        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
    }
    auto [fullN, fullOutputShapes, _] = cpuFallbackFull(this);
    return {fullN, -1, nullptr};
}

int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
                                    uint64_t timeoutDurationAfterFence, int* syncFence) {
    CHECK(syncFence != nullptr);
    NN_RETURN_IF_ERROR(
            prepareForCompute("startComputeWithDependencies", ExecutionMode::ASYNC_WITH_DEPS));
    if (timeoutDurationAfterFence > 0) {
        if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
            LOG(ERROR)
                    << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero "
                       "duration on an ANeuralNetworksExecution "
                       "created from an ANeuralNetworksCompilation that was not created by "
                       "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
            return finishComputation(ANEURALNETWORKS_BAD_DATA, {}, ExecutionMode::ASYNC_WITH_DEPS);
        }
    }
    if (!areOutputsFullySpecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                      " not all outputs have fully specified dimensions";
        return finishComputation(ANEURALNETWORKS_BAD_DATA, {}, ExecutionMode::ASYNC_WITH_DEPS);
    }

    // Unlike ExecutionBuilder::compute, we do not need to reset output dimensions here because
    // fenced executions do not support dynamic output shape.

    mComputeStartTimePoint = Clock::now();
    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
    int result;
    const auto deadline = makeDeadline(mTimeoutDuration);
    std::tie(result, *syncFence, mFencedExecutionCallback) =
            computeFencedInternal(waitFor, timeoutDurationAfterFence, deadline);
    // If there is an error, call finishComputation to mark the computation as completed.
    // Otherwise, we will call finishComputation in SyncFenceEvent::wait().
    if (result != ANEURALNETWORKS_NO_ERROR) {
        // TODO(miaowang): support dynamic output shape only with memory domain.
        // For now just return empty output shapes.
        result = finishComputation(result, {}, ExecutionMode::ASYNC_WITH_DEPS);
    }
    return result;
}
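
// A minimal client-side sketch of the fenced path above (editorial, with
// error handling omitted; `inFence` is a sync fence fd owned by the caller):
//
//   ANeuralNetworksEvent* inEvent = nullptr;
//   ANeuralNetworksEvent_createFromSyncFenceFd(inFence, &inEvent);
//   const ANeuralNetworksEvent* deps[] = {inEvent};
//   ANeuralNetworksEvent* outEvent = nullptr;
//   ANeuralNetworksExecution_startComputeWithDependencies(
//           execution, deps, 1, /*duration=*/0, &outEvent);
//   ANeuralNetworksEvent_wait(outEvent);  // completion also finishes the computation state
//   ANeuralNetworksEvent_free(outEvent);
//   ANeuralNetworksEvent_free(inEvent);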

int ExecutionBuilder::compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                              BurstBuilder* burstBuilder) {
    CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr)
            << "synchronizationCallback and burstBuilder cannot simultaneously be used";

    const bool synchronous = (synchronizationCallback == nullptr);
    if (!synchronous) {
        *synchronizationCallback = nullptr;
    }

    const char* name = burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute";
    const ExecutionMode mode = burstBuilder
                                       ? ExecutionMode::BURST
                                       : synchronous ? ExecutionMode::SYNC : ExecutionMode::ASYNC;
    NN_RETURN_IF_ERROR(prepareForCompute(name, mode));

    // Validate input memory dimensions. We need to do the validation in every computation because
    // the memory dimensions may change between computations.
    for (auto& p : mInputs) {
        if (p.state() == ModelArgumentInfo::MEMORY) {
            const RuntimeMemory* memory = mMemories[p.locationAndLength().poolIndex];
            if (!memory->getValidator().validateInputDimensions(p.dimensions())) {
                return finishComputation(ANEURALNETWORKS_OP_FAILED, {}, mode);
            }
        }
    }

    // Reset output dimensions.
    if (!areOutputsFullySpecified()) {
        for (auto& output : mOutputs) {
            output.reset();
        }
    }

    const auto deadline = makeDeadline(mTimeoutDuration);
    mComputeStartTimePoint = Clock::now();
    if (synchronous) {
        if (burstBuilder) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)";
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
        }
        const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder);
        if (mMeasureTiming) {
            mTimingWithoutFencedExecutionCallback = timing;
        }
        return finishComputation(n, outputShapes, mode);
    } else /* asynchronous */ {
        // TODO: For asynchronous execution, entire plan-based-path should run in an
        // asynchronous thread -- take the asynchronous thread logic out of
        // CpuExecution::compute() and use it to wrap the plan-based-path.

        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        // of spinning up a new thread.

        // Prepare the callback for asynchronous execution.
        // A std::shared_ptr<ExecutionCallback> object is returned when the
        // execution has been successfully launched, otherwise a
        // nullptr is returned. The executionCallback is
        // abstracted in the NN API as an "event".
        auto executionCallback = std::make_shared<ExecutionCallback>();
        executionCallback->setOnFinish(
                [this, mode](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
                    return finishComputation(error, outputShapes, mode);
                });
        const auto asyncStartCompute = [this, deadline, executionCallback] {
            const auto [n, outputShapes, timing] = computeInternal(deadline, nullptr);
            const auto status = convertResultCodeToErrorStatus(n);
            executionCallback->notify(status, outputShapes, timing);
        };
        if (DeviceManager::get()->syncExecRuntime()) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
            asyncStartCompute();
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
            std::thread asyncExecution(asyncStartCompute);
            executionCallback->bindThread(std::move(asyncExecution));
        }
        *synchronizationCallback = executionCallback;
        return ANEURALNETWORKS_NO_ERROR;
    }
}
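
// The three public entry points that funnel into the method above, from the
// client's perspective (editorial sketch; error handling omitted):
//
//   ANeuralNetworksExecution_compute(execution);               // synchronous
//
//   ANeuralNetworksEvent* event = nullptr;                     // asynchronous
//   ANeuralNetworksExecution_startCompute(execution, &event);
//   ANeuralNetworksEvent_wait(event);
//   ANeuralNetworksEvent_free(event);
//
//   ANeuralNetworksExecution_burstCompute(execution, burst);   // burst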

std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
    std::vector<OutputShape> outputShapes(mOutputs.size());
    std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(),
                   [](const auto& x) -> OutputShape {
                       std::vector<uint32_t> dimensions;
                       if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) {
                           dimensions = x.dimensions();
                       }
                       return {.dimensions = std::move(dimensions), .isSufficient = true};
                   });
    return outputShapes;
}

// Check whether the dimensions "to" can be updated to the dimensions "from",
// where "from" must be at least as fully specified as "to".
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
    if (to.size() == 0) return true;
    NN_RET_CHECK_EQ(to.size(), from.size());
    for (uint32_t i = 0; i < to.size(); i++) {
        NN_RET_CHECK(to[i] == from[i] || to[i] == 0);
    }
    return true;
}
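
// Worked examples of the partial ordering checked above (editorial, not part
// of the runtime):
//
//   isUpdatable({},     {3, 4}) == true   // unranked may gain a rank
//   isUpdatable({0, 4}, {3, 4}) == true   // unspecified dimension filled in
//   isUpdatable({3, 4}, {3, 4}) == true   // no change
//   isUpdatable({3, 4}, {3, 5}) == false  // fully specified dimension changed
//   isUpdatable({3},    {3, 4}) == false  // rank mismatch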
|
|
|
|
static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) {
|
|
return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient &&
|
|
outputShape.dimensions.size() &&
|
|
(std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) !=
|
|
outputShape.dimensions.end());
|
|
}
|
|
|
|
bool ExecutionBuilder::updateOutputShapes(ErrorStatus status,
|
|
const std::vector<OutputShape>& outputShapes) {
|
|
NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes));
|
|
|
|
if (outputShapes.size() == 0) {
|
|
return true;
|
|
}
|
|
NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size());
|
|
for (uint32_t i = 0; i < outputShapes.size(); i++) {
|
|
// Check if only unspecified dimensions or rank are overwritten.
|
|
NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions));
|
|
const OperandType operandType = mModel->getOutputOperand(i).type;
|
|
NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType,
|
|
outputShapes[i].dimensions));
|
|
}
|
|
for (uint32_t i = 0; i < outputShapes.size(); i++) {
|
|
mOutputs[i].dimensions() = outputShapes[i].dimensions;
|
|
mOutputs[i].isSufficient() = outputShapes[i].isSufficient;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool ExecutionBuilder::updateMemories() {
|
|
for (const auto& output : mOutputs) {
|
|
if (output.state() != ModelArgumentInfo::MEMORY) continue;
|
|
const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex];
|
|
NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()}));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
int ExecutionBuilder::finishComputation(int result, const std::vector<OutputShape>& outputShapes,
|
|
ExecutionMode mode) {
|
|
const auto status = convertResultCodeToErrorStatus(result);
|
|
if (!updateOutputShapes(status, outputShapes) || !updateMemories()) {
|
|
result = ANEURALNETWORKS_OP_FAILED;
|
|
}
|
|
bool success = result == ANEURALNETWORKS_NO_ERROR;
|
|
for (const auto& output : mOutputs) {
|
|
if (output.state() != ModelArgumentInfo::MEMORY) continue;
|
|
const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex];
|
|
memory->getValidator().setInitialized(success);
|
|
}
|
|
switch (result) {
|
|
case ANEURALNETWORKS_NO_ERROR:
|
|
mCompletion = Completion::NO_ERROR;
|
|
break;
|
|
case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE:
|
|
mCompletion = Completion::OUTPUT_INSUFFICIENT_SIZE;
|
|
break;
|
|
default:
|
|
mCompletion = Completion::OTHER_ERROR;
|
|
break;
|
|
}
|
|
{
|
|
std::lock_guard<std::mutex> lock(mStateMutex);
|
|
CHECK(mState != State::PREPARATION)
|
|
<< "ExecutionBuilder::finishComputation is called in the preparation state";
|
|
CHECK(mState != State::COMPLETED) << "ExecutionBuilder::finishComputation is called twice";
|
|
mState = State::COMPLETED;
|
|
}
|
|
telemetry::onExecutionFinish(this, mode, result);
|
|
return result;
|
|
}
|
|
|
|
std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) {
|
|
return "{ .updatedDynamicTemporary = " +
|
|
std::to_string(updateOutputShapes.updatedDynamicTemporary) +
|
|
", .mainOutputInsufficient = " +
|
|
std::to_string(updateOutputShapes.mainOutputInsufficient) + "}";
|
|
}
|
|
|
|
bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
|
|
std::vector<OutputShape>* to, UpdateOutputShapes* update) {
|
|
CHECK(update != nullptr);
|
|
*update = {.updatedDynamicTemporary = false,
|
|
.mainOutputInsufficient = false,
|
|
.zeroSizedInput = false};
|
|
|
|
NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from));
|
|
|
|
if (from.size() == 0) {
|
|
return true;
|
|
}
|
|
|
|
if (VLOG_IS_ON(EXECUTION)) {
|
|
for (const auto& shape : from) {
|
|
VLOG(EXECUTION) << "updateOutputShapes: " << shape;
|
|
}
|
|
}
|
|
|
|
if (mExecutionStep != nullptr) {
|
|
const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
|
|
NN_RET_CHECK_LE(indexMapping.size(), from.size());
|
|
for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
|
|
const uint32_t toIndex = indexMapping[i];
|
|
NN_RET_CHECK_GT(to->size(), toIndex);
|
|
NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
|
|
(*to)[toIndex] = from[i];
|
|
update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient;
|
|
if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) &&
|
|
isZeroSizedTensor(executionResultCode, from[i])) {
|
|
update->zeroSizedInput = true;
|
|
}
|
|
}
|
|
|
|
if (!mDynamicTemporaries->empty()) {
|
|
// TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep?
|
|
std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp;
|
|
for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) {
|
|
operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first);
|
|
}
|
|
|
|
const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex();
|
|
for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) {
|
|
const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i);
|
|
const auto it =
|
|
operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex);
|
|
if (it == operandIndexStepModelOutputToSourceModelTemp.end()) {
|
|
continue;
|
|
}
|
|
const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second);
|
|
VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i
|
|
<< " sourceOperandIndex = (" << sourceOperandIndex.first << ", "
|
|
<< sourceOperandIndex.second << ") is a dynamic temporary";
|
|
// This is a temporary, but it might not be a dynamic temporary.
|
|
const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false);
|
|
if (loc == std::nullopt) {
|
|
continue;
|
|
}
|
|
NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions));
|
|
bool changedShape = false;
|
|
const uint32_t actualSize = TypeManager::get()->getSizeOfData(
|
|
mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions);
|
|
if (actualSize > 0) {
|
|
changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex,
|
|
from[i].dimensions, actualSize);
|
|
} else if (!from[i].isSufficient) {
|
|
NN_RET_CHECK(loc->paddedLength < UINT32_MAX / 2)
|
|
<< "output#" << i << " paddedLength overflow";
|
|
changedShape = mDynamicTemporaries->redeclare(
|
|
sourceOperandIndex, from[i].dimensions, 2 * loc->paddedLength);
|
|
} else {
|
|
// The combination of not-fully-specified dimensions
|
|
// and isSufficient means that we have no
|
|
// information about whether the size of the dynamic
|
|
// temporary is adequate.
|
|
VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i;
|
|
if (executionResultCode == ANEURALNETWORKS_NO_ERROR) {
|
|
NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i]));
|
|
// This is a zero-sized tensor, and by
|
|
// definition, any dynamic temporary is an input
|
|
// to an execution step.
|
|
update->zeroSizedInput = true;
|
|
}
|
|
}
|
|
if (changedShape) {
|
|
// TODO: find a better place for this comment.
|
|
//
|
|
// isUpdatable(a, b) imposes a partial ordering a <=
|
|
// b. Every fully specified dimensions vector is an
|
|
// upper bound of that ordering. Therefore, any
|
|
// change in dimensions moves towards an upper
|
|
// bound, and hence there are a finite number of
|
|
// such changes possible.
|
|
//
|
|
// actualSize can only be computed from dimensions
|
|
// that are an upper bound. Therefore, once
|
|
// actualSize is computed, it will not change.
|
|
//
|
|
// If dimensions are not fully specified, and
|
|
// estimated size changes, it increases. There is
|
|
// an upper bound on estimated size to avoid
|
|
// overflow.
|
|
//
|
|
// Therefore, if we retry only when dimensions or
|
|
// size chage, and we stop retrying if we would
|
|
// otherwise overflow, we should only retry a finite
|
|
// number of times.
|
|
update->updatedDynamicTemporary = true;
|
|
}
|
|
}
|
|
mDynamicTemporaries->vlogDump("finished updateOutputShapes");
|
|
}
|
|
} else {
|
|
NN_RET_CHECK_EQ(from.size(), to->size());
|
|
for (uint32_t i = 0, e = from.size(); i < e; i++) {
|
|
NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions));
|
|
(*to)[i] = from[i];
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
|
|
std::shared_ptr<Device> device,
|
|
std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
|
|
const ExecutionStep* step, DynamicTemporaries* dynamicTemporaries)
|
|
: mExecutionBuilder(executionBuilder),
|
|
mExecutionStep(step),
|
|
mDynamicTemporaries(dynamicTemporaries),
|
|
mModel(model),
|
|
mDevice(device),
|
|
mPreparedModel(preparedModel),
|
|
mInputs(model->inputCount()),
|
|
mOutputs(model->outputCount()),
|
|
mReusable(reusable) {
|
|
CHECK(mDevice != nullptr);
|
|
CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr);
|
|
CHECK(!(reusable && dynamicTemporaries != nullptr));
|
|
VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
|
|
<< mOutputs.size() << " outputs";
|
|
}
|
|
|
|
bool StepExecutor::areDynamicTemporariesAllocated() const {
|
|
return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex());
|
|
}
|
|
|
|
void StepExecutor::mapInputsAndOutputsTrivially() {
|
|
mInputs = mExecutionBuilder->mInputs;
|
|
mOutputs = mExecutionBuilder->mOutputs;
|
|
mMemories = mExecutionBuilder->mMemories;
|
|
}
|
|
|
|
void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                                    ModelArgumentInfo* executorInputOrOutput,
                                    const Dimensions* builderDimensions) {
    auto updateDimensions = [executorInputOrOutput, builderDimensions] {
        if (!builderDimensions) {
            return;
        }
        executorInputOrOutput->dimensions() = *builderDimensions;
    };

    *executorInputOrOutput = builderInputOrOutput;
    switch (executorInputOrOutput->state()) {
        default:
            CHECK(false) << "unexpected ModelArgumentInfo::state";
            break;
        case ModelArgumentInfo::HAS_NO_VALUE:
        case ModelArgumentInfo::UNSPECIFIED:
            break;
        case ModelArgumentInfo::POINTER:
            updateDimensions();
            break;
        case ModelArgumentInfo::MEMORY: {
            updateDimensions();
            const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
            const RuntimeMemory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
            const uint32_t executorPoolIndex = mMemories.add(memory);
            executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex;
            break;
        }
    }
}

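// Binds a step-model input or output to the region [offset, offset + length) of 'memory'. If
// 'dimensions' is non-empty, it must refine the operand's declared dimensions (per isUpdatable)
// and then replaces them.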
int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
                                             const RuntimeMemory* memory, uint32_t offset,
                                             uint32_t length, const Dimensions& dimensions,
                                             ModelArgumentInfo* inputOrOutputInfo) {
    // Should be similar to
    //     ExecutionBuilder::setInputFromMemory()
    //     ExecutionBuilder::setOutputFromMemory()

    uint32_t poolIndex = mMemories.add(memory);
    CHECK(inputOrOutputInfo->unspecified());
    int n;
    std::tie(n, *inputOrOutputInfo) =
            ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
                                                /*type=*/nullptr, poolIndex, offset, length);
    if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) {
        CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions));
        inputOrOutputInfo->dimensions() = dimensions;
    }
    return n;
}

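// Renders a dimensions vector for logging; for example, {2, 3, 4} becomes "(2, 3, 4)" and an
// empty vector becomes "()".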
static std::string toString(const std::vector<uint32_t>& dimensions) {
    std::string ret = "(";
    bool wroteOne = false;
    for (uint32_t dimension : dimensions) {
        if (wroteOne) {
            ret += ", ";
        } else {
            wroteOne = true;
        }
        ret += std::to_string(dimension);
    }
    ret += ")";
    return ret;
}

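// Dumps each argument's state and location at EXECUTION verbosity; 'kind' labels the list
// (the callers below pass "input" or "output").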
static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
    for (unsigned i = 0; i < args.size(); i++) {
        const auto& arg = args[i];
        std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
        switch (arg.state()) {
            case ModelArgumentInfo::POINTER:
                VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim"
                                << toString(arg.dimensions());
                break;
            case ModelArgumentInfo::MEMORY:
                VLOG(EXECUTION) << prefix << "MEMORY("
                                << "pool=" << arg.locationAndLength().poolIndex << ", "
                                << "off=" << arg.locationAndLength().offset << ") dim"
                                << toString(arg.dimensions());
                break;
            case ModelArgumentInfo::HAS_NO_VALUE:
                VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
                break;
            case ModelArgumentInfo::UNSPECIFIED:
                VLOG(EXECUTION) << prefix << "UNSPECIFIED";
                break;
            default:
                VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")";
                break;
        }
    }
}

bool StepExecutor::isCpu() const {
    return mDevice == DeviceManager::getCpuDevice();
}

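// Creates the reusable RuntimeExecution on first call and caches it in mExecution, so repeated
// compute()/computeFenced() calls on a reusable StepExecutor share one execution object.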
std::pair<int, std::shared_ptr<RuntimeExecution>> StepExecutor::getReusableExecution() {
    CHECK(mReusable);
    if (mExecution == nullptr) {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        auto [n, execution] = mPreparedModel->createReusableExecution(
                mInputs, mOutputs, mMemories.getObjects(), measure, loopTimeoutDuration,
                mExecutionBuilder->getMetadata());
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return {n, nullptr};
        }
        mExecution = std::move(execution);
    }
    return {ANEURALNETWORKS_NO_ERROR, mExecution};
}

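// Synchronous computation of this step. A reusable executor goes through its cached
// RuntimeExecution; otherwise a one-shot execution is issued on the prepared model, optionally
// via 'burstController'. Measured timing is reported to the ExecutionBuilder either way.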
std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
        const OptionalTimePoint& deadline, const SharedBurst& burstController) {
    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    int n;
    std::vector<OutputShape> outputShapes;
    Timing timing;
    if (mReusable) {
        auto [nCreate, execution] = getReusableExecution();
        if (nCreate != ANEURALNETWORKS_NO_ERROR) {
            return {nCreate, {}, {}};
        }
        std::tie(n, outputShapes, timing) = execution->compute(burstController, deadline);
    } else {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        std::tie(n, outputShapes, timing) = mPreparedModel->execute(
                mInputs, mOutputs, mMemories.getObjects(), burstController, measure, deadline,
                loopTimeoutDuration, mExecutionBuilder->getMetadata());
    }
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    return {n, std::move(outputShapes), std::move(timing)};
}

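// Fenced computation of this step: waits on the 'waitFor' sync fences and, on success, may
// return a sync fence fd plus a callback for retrieving fenced execution info. Timing is
// reported here only when neither a fence nor a callback is produced, presumably because the
// fenced-execution callback supplies it otherwise.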
std::tuple<int, int, ExecuteFencedInfoCallback> StepExecutor::computeFenced(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    OptionalDuration optionalTimeoutDurationAfterFence;
    if (timeoutDurationAfterFence > 0) {
        optionalTimeoutDurationAfterFence = makeTimeoutDuration(timeoutDurationAfterFence);
    }

    int n;
    int syncFenceFd;
    ExecuteFencedInfoCallback executeFencedInfoCallback;
    Timing timing;
    if (mReusable) {
        auto [nCreate, execution] = getReusableExecution();
        if (nCreate != ANEURALNETWORKS_NO_ERROR) {
            return {nCreate, -1, nullptr};
        }
        std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) =
                execution->computeFenced(waitFor, deadline, optionalTimeoutDurationAfterFence);
    } else {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) =
                mPreparedModel->executeFenced(mInputs, mOutputs, mMemories.getObjects(), waitFor,
                                              measure, deadline, loopTimeoutDuration,
                                              optionalTimeoutDurationAfterFence,
                                              mExecutionBuilder->getMetadata());
    }
    if (syncFenceFd < 0 && executeFencedInfoCallback == nullptr) {
        mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    }
    return {n, syncFenceFd, executeFencedInfoCallback};
}

// For cpuFallback{Partial,Full}, recompile the model on CPU and then start compute.
std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback");
    VLOG(EXECUTION) << "Re-compile the model on CPU";
    const ModelFactory makeModel = [this] { return mModel->makeModel(); };
    // TODO: Propagate user preference and compilation priority to this point instead of using
    // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and
    // ANEURALNETWORKS_PRIORITY_MEDIUM
    const ExecutionPreference preference =
            static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
    const Priority priority = convertToCanonicalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
    auto [n, preparedModel] = DeviceManager::getCpuDevice()->prepareModel(
            makeModel, preference, priority, {}, {}, {}, {}, {});
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, {}, {}};
    }

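    // Driver-managed memories (those backed by an IBuffer) cannot be handed to the CPU device
    // directly, so the code below stages them through BLOB-mode AHardwareBuffers: inputs are
    // copied in before execution and outputs are copied back afterwards.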
    // Prepare device memories for CPU fallback.
    std::vector<const RuntimeMemory*> memories = mMemories.getObjects();
    std::vector<bool> isUsedAsInput(memories.size(), false);
    std::vector<bool> isUsedAsOutput(memories.size(), false);
    std::vector<std::unique_ptr<RuntimeMemory>> blobAhwbs;

    // Mark the input and output usages.
    for (auto& input : mInputs) {
        if (input.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = input.locationAndLength().poolIndex;
            isUsedAsInput[poolIndex] = true;
        }
    }
    for (auto& output : mOutputs) {
        if (output.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = output.locationAndLength().poolIndex;
            // Cannot allocate output buffers with unknown shapes.
            if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
                LOG(ERROR) << "Cannot fallback to CPU because at least one of the output "
                              "operands has unknown shape.";
                return {ANEURALNETWORKS_OP_FAILED, {}, {}};
            }
            isUsedAsOutput[poolIndex] = true;
        }
    }

    // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr) {
            const uint32_t size = memory->getValidator().getMetadata().logicalSize;
            auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
            if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
                return {nAhwb, {}, {}};
            }
            if (isUsedAsInput[i]) {
                n = copyIBufferToMemory(memory->getIBuffer(), blobAhwb->getMemory());
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    return {n, {}, {}};
                }
            }
            memories[i] = blobAhwb.get();
            blobAhwbs.push_back(std::move(blobAhwb));
        }
    }

    const MeasureTiming measure = measureTiming(mExecutionBuilder);
    const OptionalDuration loopTimeoutDuration =
            makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
    auto [nExecute, outputShapes, timing] = preparedModel->execute(
            mInputs, mOutputs, memories, nullptr, measure, {}, loopTimeoutDuration, {});
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    if (nExecute != ANEURALNETWORKS_NO_ERROR) {
        return {nExecute, std::move(outputShapes), timing};
    }

    // Write back to output device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
            n = copyMemoryToIBuffer(memories[i]->getMemory(), memory->getIBuffer(), {});
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return {n, {}, {}};
            }
        }
    }
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

}  // namespace nn
}  // namespace android