/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <MetaModel.h>
#include <Tracing.h>
#include <android-base/properties.h>
#include <nnapi/IBurst.h>
#include <nnapi/IDevice.h>
#include <nnapi/IExecution.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/SharedMemory.h>
#include <nnapi/TypeUtils.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <algorithm>
#include <functional>
#include <iterator>
#include <map>
#include <memory>
#include <regex>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "ServerFlag.h"
#include "TypeManager.h"

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
#include <build/version.h>
#include <cutils/native_handle.h>
#include <nnapi/hal/1.3/Buffer.h>
#include <nnapi/hal/Service.h>
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD

#ifdef NN_EXPERIMENTAL_FEATURE
#include "NeuralNetworksExperimentalFeatures.h"
#endif  // NN_EXPERIMENTAL_FEATURE

namespace android {
namespace nn {
namespace {

Version getRuntimeFeatureLevelVersionHelper() {
#if defined(NN_EXPERIMENTAL_FEATURE) && defined(NN_COMPATIBILITY_LIBRARY_BUILD)
#error "NN_EXPERIMENTAL_FEATURE is not supported when NN_COMPATIBILITY_LIBRARY_BUILD is defined"
#elif defined(NN_EXPERIMENTAL_FEATURE)
    auto version = kVersionFeatureLevelExperimental;
    // Enable "runtimeOnlyFeatures" to indicate that the runtime feature level version supports
    // features that are only available in the runtime.
    version.runtimeOnlyFeatures = true;
#elif defined(NN_COMPATIBILITY_LIBRARY_BUILD)
    auto version = serverFeatureLevelToVersion(kMaxFeatureLevelNum);
#else   // !defined(NN_COMPATIBILITY_LIBRARY_BUILD) && !defined(NN_EXPERIMENTAL_FEATURE)
    auto version = serverFeatureLevelToVersion(getServerFeatureLevelFlag());
    // Enable "runtimeOnlyFeatures" to indicate that the runtime feature level version supports
    // features that are only available in the runtime.
    version.runtimeOnlyFeatures = true;
#endif  // !defined(NN_COMPATIBILITY_LIBRARY_BUILD) && !defined(NN_EXPERIMENTAL_FEATURE)
    return version;
}

Version getRuntimeFeatureLevelVersion() {
    static const Version version = getRuntimeFeatureLevelVersionHelper();
    return version;
}

bool getWhetherPlatformTelemetryIsEnabled() {
#if !defined(NN_COMPATIBILITY_LIBRARY_BUILD) && !defined(NN_EXPERIMENTAL_FEATURE)
    return getServerTelemetryEnableFlag();
#else   // !defined(NN_COMPATIBILITY_LIBRARY_BUILD) && !defined(NN_EXPERIMENTAL_FEATURE)
    return false;
#endif  // !defined(NN_COMPATIBILITY_LIBRARY_BUILD) && !defined(NN_EXPERIMENTAL_FEATURE)
}

}  // namespace

// A Device with an actual underlying driver.
class DriverDevice : public Device {
   public:
    // Create a DriverDevice from a SharedDevice handle to the underlying driver.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(SharedDevice device);

    // Prefer using DriverDevice::create
    explicit DriverDevice(SharedDevice device);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    Version getFeatureLevel() const override { return kInterface->getFeatureLevel(); }
    int32_t getType() const override { return static_cast<int32_t>(kInterface->getType()); }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    const Capabilities& getCapabilities() const override { return kInterface->getCapabilities(); }
    Capabilities::PerformanceInfo getPerformance(OperandType type) const override {
        return getCapabilities().operandPerformance.lookup(type);
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return getCapabilities().relaxedFloat32toFloat16PerformanceScalar;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return getCapabilities().relaxedFloat32toFloat16PerformanceTensor;
    }
    Capabilities::PerformanceInfo getIfPerformance() const override {
        return getCapabilities().ifPerformance;
    }
    Capabilities::PerformanceInfo getWhilePerformance() const override {
        return getCapabilities().whilePerformance;
    }
    std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
        return kInterface->getNumberOfCacheFilesNeeded();
    }
    bool isCachingSupported() const override {
        // Caching is supported if either of numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] = getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override {
        auto result = kInterface->wait();
        if (!result.ok()) {
            LOG(ERROR) << "DriverDevice::wait error: " << result.error().message;
            return convertErrorStatusToResultCode(result.error().code);
        }
        return ANEURALNETWORKS_NO_ERROR;
    }

    std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const std::optional<CacheToken>& maybeToken,
            const std::vector<TokenValuePair>& metaData,
            const std::vector<ExtensionNameAndPrefix>& extensionNameAndPrefix) const override;

    std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
                                                            OperandType) const override;

   private:
    const SharedDevice kInterface;

    GeneralResult<std::vector<bool>> getSupportedOperationsImpl(const MetaModel& metaModel) const;
    GeneralResult<SharedPreparedModel> prepareModelFromCacheInternal(
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const CacheToken& token) const;

#ifdef NN_DEBUGGABLE
    // For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
    // 0 - all operations reported by IDevice::getSupportedOperations() supported
    // 1 - some operations reported by IDevice::getSupportedOperations() supported
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};

// A RuntimePreparedModel with an underlying IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public RuntimePreparedModel {
   public:
    DriverPreparedModel(const Device* device, const SharedPreparedModel& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    SharedPreparedModel getInterface() const override { return mPreparedModel; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const std::vector<TokenValuePair>& metaData) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const OptionalDuration& timeoutDurationAfterFence,
            const std::vector<TokenValuePair>& metaData) const override;

    std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
            const OptionalDuration& loopTimeoutDuration,
            const std::vector<TokenValuePair>& metaData) const override;

    GeneralResult<SharedBurst> configureExecutionBurst() const override {
        return mPreparedModel->configureExecutionBurst();
    }

    MemoryPreference getMemoryPreference() const override {
        if (isCompliantVersion(kVersionFeatureLevel5, mDevice->getFeatureLevel())) {
            return {kDefaultRequestMemoryAlignment, kDefaultRequestMemoryPadding};
        } else {
            // We are not able to pass memory padding information to HIDL drivers, so return the
            // minimum padding.
            return {kDefaultRequestMemoryAlignment, kMinMemoryPadding};
        }
    }

   private:
    const Device* mDevice;
    const SharedPreparedModel mPreparedModel;
};

class DriverExecution : public RuntimeExecution {
   public:
    DriverExecution(SharedExecution execution, Request request,
                    std::vector<const RuntimeMemory*> memories, MeasureTiming measure,
                    OptionalDuration loopTimeoutDuration, Version deviceFeatureLevel,
                    const std::vector<TokenValuePair>& metaData)
        : kExecution(std::move(execution)),
          kRequest(std::move(request)),
          kMemories(std::move(memories)),
          kMeasure(measure),
          kLoopTimeoutDuration(std::move(loopTimeoutDuration)),
          kDeviceFeatureLevel(deviceFeatureLevel),
          kMetaData(metaData) {
        CHECK(kExecution != nullptr);
    }

    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
            const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
            const OptionalDuration& timeoutDurationAfterFence) const override;

   private:
    const SharedExecution kExecution;

    // For burst execution.
    const Request kRequest;
    const std::vector<const RuntimeMemory*> kMemories;
    const MeasureTiming kMeasure;
    const OptionalDuration kLoopTimeoutDuration;
    mutable std::map<const IBurst*, SharedExecution> mCachedBurstExecutions;

    // For fenced execution.
    const Version kDeviceFeatureLevel;

    // Execution metadata.
    std::vector<TokenValuePair> kMetaData;
};

DriverDevice::DriverDevice(SharedDevice device) : kInterface(std::move(device)) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(SharedDevice device) {
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create called with nullptr";
        return nullptr;
    }

    return std::make_shared<DriverDevice>(std::move(device));
}

int64_t DeviceManager::versionToFeatureLevel(Version::Level versionLevel) {
    switch (versionLevel) {
        case Version::Level::FEATURE_LEVEL_1:
            return ANEURALNETWORKS_FEATURE_LEVEL_1;
        case Version::Level::FEATURE_LEVEL_2:
            return ANEURALNETWORKS_FEATURE_LEVEL_2;
        case Version::Level::FEATURE_LEVEL_3:
            return ANEURALNETWORKS_FEATURE_LEVEL_3;
        case Version::Level::FEATURE_LEVEL_4:
            return ANEURALNETWORKS_FEATURE_LEVEL_4;
        case Version::Level::FEATURE_LEVEL_5:
            return ANEURALNETWORKS_FEATURE_LEVEL_5;
        case Version::Level::FEATURE_LEVEL_6:
            return ANEURALNETWORKS_FEATURE_LEVEL_6;
        case Version::Level::FEATURE_LEVEL_7:
            return ANEURALNETWORKS_FEATURE_LEVEL_7;
        case Version::Level::FEATURE_LEVEL_8:
            return ANEURALNETWORKS_FEATURE_LEVEL_8;
#ifdef NN_EXPERIMENTAL_FEATURE
        case Version::Level::FEATURE_LEVEL_EXPERIMENTAL:
            return ANEURALNETWORKS_FEATURE_LEVEL_EXPERIMENTAL;
#endif  // NN_EXPERIMENTAL_FEATURE
    }
    LOG(FATAL) << "Unrecognized version " << versionLevel;
    return -1;
}

GeneralResult<std::vector<bool>> DriverDevice::getSupportedOperationsImpl(
        const MetaModel& metaModel) const {
    const auto featureLevel = kInterface->getFeatureLevel();
    const auto slice = metaModel.getSlice(featureLevel);
    if (!slice.has_value()) {
        return NN_ERROR() << "getSlice(" << featureLevel << ") failed";
    }

    const auto& [sliceModel, slicedModelOperationIndexToModelOperationIndex] = *slice;
    const std::vector<bool> supported = NN_TRY(kInterface->getSupportedOperations(sliceModel));
    const uint32_t slicedOperationCount = sliceModel.main.operations.size();
    if (supported.size() != slicedOperationCount) {
        return NN_ERROR() << "IDevice::getSupportedOperations returned a vector of length "
                          << supported.size() << " when expecting " << slicedOperationCount;
    }

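    // Map the driver's results, which are indexed by sliced-model operation index, back onto the
    // original model's operation indices. Operations that were not part of the slice stay false.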
    const Model& model = metaModel.getModel();
    const uint32_t operationCount = model.main.operations.size();
    std::vector<bool> remappedSupported(operationCount, false);
    for (size_t i = 0; i < supported.size(); ++i) {
        if (supported[i]) {
            remappedSupported[slicedModelOperationIndexToModelOperationIndex(i)] = true;
        }
    }
    return remappedSupported;
}

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& model = metaModel.getModel();

    auto result = getSupportedOperationsImpl(metaModel);
    if (!result.ok()) {
        LOG(ERROR) << "getSupportedOperations failed with code " << result.error().code << ": "
                   << result.error().message;
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(model.main.operations.size(), false);
    }

    std::vector<bool>& supportedOperations = result.value();
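    // Debug-only behavior: when the "debug.nn.sample.supported" property is set to 1 for a sample
    // driver, deterministically drop some of the operations the driver claims to support (based on
    // a hash of the operation type and its operands) to exercise model partitioning.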
#ifdef NN_DEBUGGABLE
    if (mSupported != 1) {
        return supportedOperations;
    }

    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size(); operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = model.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&model, &accumulator](const std::vector<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = model.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (const Dimension& dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY ||
                        operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE ||
                        operand.lifetime == Operand::LifeTime::POINTER) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE

    return supportedOperations;
}

// Opens a cache file for reading and writing and returns a shared handle.
static GeneralResult<SharedHandle> createCacheHandle(const std::string& filename,
                                                     bool createIfNotExist) {
    auto fd = base::unique_fd(open(filename.c_str(), createIfNotExist ? (O_RDWR | O_CREAT) : O_RDWR,
                                   S_IRUSR | S_IWUSR));
    if (!fd.ok()) {
        return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
               << "Failed to " << (createIfNotExist ? "open or create" : "open") << " cache file "
               << filename;
    }
    return std::make_shared<const Handle>(std::move(fd));
}

// Opens a list of cache files and returns a vector of shared handles. The files
// are always opened with both read and write permissions.
static GeneralResult<std::vector<SharedHandle>> createCacheHandleVec(
        uint32_t numCacheFiles, const std::string& baseFilename, bool createIfNotExist) {
    CHECK(numCacheFiles <= kMaxNumberOfCacheFiles);
    std::vector<SharedHandle> handles;
    handles.reserve(numCacheFiles);
    for (uint32_t i = 0; i < numCacheFiles; i++) {
        std::string filename = baseFilename + std::to_string(i);
        VLOG(COMPILATION) << "Cache " << i << ": " << filename;
        handles.push_back(NN_TRY(createCacheHandle(filename, createIfNotExist)));
    }
    return handles;
}

// Maps a token to cache file names and returns a pair of vectors of shared
// handles to the opened files.
static GeneralResult<CacheHandles> getCacheHandles(
        const CacheInfo& cacheInfo, const CacheToken& token,
        const std::pair<uint32_t, uint32_t>& numCacheFiles, bool createIfNotExist) {
    if (const auto* cacheHandles = std::get_if<CacheHandles>(&cacheInfo.variant)) {
        if (cacheHandles->modelCache.size() != numCacheFiles.first) {
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                   << "Expected " << numCacheFiles.first << " model cache handles, got "
                   << cacheHandles->modelCache.size();
        }
        if (cacheHandles->dataCache.size() != numCacheFiles.second) {
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                   << "Expected " << numCacheFiles.second << " data cache handles, got "
                   << cacheHandles->dataCache.size();
        }
        return *cacheHandles;
    }

    // The filename includes kByteSizeOfCacheToken * 2 characters for token,
    // and 1 character for model/data cache identifier.
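    // Each token byte is encoded as two characters in the range 'A'..'P' (low nibble first), which
    // keeps the name filesystem-safe; the trailing character is later set to '1' for the model
    // cache and '2' for the data cache.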
    std::string filename(kByteSizeOfCacheToken * 2 + 1, '0');
    for (uint32_t i = 0; i < kByteSizeOfCacheToken; i++) {
        filename[i * 2] = 'A' + (token[i] & 0x0F);
        filename[i * 2 + 1] = 'A' + (token[i] >> 4);
    }

    const auto& cacheDir = std::get<CacheDir>(cacheInfo.variant);
    CHECK(cacheDir.empty() || cacheDir.back() == '/');
    std::string cacheFileName = cacheDir + filename;
    const uint32_t cacheTypeIdentifierIndex = cacheDir.size() + kByteSizeOfCacheToken * 2;

    cacheFileName[cacheTypeIdentifierIndex] = '1';
    std::vector<SharedHandle> modelCache =
            NN_TRY(createCacheHandleVec(numCacheFiles.first, cacheFileName, createIfNotExist));

    cacheFileName[cacheTypeIdentifierIndex] = '2';
    std::vector<SharedHandle> dataCache =
            NN_TRY(createCacheHandleVec(numCacheFiles.second, cacheFileName, createIfNotExist));

    return CacheHandles{
            .modelCache = std::move(modelCache),
            .dataCache = std::move(dataCache),
    };
}

GeneralResult<SharedPreparedModel> DriverDevice::prepareModelFromCacheInternal(
        const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
        const CacheToken& token) const {
    // Get cache files if they exist, otherwise return from the function early.
    auto cache = NN_TRY(getCacheHandles(cacheInfo, token, kInterface->getNumberOfCacheFilesNeeded(),
                                        /*createIfNotExist=*/false));
    return kInterface->prepareModelFromCache(deadline, cache.modelCache, cache.dataCache, token);
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
        const std::optional<CacheToken>& maybeToken, const std::vector<TokenValuePair>& metaData,
        const std::vector<ExtensionNameAndPrefix>& extensionNameAndPrefix) const {
    // Attempt to compile from cache if token is present.
    if (maybeToken.has_value()) {
        auto result = prepareModelFromCacheInternal(deadline, cacheInfo, *maybeToken);
        if (result.has_value()) {
            LOG(INFO) << "prepareModelFromCache: successfully prepared model from cache";
            return {ANEURALNETWORKS_NO_ERROR,
                    std::make_shared<DriverPreparedModel>(this, std::move(result).value())};
        } else {
            LOG(ERROR) << "prepareModelFromCache failure (" << result.error().code
                       << "): " << result.error().message;
        }
    }

    // Get cache files if they exist, otherwise create them.
    CacheHandles cache;
    if (maybeToken.has_value()) {
        auto result =
                getCacheHandles(cacheInfo, *maybeToken, kInterface->getNumberOfCacheFilesNeeded(),
                                /*createIfNotExist=*/true);
        if (result.has_value()) {
            cache = std::move(result).value();
        } else {
            LOG(ERROR) << "getCacheHandles failure (" << result.error().code
                       << "): " << result.error().message;
        }
    }

    // Get the token if it exists, otherwise get a null token.
    static constexpr CacheToken kNullToken = {};
    const CacheToken token = maybeToken.value_or(kNullToken);

    // Fall back to full compilation (possibly with token) if
    // prepareModelFromCache could not be used or failed.
    const Model model = makeModel();
    auto result =
            kInterface->prepareModel(model, preference, priority, deadline, cache.modelCache,
                                     cache.dataCache, token, metaData, extensionNameAndPrefix);
    if (!result.ok()) {
        LOG(ERROR) << "IDevice::prepareModel() error: " << result.error().message;
        return {convertErrorStatusToResultCode(result.error().code), nullptr};
    }
    SharedPreparedModel preparedModel = std::move(result).value();
    CHECK(preparedModel != nullptr)
            << "IDevice::prepareModel() returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR,
            std::make_shared<DriverPreparedModel>(this, std::move(preparedModel))};
}

std::pair<int, std::unique_ptr<RuntimeMemory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                                      OperandType) const {
    const BufferDesc bufferDesc = {.dimensions = desc.dimensions};
    std::vector<SharedPreparedModel> preparedModels(desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto result =
            kInterface->allocate(bufferDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (!result.ok()) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(result.error().code), nullptr};
    }
    return MemoryFromDevice::create(std::move(result).value());
}

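// Converts the runtime's argument and memory bookkeeping into a canonical Request: each
// ModelArgumentInfo becomes a RequestArgument, and each RuntimeMemory contributes a memory pool.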
static Request createDriverRequest(const std::vector<ModelArgumentInfo>& inputs,
                                   const std::vector<ModelArgumentInfo>& outputs,
                                   const std::vector<const RuntimeMemory*>& memories) {
    Request request;
    request.inputs.reserve(inputs.size());
    std::transform(inputs.begin(), inputs.end(), std::back_inserter(request.inputs),
                   [](const auto& input) { return input.createRequestArgument(); });
    request.outputs.reserve(outputs.size());
    std::transform(outputs.begin(), outputs.end(), std::back_inserter(request.outputs),
                   [](const auto& output) { return output.createRequestArgument(); });
    request.pools.reserve(memories.size());
    std::transform(memories.begin(), memories.end(), std::back_inserter(request.pools),
                   [](const RuntimeMemory* memory) { return memory->getMemoryPool(); });
    return request;
}

// Perform computation on an actual device driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs and
// outputs specified by pointers. The input pointer data will be copied to the input pool prior to
// execution, and the output pointer data will be copied out from the output pool after the
// execution.
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration,
        const std::vector<TokenValuePair>& metaData) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    auto request = createDriverRequest(inputs, outputs, memories);

    NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::execute::execute");

    ExecutionResult<std::pair<std::vector<OutputShape>, Timing>> result;

    // compute using burst if present, otherwise compute from IPreparedModel
    const bool burstCompute = (burstController != nullptr);
    if (burstCompute) {
        for (const RuntimeMemory* memory : memories) {
            const auto pool = memory->getMemoryPool();
            if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
                auto cacheHold = burstController->cacheMemory(*maybeMemory);
                memory->hold(cacheHold);
            }
        }

        VLOG(EXECUTION) << "Before burstController->execute() " << SHOW_IF_DEBUG(request);
        result = burstController->execute(request, measure, deadline, loopTimeoutDuration, metaData,
                                          TypeManager::get()->getExtensionNameAndPrefix(metaData));
    } else {
        result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration, metaData,
                                         TypeManager::get()->getExtensionNameAndPrefix(metaData));
    }

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing;

    if (result.ok()) {
        n = ANEURALNETWORKS_NO_ERROR;
        std::tie(outputShapes, timing) = std::move(result).value();
    } else {
        auto [message, code, returnedOutputShapes] = std::move(result).error();
        n = convertErrorStatusToResultCode(code);
        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
        LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
                   << "::execute(...) error: " << message;
        if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
            outputShapes = std::move(returnedOutputShapes);
        }
        return {n, std::move(outputShapes), timing};
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration,
        const OptionalDuration& timeoutDurationAfterFence,
        const std::vector<TokenValuePair>& metaData) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));

    auto request = createDriverRequest(inputs, outputs, memories);

    NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::executeFenced");

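    // Duplicate the incoming sync fence FDs so this execution owns its copies and the caller's
    // descriptors remain untouched.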
    std::vector<SyncFence> waitForHandles;
    waitForHandles.reserve(waitFor.size());
    for (int fd : waitFor) {
        int dupFd = dup(fd);
        if (dupFd < 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
        }
        waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
    }

    SyncFence syncFence = SyncFence::createAsSignaled();
    ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
    Timing timing = {};
    if (isCompliantVersion(kHalVersionV1_3ToApi.canonical, mDevice->getFeatureLevel())) {
        auto result = mPreparedModel->executeFenced(
                request, waitForHandles, measure, deadline, loopTimeoutDuration,
                timeoutDurationAfterFence, metaData,
                TypeManager::get()->getExtensionNameAndPrefix(metaData));
        if (!result.ok()) {
            LOG(ERROR) << "IPreparedModel::executeFenced() error: " << result.error().message;
            VLOG(EXECUTION) << "**executeFenced failed**";
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
    } else {
        // Fall back to synchronous execution if executeFenced is not supported.
        // First wait for all sync fences to be ready.
        LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
        for (const auto& fence : waitForHandles) {
            if (!fence.hasFd() || fence.getFd() < 0) {
                return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
            }
            auto r = fence.syncWait({/* no timeout */});
            if (r != SyncFence::FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
        auto result =
                mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration, metaData,
                                        TypeManager::get()->getExtensionNameAndPrefix(metaData));
        if (!result.ok()) {
            LOG(ERROR) << "IPreparedModel::execute() error: " << result.error().message;
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(std::ignore, timing) = result.value();
    }

    int syncFenceFd = -1;
    if (syncFence.hasFd()) {
        syncFenceFd = dup(syncFence.getFd());
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}

std::pair<int, std::shared_ptr<RuntimeExecution>> DriverPreparedModel::createReusableExecution(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
        const OptionalDuration& loopTimeoutDuration,
        const std::vector<TokenValuePair>& metaData) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::createReusableExecution");

    auto request = createDriverRequest(inputs, outputs, memories);
    auto result = mPreparedModel->createReusableExecution(
            request, measure, loopTimeoutDuration, metaData,
            TypeManager::get()->getExtensionNameAndPrefix(metaData));
    if (!result.ok()) {
        LOG(ERROR) << "IPreparedModel::createReusableExecution() error: " << result.error().message;
        const int n = convertErrorStatusToResultCode(result.error().code);
        return {n, nullptr};
    }
    auto execution = std::make_shared<DriverExecution>(
            std::move(result).value(), std::move(request), memories, measure, loopTimeoutDuration,
            mDevice->getFeatureLevel(), metaData);
    return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}

std::tuple<int, std::vector<OutputShape>, Timing> DriverExecution::compute(
        const SharedBurst& burstController, const OptionalTimePoint& deadline) const {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::compute");

    // compute using burst if present, otherwise compute from IPreparedModel
    SharedExecution execution;
    const bool burstCompute = (burstController != nullptr);
    if (burstCompute) {
        // create a reusable burst execution if the controller has not been seen before
        auto burstExecution = mCachedBurstExecutions.find(burstController.get());
        if (burstExecution == mCachedBurstExecutions.end()) {
            for (const RuntimeMemory* memory : kMemories) {
                const auto pool = memory->getMemoryPool();
                if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
                    auto cacheHold = burstController->cacheMemory(*maybeMemory);
                    memory->hold(cacheHold);
                }
            }
            auto createResult = burstController->createReusableExecution(
                    kRequest, kMeasure, kLoopTimeoutDuration, kMetaData,
                    TypeManager::get()->getExtensionNameAndPrefix(kMetaData));
            if (!createResult.ok()) {
                LOG(ERROR) << "IBurst::createReusableExecution() error: "
                           << createResult.error().message;
                const int n = convertErrorStatusToResultCode(createResult.error().code);
                return {n, {}, {}};
            }
            execution = std::move(createResult).value();
            mCachedBurstExecutions.emplace(burstController.get(), execution);
        } else {
            execution = burstExecution->second;
        }
        VLOG(EXECUTION) << "Before mBurstExecution->compute() " << SHOW_IF_DEBUG(kRequest);
    } else {
        execution = kExecution;
    }

    CHECK(execution != nullptr);
    auto result = execution->compute(deadline);
    if (!result.ok()) {
        auto [message, code, returnedOutputShapes] = std::move(result).error();
        int n = convertErrorStatusToResultCode(code);
        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
        LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
                   << "::execute(...) error: " << message;
        if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
            return {n, std::move(returnedOutputShapes), {}};
        }
        return {n, {}, {}};
    }

    VLOG(EXECUTION) << "DriverExecution::compute completed";
    auto [outputShapes, timing] = std::move(result).value();
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverExecution::computeFenced(
        const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
        const OptionalDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::computeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));

    std::vector<SyncFence> waitForHandles;
    waitForHandles.reserve(waitFor.size());
    for (int fd : waitFor) {
        int dupFd = dup(fd);
        if (dupFd < 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
        }
        waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
    }

    SyncFence syncFence = SyncFence::createAsSignaled();
    ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
    Timing timing = {};
    if (isCompliantVersion(kHalVersionV1_3ToApi.canonical, kDeviceFeatureLevel)) {
        auto result =
                kExecution->computeFenced(waitForHandles, deadline, timeoutDurationAfterFence);
        if (!result.ok()) {
            LOG(ERROR) << "IExecution::computeFenced() error: " << result.error().message;
            VLOG(EXECUTION) << "**computeFenced failed**";
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
    } else {
        // Fall back to synchronous execution if computeFenced is not supported.
        // First wait for all sync fences to be ready.
        LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
        for (const auto& fence : waitForHandles) {
            if (!fence.hasFd() || fence.getFd() < 0) {
                return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
            }
            auto r = fence.syncWait({/* no timeout */});
            if (r != SyncFence::FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
        auto result = kExecution->compute(deadline);
        if (!result.ok()) {
            LOG(ERROR) << "IExecution::compute() error: " << result.error().message;
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(std::ignore, timing) = result.value();
    }

    int syncFenceFd = -1;
    if (syncFence.hasFd()) {
        syncFenceFd = dup(syncFence.getFd());
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }

    VLOG(EXECUTION) << "DriverExecution::computeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}

static Capabilities createCpuCapabilities() {
    constexpr Capabilities::PerformanceInfo kPerf = {.execTime = 1.0f, .powerUsage = 1.0f};
    return makeCapabilities(kPerf, kPerf, kPerf);
}

// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
   public:
    // Returns the singleton CPU fallback device.
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    Version getFeatureLevel() const override { return kVersion; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    const Capabilities& getCapabilities() const override { return kCapabilities; }
    Capabilities::PerformanceInfo getPerformance(OperandType) const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getIfPerformance() const override { return kPerformance; }
    Capabilities::PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
        return {/*numModelCache=*/0, /*numDataCache=*/0};
    }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const std::optional<CacheToken>& maybeToken,
            const std::vector<TokenValuePair>& metaData,
            const std::vector<ExtensionNameAndPrefix>& extensionNameAndPrefix) const override;

    std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const override;

   private:
    CpuDevice() = default;
    const Version kVersion = getRuntimeFeatureLevelVersion();
    const std::string kName = "nnapi-reference";
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
    const std::string kVersionString = build::GetBuildNumber();
#else
    const std::string kVersionString = "UNKNOWN";
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD
    // Since the performance is a ratio compared to the CPU performance,
    // by definition the performance of the CPU is 1.0.
    const Capabilities::PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const Capabilities kCapabilities = createCpuCapabilities();
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted RuntimePreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public RuntimePreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
    static std::pair<int, std::shared_ptr<RuntimePreparedModel>> create(Model model);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    SharedPreparedModel getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const std::vector<TokenValuePair>& metaData) const override;

    GeneralResult<SharedBurst> configureExecutionBurst() const override { return nullptr; }

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const OptionalDuration& timeoutDurationAfterFence,
            const std::vector<TokenValuePair>& metaData) const override;

    std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
            const OptionalDuration& loopTimeoutDuration,
            const std::vector<TokenValuePair>& metaData) const override;

    MemoryPreference getMemoryPreference() const override {
        return {kPreferredAlignment, kPreferredPadding};
    }

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

    const Model& getModel() const { return mModel; }
    const std::vector<RunTimePoolInfo>& getModelPoolInfos() const { return mModelPoolInfos; }

   private:
    // TFLite kernels prefer 64 bytes for padding and alignment.
    static constexpr uint32_t kPreferredAlignment = 64;
    static constexpr uint32_t kPreferredPadding = 64;

    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

class CpuExecution : public RuntimeExecution {
   public:
    CpuExecution(const CpuPreparedModel& preparedModel, Request request,
                 std::vector<RunTimePoolInfo> requestPoolInfos,
                 OptionalDuration loopTimeoutDuration)
        : kPreparedModel(preparedModel),
          kRequest(std::move(request)),
          kRequestPoolInfos(std::move(requestPoolInfos)),
          kLoopTimeoutDuration(std::move(loopTimeoutDuration)) {}

    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
            const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
            const OptionalDuration& timeoutDurationAfterFence) const override;

   private:
    const CpuPreparedModel& kPreparedModel;
    Request kRequest;
    std::vector<RunTimePoolInfo> kRequestPoolInfos;
    const OptionalDuration kLoopTimeoutDuration;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& model = metaModel.getModel();
    const size_t count = model.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        //                    We may want to use the slicer for CpuDevice just as we do for
        //                    DriverDevice.
        OperationType operationType = model.main.operations[i].type;
        result[i] = !isExtension(operationType) && operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

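// Validates the given canonical object and checks that the feature level it requires is not newer
// than what this runtime supports.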
| template <typename Type>
 | |
| static Result<void> validateAndCheckCompliance(const Type& object) {
 | |
|     const auto version = NN_TRY(validate(object));
 | |
|     if (!isCompliantVersion(version, DeviceManager::get()->getRuntimeVersion())) {
 | |
|         return NN_ERROR() << "Object than is newer what is allowed. Version needed: " << version
 | |
|                           << ", current runtime version supported: "
 | |
|                           << DeviceManager::get()->getRuntimeVersion();
 | |
|     }
 | |
|     return {};
 | |
| }
 | |
| 
 | |
| std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuDevice::prepareModel(
 | |
|         const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
 | |
|         const OptionalTimePoint& deadline, const CacheInfo& /*cacheInfo*/,
 | |
|         const std::optional<CacheToken>& maybeToken,
 | |
|         const std::vector<TokenValuePair>& /*metaData*/,
 | |
|         const std::vector<ExtensionNameAndPrefix>& /*extensionNameAndPrefix*/) const {
 | |
|     CHECK(!maybeToken.has_value())
 | |
|             << "Should never call prepareModel with cache information on CpuDevice";
 | |
| 
 | |
|     const Model model = makeModel();
 | |
|     if (auto result = validateAndCheckCompliance(model); !result.ok()) {
 | |
|         LOG(ERROR) << "Invalid Model: " << result.error();
 | |
|         return {ANEURALNETWORKS_OP_FAILED, nullptr};
 | |
|     }
 | |
|     if (auto result = validateAndCheckCompliance(preference); !result.ok()) {
 | |
|         LOG(ERROR) << "Invalid ExecutionPreference: " << result.error();
 | |
|         return {ANEURALNETWORKS_OP_FAILED, nullptr};
 | |
|     }
 | |
|     if (auto result = validateAndCheckCompliance(priority); !result.ok()) {
 | |
|         LOG(ERROR) << "Invalid Priority: " << result.error();
 | |
|         return {ANEURALNETWORKS_OP_FAILED, nullptr};
 | |
|     }
 | |
|     if (hasDeadlinePassed(deadline)) {
 | |
|         return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
 | |
|     }
 | |
| 
 | |
|     return CpuPreparedModel::create(model);
 | |
| }
 | |
| 
 | |
std::pair<int, std::unique_ptr<RuntimeMemory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                                   OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuPreparedModel::create(Model model) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromCanonicalMemories(&poolInfos, model.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<RuntimePreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(model), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

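// Runs `model` against `request` on a CpuExecutor, honoring the optional loop timeout and
// deadline, and returns the resulting error code, the output shapes, and (empty) timing
// information.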
static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.has_value()) {
        executor.setLoopTimeout(loopTimeoutDuration->count());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, {}};
}

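// Fenced execution on the CPU fallback: waits for all provided sync fences to signal, tightens
// the deadline with `duration` if that is sooner, then performs a regular synchronous execute().
// Because the computation completes before returning, no sync fence (-1) and no callback are
// handed back to the caller.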
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration, const OptionalDuration& duration,
        const std::vector<TokenValuePair>& /*metaData*/) const {
    VLOG(EXECUTION)
            << "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
    }

    // Update deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.has_value()) {
        const auto timeoutDurationDeadline = makeDeadline(*duration);
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
                                                        closestDeadline, loopTimeoutDuration, {});
    return {result, -1, nullptr, timing};
}

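// Builds a canonical Request plus the RunTimePoolInfos the CPU executor needs: each RuntimeMemory
// becomes one pool, and every input/output passed by pointer gets an additional pool that wraps
// the caller's buffer directly, so no data has to be copied.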
static std::tuple<int, Request, std::vector<RunTimePoolInfo>> createCpuRequest(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories) {
    std::vector<RunTimePoolInfo> requestPoolInfos;
    requestPoolInfos.reserve(memories.size());
    for (const RuntimeMemory* mem : memories) {
        if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
            requestPoolInfos.emplace_back(*poolInfo);
        } else {
            return {ANEURALNETWORKS_UNMAPPABLE, {}, {}};
        }
    }
    // Create one pool for each input/output passed by pointer, so the executor can read from and
    // write to the caller's buffers directly.
    auto fixPointerArguments =
            [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
                std::vector<DataLocation> ptrArgsLocations;
                for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
                    if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
                        ptrArgsLocations.push_back(
                                {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
                                 .offset = 0,
                                 .length = argumentInfo.length(),
                                 .padding = argumentInfo.padding()});
                        requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
                                static_cast<uint8_t*>(argumentInfo.buffer())));
                    }
                }
                return ptrArgsLocations;
            };
    const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
    const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    return {ANEURALNETWORKS_NO_ERROR, std::move(request), std::move(requestPoolInfos)};
}

// Perform computation on the NNAPI CPU reference implementation.
//
// Unlike DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the
// same process as the NNAPI runtime and can take raw pointers. Each pointer-based input/output
// gets its own pool backed by the caller's buffer, so no data has to be copied.
//
// Chooses between sync/async execution according to DeviceManager::mSyncExecCpu.
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const SharedBurst& /*burstController*/,
        MeasureTiming /*measure*/, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration,
        const std::vector<TokenValuePair>& /*metaData*/) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
    }

    int nCreateRequest;
    Request request;
    std::vector<RunTimePoolInfo> requestPoolInfos;
    std::tie(nCreateRequest, request, requestPoolInfos) =
            createCpuRequest(inputs, outputs, memories);
    if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
        return {nCreateRequest, {}, {}};
    }

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
            result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                                  loopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                        loopTimeoutDuration);
}

std::pair<int, std::shared_ptr<RuntimeExecution>> CpuPreparedModel::createReusableExecution(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, MeasureTiming /*measure*/,
        const OptionalDuration& loopTimeoutDuration,
        const std::vector<TokenValuePair>& /*metaData*/) const {
    auto [nCreateRequest, request, requestPoolInfos] = createCpuRequest(inputs, outputs, memories);
    if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
        return {nCreateRequest, nullptr};
    }
    auto execution = std::make_shared<CpuExecution>(
            *this, std::move(request), std::move(requestPoolInfos), loopTimeoutDuration);
    return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}

std::tuple<int, std::vector<OutputShape>, Timing> CpuExecution::compute(
        const SharedBurst& /*burstController*/, const OptionalTimePoint& deadline) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
    }

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &deadline, &result] {
            result = computeOnCpu(kPreparedModel.getModel(), kRequest,
                                  kPreparedModel.getModelPoolInfos(), kRequestPoolInfos, deadline,
                                  kLoopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(kPreparedModel.getModel(), kRequest, kPreparedModel.getModelPoolInfos(),
                        kRequestPoolInfos, deadline, kLoopTimeoutDuration);
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuExecution::computeFenced(
        const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
        const OptionalDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuExecution::computeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
    }

    // Update deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.has_value()) {
        const auto timeoutDurationDeadline = makeDeadline(*duration);
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = compute(nullptr, closestDeadline);
    return {result, -1, nullptr, timing};
}

int64_t DeviceManager::getRuntimeFeatureLevel() const {
    return versionToFeatureLevel(mRuntimeVersion.level);
}

DeviceManager* DeviceManager::get() {
    static DeviceManager manager;
    return &manager;
}

std::shared_ptr<Device> DeviceManager::getCpuDevice() {
    return CpuDevice::get();
}

std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const SharedDevice& device) {
    VLOG(MANAGER) << "forTest_makeDriverDevice(" << device->getName() << ")";
    const auto driverDevice = DriverDevice::create(device);
    CHECK(driverDevice != nullptr);
    return driverDevice;
}

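// Enumerates the available NNAPI driver devices and wraps each one in a DriverDevice. The regular
// Android build queries the HAL service layer with maxFeatureLevelAllowed; the
// NN_COMPATIBILITY_LIBRARY_BUILD variant uses its own getDevices() instead, and non-Android
// builds return no drivers.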
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
std::vector<std::shared_ptr<DriverDevice>> getDriverDevices(
        [[maybe_unused]] Version::Level maxFeatureLevelAllowed) {
#ifdef __ANDROID__
    auto devices = hardware::neuralnetworks::service::getDevices(maxFeatureLevelAllowed);

    std::vector<std::shared_ptr<DriverDevice>> driverDevices;
    driverDevices.reserve(devices.size());
    for (auto& device : devices) {
        driverDevices.push_back(DriverDevice::create(std::move(device)));
    }
    return driverDevices;
#else   // __ANDROID__
    return {};
#endif  // __ANDROID__
}
#else
std::vector<std::shared_ptr<DriverDevice>> getDriverDevices(
        Version::Level /*maxFeatureLevelAllowed*/) {
    auto devices = getDevices();
    std::vector<std::shared_ptr<DriverDevice>> driverDevices;
    driverDevices.reserve(devices.size());
    for (auto& device : devices) {
        driverDevices.push_back(DriverDevice::create(std::move(device)));
    }
    return driverDevices;
}
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD

void DeviceManager::findAvailableDevices() {
    VLOG(MANAGER) << "findAvailableDevices";

#ifdef NN_DEBUGGABLE
    // debug.nn.enabled-devices defines a regex pattern. Of all available driver devices, only the
    // ones whose names match the pattern are enabled; driver devices with non-matching names are
    // ignored. If this property is not set, all available driver devices are enabled by default.
    // This filter only applies to driver devices; nnapi-reference is always enabled.
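    // For example (the device names here are hypothetical), on a debuggable build
    //   adb shell setprop debug.nn.enabled-devices "example-npu|example-dsp"
    // would register only the drivers named "example-npu" and "example-dsp"; the CPU fallback
    // registered below is unaffected by this filter.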
    std::string patternStr = base::GetProperty("debug.nn.enabled-devices", ".*");
    LOG(INFO) << "Enabled devices: " << patternStr;
    const std::regex pattern(patternStr);
#endif  // NN_DEBUGGABLE

    // register driver devices
    auto driverDevices = getDriverDevices(mRuntimeVersion.level);
    for (auto& driverDevice : driverDevices) {
#ifdef NN_DEBUGGABLE
        if (!std::regex_match(driverDevice->getName(), pattern)) {
            LOG(INFO) << "Ignored interface " << driverDevice->getName()
                      << " (version = " << driverDevice->getVersionString() << ")";
            continue;
        }
#endif  // NN_DEBUGGABLE
        LOG(INFO) << "Found interface " << driverDevice->getName()
                  << " (version = " << driverDevice->getVersionString() << ")";
        mDevices.push_back(std::move(driverDevice));
    }

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
    // register CPU fallback device
    mDevices.push_back(CpuDevice::get());
    mDevicesCpuOnly.push_back(CpuDevice::get());
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD
}

void DeviceManager::registerDevice(const SharedDevice& device) {
    if (auto driverDevice = DriverDevice::create(device)) {
        mDevices.push_back(std::move(driverDevice));
    }
}

DeviceManager::DeviceManager() {
    VLOG(MANAGER) << "DeviceManager::DeviceManager";
    mRuntimeVersion = getRuntimeFeatureLevelVersion();
    mIsPlatformTelemetryEnabled = getWhetherPlatformTelemetryIsEnabled();
    findAvailableDevices();
#ifdef NN_DEBUGGABLE
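    // Debug-only overrides, read from system properties on debuggable builds. For example,
    //   adb shell setprop debug.nn.syncexec-cpu 0
    // clears mSyncExecCpu, which makes CpuPreparedModel::execute and CpuExecution::compute take
    // the threaded path above instead of running synchronously on the caller's thread.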
    mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
    mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
    mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
    mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
    mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif  // NN_DEBUGGABLE
}

}  // namespace nn
}  // namespace android