android13/packages/modules/NeuralNetworks/common/ExecutionBurstServer.cpp

/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBurstServer"

#include "ExecutionBurstServer.h"

#include <android-base/logging.h>

#include <algorithm>
#include <cstring>
#include <limits>
#include <map>
#include <memory>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "HalInterfaces.h"
#include "Tracing.h"
#include "Utils.h"

namespace android::nn {
namespace {

using hardware::MQDescriptorSync;
using V1_2::FmqRequestDatum;
using V1_2::FmqResultDatum;
using V1_2::IBurstCallback;
using V1_2::IBurstContext;

constexpr V1_2::Timing kNoTiming = {std::numeric_limits<uint64_t>::max(),
                                    std::numeric_limits<uint64_t>::max()};

// DefaultBurstExecutorWithCache adapts an IPreparedModel so that it can be
// used as an IBurstExecutorWithCache. Specifically, the cache simply stores the
// hidl_memory object, and the execution forwards calls to the provided
// IPreparedModel's "executeSynchronously" method. With this class, hidl_memory
// must be mapped and unmapped for each execution.
class DefaultBurstExecutorWithCache : public ExecutionBurstServer::IBurstExecutorWithCache {
   public:
    DefaultBurstExecutorWithCache(V1_2::IPreparedModel* preparedModel)
        : mpPreparedModel(preparedModel) {}

    bool isCacheEntryPresent(int32_t slot) const override {
        const auto it = mMemoryCache.find(slot);
        return (it != mMemoryCache.end()) && it->second.valid();
    }

    void addCacheEntry(const hardware::hidl_memory& memory, int32_t slot) override {
        mMemoryCache[slot] = memory;
    }

    void removeCacheEntry(int32_t slot) override { mMemoryCache.erase(slot); }

    std::tuple<V1_0::ErrorStatus, hardware::hidl_vec<V1_2::OutputShape>, V1_2::Timing> execute(
            const V1_0::Request& request, const std::vector<int32_t>& slots,
            V1_2::MeasureTiming measure) override {
        // convert slots to pools
        hardware::hidl_vec<hardware::hidl_memory> pools(slots.size());
        std::transform(slots.begin(), slots.end(), pools.begin(),
                       [this](int32_t slot) { return mMemoryCache[slot]; });

        // create full request
        V1_0::Request fullRequest = request;
        fullRequest.pools = std::move(pools);

        // setup execution
        V1_0::ErrorStatus returnedStatus = V1_0::ErrorStatus::GENERAL_FAILURE;
        hardware::hidl_vec<V1_2::OutputShape> returnedOutputShapes;
        V1_2::Timing returnedTiming;
        auto cb = [&returnedStatus, &returnedOutputShapes, &returnedTiming](
                          V1_0::ErrorStatus status,
                          const hardware::hidl_vec<V1_2::OutputShape>& outputShapes,
                          const V1_2::Timing& timing) {
            returnedStatus = status;
            returnedOutputShapes = outputShapes;
            returnedTiming = timing;
        };

        // execute
        const hardware::Return<void> ret =
                mpPreparedModel->executeSynchronously(fullRequest, measure, cb);
        if (!ret.isOk() || returnedStatus != V1_0::ErrorStatus::NONE) {
            LOG(ERROR) << "IPreparedModelAdapter::execute -- Error executing";
            return {returnedStatus, std::move(returnedOutputShapes), kNoTiming};
        }

        return std::make_tuple(returnedStatus, std::move(returnedOutputShapes), returnedTiming);
    }

   private:
    V1_2::IPreparedModel* const mpPreparedModel;
    std::map<int32_t, hardware::hidl_memory> mMemoryCache;
};

}  // anonymous namespace

// serialize result
std::vector<FmqResultDatum> serialize(V1_0::ErrorStatus errorStatus,
                                      const std::vector<V1_2::OutputShape>& outputShapes,
                                      V1_2::Timing timing) {
    // count how many elements need to be sent for a request
    size_t count = 2 + outputShapes.size();
    for (const auto& outputShape : outputShapes) {
        count += outputShape.dimensions.size();
    }

    // create buffer to temporarily store elements
    std::vector<FmqResultDatum> data;
    data.reserve(count);

    // package packetInfo
    {
        FmqResultDatum datum;
        datum.packetInformation({/*.packetSize=*/static_cast<uint32_t>(count),
                                 /*.errorStatus=*/errorStatus,
                                 /*.numberOfOperands=*/static_cast<uint32_t>(outputShapes.size())});
        data.push_back(datum);
    }

    // package output shape data
    for (const auto& operand : outputShapes) {
        // package operand information
        FmqResultDatum::OperandInformation info{};
        info.isSufficient = operand.isSufficient;
        info.numberOfDimensions = static_cast<uint32_t>(operand.dimensions.size());

        FmqResultDatum datum;
        datum.operandInformation(info);
        data.push_back(datum);

        // package operand dimensions
        for (uint32_t dimension : operand.dimensions) {
            FmqResultDatum datum;
            datum.operandDimensionValue(dimension);
            data.push_back(datum);
        }
    }

    // package executionTiming
    {
        FmqResultDatum datum;
        datum.executionTiming(timing);
        data.push_back(datum);
    }

    // return result
    return data;
}

// deserialize request
std::optional<std::tuple<V1_0::Request, std::vector<int32_t>, V1_2::MeasureTiming>> deserialize(
        const std::vector<FmqRequestDatum>& data) {
    using discriminator = FmqRequestDatum::hidl_discriminator;

    size_t index = 0;

    // validate packet information
    if (index >= data.size() ||
        data.at(index).getDiscriminator() != discriminator::packetInformation) {
        LOG(ERROR) << "FMQ Request packet ill-formed";
        return std::nullopt;
    }

    // unpackage packet information
    const FmqRequestDatum::PacketInformation& packetInfo = data.at(index).packetInformation();
    index++;
    const uint32_t packetSize = packetInfo.packetSize;
    const uint32_t numberOfInputOperands = packetInfo.numberOfInputOperands;
    const uint32_t numberOfOutputOperands = packetInfo.numberOfOutputOperands;
    const uint32_t numberOfPools = packetInfo.numberOfPools;

    // verify packet size
    if (data.size() != packetSize) {
        LOG(ERROR) << "FMQ Request packet ill-formed";
        return std::nullopt;
    }

    // unpackage input operands
    std::vector<V1_0::RequestArgument> inputs;
    inputs.reserve(numberOfInputOperands);
    for (size_t operand = 0; operand < numberOfInputOperands; ++operand) {
        // validate input operand information
        if (index >= data.size() ||
            data.at(index).getDiscriminator() != discriminator::inputOperandInformation) {
            LOG(ERROR) << "FMQ Request packet ill-formed";
            return std::nullopt;
        }

        // unpackage operand information
        const FmqRequestDatum::OperandInformation& operandInfo =
                data.at(index).inputOperandInformation();
        index++;
        const bool hasNoValue = operandInfo.hasNoValue;
        const V1_0::DataLocation location = operandInfo.location;
        const uint32_t numberOfDimensions = operandInfo.numberOfDimensions;

        // unpackage operand dimensions
        std::vector<uint32_t> dimensions;
        dimensions.reserve(numberOfDimensions);
        for (size_t i = 0; i < numberOfDimensions; ++i) {
            // validate dimension
            if (index >= data.size() ||
                data.at(index).getDiscriminator() != discriminator::inputOperandDimensionValue) {
                LOG(ERROR) << "FMQ Request packet ill-formed";
                return std::nullopt;
            }

            // unpackage dimension
            const uint32_t dimension = data.at(index).inputOperandDimensionValue();
            index++;

            // store result
            dimensions.push_back(dimension);
        }

        // store result
        inputs.push_back(
                {/*.hasNoValue=*/hasNoValue, /*.location=*/location, /*.dimensions=*/dimensions});
    }

    // unpackage output operands
    std::vector<V1_0::RequestArgument> outputs;
    outputs.reserve(numberOfOutputOperands);
    for (size_t operand = 0; operand < numberOfOutputOperands; ++operand) {
        // validate output operand information
        if (index >= data.size() ||
            data.at(index).getDiscriminator() != discriminator::outputOperandInformation) {
            LOG(ERROR) << "FMQ Request packet ill-formed";
            return std::nullopt;
        }

        // unpackage operand information
        const FmqRequestDatum::OperandInformation& operandInfo =
                data.at(index).outputOperandInformation();
        index++;
        const bool hasNoValue = operandInfo.hasNoValue;
        const V1_0::DataLocation location = operandInfo.location;
        const uint32_t numberOfDimensions = operandInfo.numberOfDimensions;

        // unpackage operand dimensions
        std::vector<uint32_t> dimensions;
        dimensions.reserve(numberOfDimensions);
        for (size_t i = 0; i < numberOfDimensions; ++i) {
            // validate dimension
            if (index >= data.size() ||
                data.at(index).getDiscriminator() != discriminator::outputOperandDimensionValue) {
                LOG(ERROR) << "FMQ Request packet ill-formed";
                return std::nullopt;
            }

            // unpackage dimension
            const uint32_t dimension = data.at(index).outputOperandDimensionValue();
            index++;

            // store result
            dimensions.push_back(dimension);
        }

        // store result
        outputs.push_back(
                {/*.hasNoValue=*/hasNoValue, /*.location=*/location, /*.dimensions=*/dimensions});
    }

    // unpackage pools
    std::vector<int32_t> slots;
    slots.reserve(numberOfPools);
    for (size_t pool = 0; pool < numberOfPools; ++pool) {
        // validate input operand information
        if (index >= data.size() ||
            data.at(index).getDiscriminator() != discriminator::poolIdentifier) {
            LOG(ERROR) << "FMQ Request packet ill-formed";
            return std::nullopt;
        }

        // unpackage operand information
        const int32_t poolId = data.at(index).poolIdentifier();
        index++;

        // store result
        slots.push_back(poolId);
    }

    // validate measureTiming
    if (index >= data.size() || data.at(index).getDiscriminator() != discriminator::measureTiming) {
        LOG(ERROR) << "FMQ Request packet ill-formed";
        return std::nullopt;
    }

    // unpackage measureTiming
    const V1_2::MeasureTiming measure = data.at(index).measureTiming();
    index++;

    // validate packet information
    if (index != packetSize) {
        LOG(ERROR) << "FMQ Request packet ill-formed";
        return std::nullopt;
    }

    // return request
    V1_0::Request request = {/*.inputs=*/inputs, /*.outputs=*/outputs, /*.pools=*/{}};
    return std::make_tuple(std::move(request), std::move(slots), measure);
}

// RequestChannelReceiver methods

std::unique_ptr<RequestChannelReceiver> RequestChannelReceiver::create(
        const FmqRequestDescriptor& requestChannel, std::chrono::microseconds pollingTimeWindow) {
    std::unique_ptr<FmqRequestChannel> fmqRequestChannel =
            std::make_unique<FmqRequestChannel>(requestChannel);

    if (!fmqRequestChannel->isValid()) {
        LOG(ERROR) << "Unable to create RequestChannelReceiver";
        return nullptr;
    }
    if (fmqRequestChannel->getEventFlagWord() == nullptr) {
        LOG(ERROR)
                << "RequestChannelReceiver::create was passed an MQDescriptor without an EventFlag";
        return nullptr;
    }

    return std::make_unique<RequestChannelReceiver>(std::move(fmqRequestChannel),
                                                    pollingTimeWindow);
}

RequestChannelReceiver::RequestChannelReceiver(std::unique_ptr<FmqRequestChannel> fmqRequestChannel,
                                               std::chrono::microseconds pollingTimeWindow)
    : mFmqRequestChannel(std::move(fmqRequestChannel)), kPollingTimeWindow(pollingTimeWindow) {}

std::optional<std::tuple<V1_0::Request, std::vector<int32_t>, V1_2::MeasureTiming>>
RequestChannelReceiver::getBlocking() {
    const auto packet = getPacketBlocking();
    if (!packet) {
        return std::nullopt;
    }

    return deserialize(*packet);
}

void RequestChannelReceiver::invalidate() {
    mTeardown = true;

    // force unblock
    // ExecutionBurstServer is by default waiting on a request packet. If the
    // client process destroys its burst object, the server may still be waiting
    // on the futex. This force unblock wakes up any thread waiting on the
    // futex.
    // TODO: look for a different/better way to signal/notify the futex to wake
    // up any thread waiting on it
    FmqRequestDatum datum;
    datum.packetInformation({/*.packetSize=*/0, /*.numberOfInputOperands=*/0,
                             /*.numberOfOutputOperands=*/0, /*.numberOfPools=*/0});
    mFmqRequestChannel->writeBlocking(&datum, 1);
}

std::optional<std::vector<FmqRequestDatum>> RequestChannelReceiver::getPacketBlocking() {

    if (mTeardown) {
        return std::nullopt;
    }

    // First spend time polling if results are available in FMQ instead of
    // waiting on the futex. Polling is more responsive (yielding lower
    // latencies), but can take up more power, so only poll for a limited period
    // of time.

    auto& getCurrentTime = std::chrono::high_resolution_clock::now;
    const auto timeToStopPolling = getCurrentTime() + kPollingTimeWindow;

    while (getCurrentTime() < timeToStopPolling) {
        // if class is being torn down, immediately return
        if (mTeardown.load(std::memory_order_relaxed)) {
            return std::nullopt;
        }

        // Check if data is available. If it is, immediately retrieve it and
        // return.
        const size_t available = mFmqRequestChannel->availableToRead();
        if (available > 0) {
            // This is the first point when we know an execution is occurring,
            // so begin to collect systraces. Note that a similar systrace does
            // not exist at the corresponding point in
            // ResultChannelReceiver::getPacketBlocking because the execution is
            // already in flight.
            NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                         "ExecutionBurstServer getting packet");
            std::vector<FmqRequestDatum> packet(available);
            const bool success = mFmqRequestChannel->read(packet.data(), available);
            if (!success) {
                LOG(ERROR) << "Error receiving packet";
                return std::nullopt;
            }
            return std::make_optional(std::move(packet));
        }

        std::this_thread::yield();
    }

    // If we get to this point, we either stopped polling because it was taking
    // too long or polling was not allowed. Instead, perform a blocking call
    // which uses a futex to save power.

    // wait for request packet and read first element of request packet
    FmqRequestDatum datum;
    bool success = mFmqRequestChannel->readBlocking(&datum, 1);

    // This is the first point when we know an execution is occurring, so begin
    // to collect systraces. Note that a similar systrace does not exist at the
    // corresponding point in ResultChannelReceiver::getPacketBlocking because
    // the execution is already in flight.
    NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION, "ExecutionBurstServer getting packet");

    // retrieve remaining elements
    // NOTE: all of the data is already available at this point, so there's no
    // need to do a blocking wait to wait for more data. This is known because
    // in FMQ, all writes are published (made available) atomically. Currently,
    // the producer always publishes the entire packet in one function call, so
    // if the first element of the packet is available, the remaining elements
    // are also available.
    const size_t count = mFmqRequestChannel->availableToRead();
    std::vector<FmqRequestDatum> packet(count + 1);
    std::memcpy(&packet.front(), &datum, sizeof(datum));
    success &= mFmqRequestChannel->read(packet.data() + 1, count);

    // terminate loop
    if (mTeardown) {
        return std::nullopt;
    }

    // ensure packet was successfully received
    if (!success) {
        LOG(ERROR) << "Error receiving packet";
        return std::nullopt;
    }

    return std::make_optional(std::move(packet));
}

// ResultChannelSender methods

std::unique_ptr<ResultChannelSender> ResultChannelSender::create(
        const FmqResultDescriptor& resultChannel) {
    std::unique_ptr<FmqResultChannel> fmqResultChannel =
            std::make_unique<FmqResultChannel>(resultChannel);

    if (!fmqResultChannel->isValid()) {
        LOG(ERROR) << "Unable to create RequestChannelSender";
        return nullptr;
    }
    if (fmqResultChannel->getEventFlagWord() == nullptr) {
        LOG(ERROR) << "ResultChannelSender::create was passed an MQDescriptor without an EventFlag";
        return nullptr;
    }

    return std::make_unique<ResultChannelSender>(std::move(fmqResultChannel));
}

ResultChannelSender::ResultChannelSender(std::unique_ptr<FmqResultChannel> fmqResultChannel)
    : mFmqResultChannel(std::move(fmqResultChannel)) {}

bool ResultChannelSender::send(V1_0::ErrorStatus errorStatus,
                               const std::vector<V1_2::OutputShape>& outputShapes,
                               V1_2::Timing timing) {
    const std::vector<FmqResultDatum> serialized = serialize(errorStatus, outputShapes, timing);
    return sendPacket(serialized);
}

bool ResultChannelSender::sendPacket(const std::vector<FmqResultDatum>& packet) {
    if (packet.size() > mFmqResultChannel->availableToWrite()) {
        LOG(ERROR)
                << "ResultChannelSender::sendPacket -- packet size exceeds size available in FMQ";
        const std::vector<FmqResultDatum> errorPacket =
                serialize(V1_0::ErrorStatus::GENERAL_FAILURE, {}, kNoTiming);

        // Always send the packet with "blocking" because this signals the futex
        // and unblocks the consumer if it is waiting on the futex.
        return mFmqResultChannel->writeBlocking(errorPacket.data(), errorPacket.size());
    }

    // Always send the packet with "blocking" because this signals the futex and
    // unblocks the consumer if it is waiting on the futex.
    return mFmqResultChannel->writeBlocking(packet.data(), packet.size());
}

// ExecutionBurstServer methods

sp<ExecutionBurstServer> ExecutionBurstServer::create(
        const sp<IBurstCallback>& callback, const MQDescriptorSync<FmqRequestDatum>& requestChannel,
        const MQDescriptorSync<FmqResultDatum>& resultChannel,
        std::shared_ptr<IBurstExecutorWithCache> executorWithCache,
        std::chrono::microseconds pollingTimeWindow) {
    // check inputs
    if (callback == nullptr || executorWithCache == nullptr) {
        LOG(ERROR) << "ExecutionBurstServer::create passed a nullptr";
        return nullptr;
    }

    // create FMQ objects
    std::unique_ptr<RequestChannelReceiver> requestChannelReceiver =
            RequestChannelReceiver::create(requestChannel, pollingTimeWindow);
    std::unique_ptr<ResultChannelSender> resultChannelSender =
            ResultChannelSender::create(resultChannel);

    // check FMQ objects
    if (!requestChannelReceiver || !resultChannelSender) {
        LOG(ERROR) << "ExecutionBurstServer::create failed to create FastMessageQueue";
        return nullptr;
    }

    // make and return context
    return new ExecutionBurstServer(callback, std::move(requestChannelReceiver),
                                    std::move(resultChannelSender), std::move(executorWithCache));
}

sp<ExecutionBurstServer> ExecutionBurstServer::create(
        const sp<IBurstCallback>& callback, const MQDescriptorSync<FmqRequestDatum>& requestChannel,
        const MQDescriptorSync<FmqResultDatum>& resultChannel, V1_2::IPreparedModel* preparedModel,
        std::chrono::microseconds pollingTimeWindow) {
    // check relevant input
    if (preparedModel == nullptr) {
        LOG(ERROR) << "ExecutionBurstServer::create passed a nullptr";
        return nullptr;
    }

    // adapt IPreparedModel to have caching
    const std::shared_ptr<DefaultBurstExecutorWithCache> preparedModelAdapter =
            std::make_shared<DefaultBurstExecutorWithCache>(preparedModel);

    // make and return context
    return ExecutionBurstServer::create(callback, requestChannel, resultChannel,
                                        preparedModelAdapter, pollingTimeWindow);
}

ExecutionBurstServer::ExecutionBurstServer(
        const sp<IBurstCallback>& callback, std::unique_ptr<RequestChannelReceiver> requestChannel,
        std::unique_ptr<ResultChannelSender> resultChannel,
        std::shared_ptr<IBurstExecutorWithCache> executorWithCache)
    : mCallback(callback),
      mRequestChannelReceiver(std::move(requestChannel)),
      mResultChannelSender(std::move(resultChannel)),
      mExecutorWithCache(std::move(executorWithCache)) {
    // TODO: highly document the threading behavior of this class
    mWorker = std::thread([this] { task(); });
}

ExecutionBurstServer::~ExecutionBurstServer() {
    // set teardown flag
    mTeardown = true;
    mRequestChannelReceiver->invalidate();

    // wait for task thread to end
    mWorker.join();
}

hardware::Return<void> ExecutionBurstServer::freeMemory(int32_t slot) {
    std::lock_guard<std::mutex> hold(mMutex);
    mExecutorWithCache->removeCacheEntry(slot);
    return hardware::Void();
}

void ExecutionBurstServer::ensureCacheEntriesArePresentLocked(const std::vector<int32_t>& slots) {
    const auto slotIsKnown = [this](int32_t slot) {
        return mExecutorWithCache->isCacheEntryPresent(slot);
    };

    // find unique unknown slots
    std::vector<int32_t> unknownSlots = slots;
    auto unknownSlotsEnd = unknownSlots.end();
    std::sort(unknownSlots.begin(), unknownSlotsEnd);
    unknownSlotsEnd = std::unique(unknownSlots.begin(), unknownSlotsEnd);
    unknownSlotsEnd = std::remove_if(unknownSlots.begin(), unknownSlotsEnd, slotIsKnown);
    unknownSlots.erase(unknownSlotsEnd, unknownSlots.end());

    // quick-exit if all slots are known
    if (unknownSlots.empty()) {
        return;
    }

    V1_0::ErrorStatus errorStatus = V1_0::ErrorStatus::GENERAL_FAILURE;
    std::vector<hardware::hidl_memory> returnedMemories;
    auto cb = [&errorStatus, &returnedMemories](
                      V1_0::ErrorStatus status,
                      const hardware::hidl_vec<hardware::hidl_memory>& memories) {
        errorStatus = status;
        returnedMemories = memories;
    };

    const hardware::Return<void> ret = mCallback->getMemories(unknownSlots, cb);

    if (!ret.isOk() || errorStatus != V1_0::ErrorStatus::NONE ||
        returnedMemories.size() != unknownSlots.size()) {
        LOG(ERROR) << "Error retrieving memories";
        return;
    }

    // add memories to unknown slots
    for (size_t i = 0; i < unknownSlots.size(); ++i) {
        mExecutorWithCache->addCacheEntry(returnedMemories[i], unknownSlots[i]);
    }
}

void ExecutionBurstServer::task() {
    // loop until the burst object is being destroyed
    while (!mTeardown) {
        // receive request
        auto arguments = mRequestChannelReceiver->getBlocking();

        // if the request packet was not properly received, return a generic
        // error and skip the execution
        //
        // if the  burst is being torn down, skip the execution exection so the
        // "task" function can end
        if (!arguments) {
            if (!mTeardown) {
                mResultChannelSender->send(V1_0::ErrorStatus::GENERAL_FAILURE, {}, kNoTiming);
            }
            continue;
        }

        // otherwise begin tracing execution
        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                     "ExecutionBurstServer getting memory, executing, and returning results");

        // unpack the arguments; types are Request, std::vector<int32_t>, and
        // MeasureTiming, respectively
        const auto [requestWithoutPools, slotsOfPools, measure] = std::move(*arguments);

        // ensure executor with cache has required memory
        std::lock_guard<std::mutex> hold(mMutex);
        ensureCacheEntriesArePresentLocked(slotsOfPools);

        // perform computation; types are ErrorStatus, hidl_vec<OutputShape>,
        // and Timing, respectively
        const auto [errorStatus, outputShapes, returnedTiming] =
                mExecutorWithCache->execute(requestWithoutPools, slotsOfPools, measure);

        // return result
        mResultChannelSender->send(errorStatus, outputShapes, returnedTiming);
    }
}

}  // namespace android::nn