/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "TaskProcessor.h"

#include <sys/prctl.h>

#include <algorithm>
#include <cassert>
#include <functional>
#include <mutex>
#include <thread>

#include "RenderScriptToolkit.h"
#include "Utils.h"

#define LOG_TAG "renderscript.toolkit.TaskProcessor"

namespace android {
namespace renderscript {

namespace {

/**
 * Upper bound on the number of pool threads spawned when the caller lets us pick.
 * Through empirical testing, we've found that using more threads than this does not help.
 */
constexpr unsigned int kMaxAutoPoolThreads = 6u;

/**
 * Returns the number of worker pool threads to spawn.
 *
 * We re-use the thread that calls the processor doTask method, so we spawn one less
 * worker pool thread than the total number of threads.
 *
 * @param requestedThreads Total number of threads requested, or 0 to decide based on the
 *                         number of cores.
 * @return The number of threads to create for the pool. May be 0.
 */
unsigned int choosePoolThreadCount(unsigned int requestedThreads) {
    if (requestedThreads != 0) {
        return requestedThreads - 1;
    }
    const unsigned int cores = std::thread::hardware_concurrency();
    if (cores == 0) {
        // hardware_concurrency() is allowed to return 0 when the value is not computable.
        // Subtracting one from it would wrap around; use the cap explicitly rather than
        // relying on unsigned overflow to get there.
        return kMaxAutoPoolThreads;
    }
    // There may be more optimal choices to make depending on the SoC but we'll stick to
    // this simple heuristic for now.
    return std::min(kMaxAutoPoolThreads, cores - 1);
}

}  // namespace

/**
 * Splits the area to process into a 2D grid of tiles and records the grid geometry in
 * mTilesPerRow, mTilesPerColumn, mCellsPerTileX and mCellsPerTileY.
 *
 * @param targetTileSizeInBytes The size we would like each tile to have. Values smaller than
 *                              1000 are raised to 1000.
 * @return The total number of tiles, i.e. the number of processTile calls needed. Returns 0
 *         when there is nothing to process.
 */
int Task::setTiling(unsigned int targetTileSizeInBytes) {
    // Empirically, values smaller than 1000 are unlikely to give good performance.
    targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
    const size_t cellSizeInBytes = mVectorSize;  // If we add float support, vectorSize * 4 for that.
    const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
    assert(targetCellsPerTile > 0);

    size_t cellsToProcessY;
    size_t cellsToProcessX;
    if (mRestriction == nullptr) {
        cellsToProcessX = mSizeX;
        cellsToProcessY = mSizeY;
    } else {
        assert(mRestriction->endX > mRestriction->startX);
        assert(mRestriction->endY > mRestriction->startY);
        // When asserts are compiled out, an inverted restriction would make the unsigned
        // subtraction wrap around to a huge cell count. Treat it as an empty area instead.
        cellsToProcessX = mRestriction->endX > mRestriction->startX
                                  ? mRestriction->endX - mRestriction->startX
                                  : 0;
        cellsToProcessY = mRestriction->endY > mRestriction->startY
                                  ? mRestriction->endY - mRestriction->startY
                                  : 0;
    }

    if (cellsToProcessX == 0 || cellsToProcessY == 0) {
        // Nothing to do. Bail out before the computations below, which would otherwise use
        // a tile count of zero as a divisor.
        mTilesPerRow = 0;
        mTilesPerColumn = 0;
        mCellsPerTileX = 0;
        mCellsPerTileY = 0;
        return 0;
    }

    // We want rows as large as possible, as the SIMD code we have is more efficient with
    // large rows.
    mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
    // Once we know the number of tiles per row, we divide that row evenly. We round up to make
    // sure all cells are included in the last tile of the row.
    mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);

    // We do the same thing for the Y direction.
    size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
    mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
    mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);

    return static_cast<int>(mTilesPerRow * mTilesPerColumn);
}

/**
 * Computes the rectangle covered by the specified tile and calls processData on it.
 *
 * @param threadIndex Index of the calling thread, passed through to processData.
 * @param tileIndex   Index of the tile in the grid set up by setTiling, counted row by row.
 */
void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
    // Figure out the overall boundaries.
    size_t startWorkX;
    size_t startWorkY;
    size_t endWorkX;
    size_t endWorkY;
    if (mRestriction == nullptr) {
        startWorkX = 0;
        startWorkY = 0;
        endWorkX = mSizeX;
        endWorkY = mSizeY;
    } else {
        startWorkX = mRestriction->startX;
        startWorkY = mRestriction->startY;
        endWorkX = mRestriction->endX;
        endWorkY = mRestriction->endY;
    }

    // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
    // first the X, Y coordinate of our tile in that grid.
    size_t tileIndexY = tileIndex / mTilesPerRow;
    size_t tileIndexX = tileIndex % mTilesPerRow;
    // Calculate the starting and ending point of that tile. The last tile of a row or column
    // is clamped to the work area, as the per-tile sizes were rounded up.
    size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
    size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
    size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
    size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);

    // Call the derived class to do the specific work.
    if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
        // When the tile covers entire rows, we can take advantage that some ops are not 2D.
        // The rows of the tile are presented as a single long row starting at startCellY.
        processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
    } else {
        processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
    }
}

/**
 * Creates the processor and starts its worker pool threads.
 *
 * @param numThreads Total number of threads to use, including the thread that calls doTask.
 *                   If 0, the number is chosen based on the number of cores.
 */
TaskProcessor::TaskProcessor(unsigned int numThreads)
    : mUsesSimd{cpuSupportsSimd()},
      mNumberOfPoolThreads{choosePoolThreadCount(numThreads)} {
    for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
        // Thread index 0 is reserved for the thread calling doTask, so the pool starts at 1.
        mPoolThreads.emplace_back(
                std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
    }
}

/**
 * Asks the pool threads to stop and waits for all of them to exit.
 */
TaskProcessor::~TaskProcessor() {
    {
        std::lock_guard lock(mQueueMutex);
        mStopThreads = true;
        mWorkAvailableOrStop.notify_all();
    }

    for (auto& thread : mPoolThreads) {
        thread.join();
    }
}

/**
 * Picks tiles of the current task and processes them until there are none left to start.
 *
 * @param threadIndex      Index of the calling thread. 0 is the thread that called doTask;
 *                         pool threads use 1 and up.
 * @param returnWhenNoWork If true, return once there are no tiles left to start. If false,
 *                         wait for more work and return only when asked to stop.
 */
void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
    if (threadIndex != 0) {
        // Set the name of the thread, except for thread 0, which is not part of the pool.
        // PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
        char name[16]{"RenderScToolkit"};
        prctl(PR_SET_NAME, name, 0, 0, 0);
        // ALOGI("Starting thread%d", threadIndex);
    }

    std::unique_lock lock(mQueueMutex);
    while (true) {
        mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) {
            return mStopThreads || (mTilesNotYetStarted > 0) ||
                   (returnWhenNoWork && (mTilesNotYetStarted == 0));
        });
        // ALOGI("Woke thread%d", threadIndex);

        // This ScopedLockAssertion is to help the compiler when it checks thread annotations
        // to realize that we have the lock. It's however not completely true; we don't
        // hold the lock while processing the tile.
        // TODO Figure out how to fix that.
        android::base::ScopedLockAssertion lockAssert(mQueueMutex);
        if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
            break;
        }

        while (mTilesNotYetStarted > 0 && !mStopThreads) {
            // This picks the tiles in decreasing order but that does not matter.
            int myTile = --mTilesNotYetStarted;
            mTilesInProcess++;
            // Release the queue lock while doing the actual work so that other threads can
            // pick up tiles concurrently.
            lock.unlock();
            {
                // We won't be executing this code unless the main thread is
                // holding the mTaskMutex lock, which guards mCurrentTask.
                // The compiler can't figure this out.
                android::base::ScopedLockAssertion lockAssert(mTaskMutex);
                mCurrentTask->processTile(threadIndex, myTile);
            }
            lock.lock();
            mTilesInProcess--;
            if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
                mWorkIsFinished.notify_one();
            }
        }
    }
    // if (threadIndex != 0) {
    //     ALOGI("Ending thread%d", threadIndex);
    // }
}

/**
 * Runs the specified task to completion, using the pool threads and the calling thread.
 * Holds mTaskMutex for the whole duration of the call.
 *
 * @param task The task to execute.
 */
void TaskProcessor::doTask(Task* task) {
    std::lock_guard lockGuard(mTaskMutex);
    task->setUsesSimd(mUsesSimd);
    mCurrentTask = task;
    // Notify the thread pool of available work.
    startWork(task);
    // Start processing some of the tiles on the calling thread.
    processTilesOfWork(0, true);
    // Wait for all the pool workers to complete.
    waitForPoolWorkersToComplete();
    mCurrentTask = nullptr;
}

/**
 * Tiles the task, publishes the number of tiles to process and wakes the pool threads.
 *
 * @param task The task about to be executed.
 */
void TaskProcessor::startWork(Task* task) {
    /**
     * The size in bytes that we're hoping each tile will be. If this value is too small,
     * we'll spend too much time in synchronization. If it's too large, some cores may be
     * idle while others still have a lot of work to do. Ideally, it would depend on the
     * device we're running. 16k is the same value used by RenderScript and seems reasonable
     * from ad-hoc tests.
     */
    const size_t targetTileSize = 16 * 1024;

    std::lock_guard lock(mQueueMutex);
    assert(mTilesInProcess == 0);
    mTilesNotYetStarted = task->setTiling(targetTileSize);
    mWorkAvailableOrStop.notify_all();
}

/**
 * Blocks until every tile of the current task has been started and has finished processing.
 */
void TaskProcessor::waitForPoolWorkersToComplete() {
    std::unique_lock lock(mQueueMutex);
    // The predicate, i.e. the lambda, will make sure that
    // we terminate even if the main thread calls this after
    // mWorkIsFinished is signaled.
    mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) {
        return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
    });
}

}  // namespace renderscript
}  // namespace android