222 lines
8.7 KiB
C++
222 lines
8.7 KiB
C++
/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include "TaskProcessor.h"
|
|
|
|
#include <assert.h>
|
|
#include <sys/prctl.h>
|
|
|
|
#include "RenderScriptToolkit.h"
|
|
#include "Utils.h"
|
|
|
|
#define LOG_TAG "renderscript.toolkit.TaskProcessor"
|
|
|
|
namespace android {
|
|
namespace renderscript {
|
|
|
|
int Task::setTiling(unsigned int targetTileSizeInBytes) {
|
|
// Empirically, values smaller than 1000 are unlikely to give good performance.
|
|
targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
|
|
const size_t cellSizeInBytes =
|
|
mVectorSize; // If we add float support, vectorSize * 4 for that.
|
|
const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
|
|
assert(targetCellsPerTile > 0);
|
|
|
|
size_t cellsToProcessY;
|
|
size_t cellsToProcessX;
|
|
if (mRestriction == nullptr) {
|
|
cellsToProcessX = mSizeX;
|
|
cellsToProcessY = mSizeY;
|
|
} else {
|
|
assert(mRestriction->endX > mRestriction->startX);
|
|
assert(mRestriction->endY > mRestriction->startY);
|
|
cellsToProcessX = mRestriction->endX - mRestriction->startX;
|
|
cellsToProcessY = mRestriction->endY - mRestriction->startY;
|
|
}
|
|
|
|
// We want rows as large as possible, as the SIMD code we have is more efficient with
|
|
// large rows.
|
|
mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
|
|
// Once we know the number of tiles per row, we divide that row evenly. We round up to make
|
|
// sure all cells are included in the last tile of the row.
|
|
mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);
|
|
|
|
// We do the same thing for the Y direction.
|
|
size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
|
|
mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
|
|
mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);
|
|
|
|
return mTilesPerRow * mTilesPerColumn;
|
|
}
|
|
|
|
void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
|
|
// Figure out the overall boundaries.
|
|
size_t startWorkX;
|
|
size_t startWorkY;
|
|
size_t endWorkX;
|
|
size_t endWorkY;
|
|
if (mRestriction == nullptr) {
|
|
startWorkX = 0;
|
|
startWorkY = 0;
|
|
endWorkX = mSizeX;
|
|
endWorkY = mSizeY;
|
|
} else {
|
|
startWorkX = mRestriction->startX;
|
|
startWorkY = mRestriction->startY;
|
|
endWorkX = mRestriction->endX;
|
|
endWorkY = mRestriction->endY;
|
|
}
|
|
// Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
|
|
// first the X, Y coordinate of our tile in that grid.
|
|
size_t tileIndexY = tileIndex / mTilesPerRow;
|
|
size_t tileIndexX = tileIndex % mTilesPerRow;
|
|
// Calculate the starting and ending point of that tile.
|
|
size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
|
|
size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
|
|
size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
|
|
size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);
|
|
|
|
// Call the derived class to do the specific work.
|
|
if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
|
|
// When the tile covers entire rows, we can take advantage that some ops are not 2D.
|
|
processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
|
|
} else {
|
|
processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
|
|
}
|
|
}
|
|
|
|
TaskProcessor::TaskProcessor(unsigned int numThreads)
|
|
: mUsesSimd{cpuSupportsSimd()},
|
|
/* If the requested number of threads is 0, we'll decide based on the number of cores.
|
|
* Through empirical testing, we've found that using more than 6 threads does not help.
|
|
* There may be more optimal choices to make depending on the SoC but we'll stick to
|
|
* this simple heuristic for now.
|
|
*
|
|
* We'll re-use the thread that calls the processor doTask method, so we'll spawn one less
|
|
* worker pool thread than the total number of threads.
|
|
*/
|
|
mNumberOfPoolThreads{numThreads ? numThreads - 1
|
|
: std::min(6u, std::thread::hardware_concurrency() - 1)} {
|
|
for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
|
|
mPoolThreads.emplace_back(
|
|
std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
|
|
}
|
|
}
|
|
|
|
/**
 * Asks the pool threads to stop and waits for each of them to exit.
 */
TaskProcessor::~TaskProcessor() {
    {
        // Raise the stop flag and wake every waiting worker, all under the queue lock.
        std::lock_guard<std::mutex> queueGuard(mQueueMutex);
        mStopThreads = true;
        mWorkAvailableOrStop.notify_all();
    }

    // The lock is released by now, so the workers can observe the flag and return.
    for (auto worker = mPoolThreads.begin(); worker != mPoolThreads.end(); ++worker) {
        worker->join();
    }
}
|
|
|
|
void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
|
|
if (threadIndex != 0) {
|
|
// Set the name of the thread, except for thread 0, which is not part of the pool.
|
|
// PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
|
|
char name[16]{"RenderScToolkit"};
|
|
prctl(PR_SET_NAME, name, 0, 0, 0);
|
|
// ALOGI("Starting thread%d", threadIndex);
|
|
}
|
|
|
|
std::unique_lock<std::mutex> lock(mQueueMutex);
|
|
while (true) {
|
|
mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) {
|
|
return mStopThreads || (mTilesNotYetStarted > 0) ||
|
|
(returnWhenNoWork && (mTilesNotYetStarted == 0));
|
|
});
|
|
// ALOGI("Woke thread%d", threadIndex);
|
|
|
|
// This ScopedLockAssertion is to help the compiler when it checks thread annotations
|
|
// to realize that we have the lock. It's however not completely true; we don't
|
|
// hold the lock while processing the tile.
|
|
// TODO Figure out how to fix that.
|
|
android::base::ScopedLockAssertion lockAssert(mQueueMutex);
|
|
if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
|
|
break;
|
|
}
|
|
|
|
while (mTilesNotYetStarted > 0 && !mStopThreads) {
|
|
// This picks the tiles in decreasing order but that does not matter.
|
|
int myTile = --mTilesNotYetStarted;
|
|
mTilesInProcess++;
|
|
lock.unlock();
|
|
{
|
|
// We won't be executing this code unless the main thread is
|
|
// holding the mTaskMutex lock, which guards mCurrentTask.
|
|
// The compiler can't figure this out.
|
|
android::base::ScopedLockAssertion lockAssert(mTaskMutex);
|
|
mCurrentTask->processTile(threadIndex, myTile);
|
|
}
|
|
lock.lock();
|
|
mTilesInProcess--;
|
|
if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
|
|
mWorkIsFinished.notify_one();
|
|
}
|
|
}
|
|
}
|
|
// if (threadIndex != 0) {
|
|
// ALOGI("Ending thread%d", threadIndex);
|
|
// }
|
|
}
|
|
|
|
/**
 * Runs a task to completion, using the pool threads and the calling thread.
 *
 * mTaskMutex is held for the whole call.
 *
 * @param task The task to run.
 */
void TaskProcessor::doTask(Task* task) {
    std::lock_guard<std::mutex> taskGuard(mTaskMutex);
    mCurrentTask = task;
    task->setUsesSimd(mUsesSimd);

    // Let the pool threads know there's work to pick up.
    startWork(task);
    // Help out: the calling thread processes tiles too, as thread 0, until none are left to
    // start.
    processTilesOfWork(0, true);
    // Tiles claimed by pool threads may still be in flight; wait for them.
    waitForPoolWorkersToComplete();

    mCurrentTask = nullptr;
}
|
|
|
|
/**
 * Divides the task into tiles, publishes the tile count, and wakes the pool threads.
 *
 * @param task The task about to be processed.
 */
void TaskProcessor::startWork(Task* task) {
    /**
     * How large, in bytes, we'd like each tile to be. Too small and synchronization eats up
     * the time; too large and some cores may sit idle while others still have plenty to do.
     * The ideal value would depend on the device. 16k is what RenderScript uses and seems
     * reasonable from ad-hoc tests.
     */
    constexpr unsigned int kTargetTileSizeInBytes = 16 * 1024;

    std::lock_guard<std::mutex> queueGuard(mQueueMutex);
    // No tile should still be in flight when new work starts.
    assert(mTilesInProcess == 0);
    mTilesNotYetStarted = task->setTiling(kTargetTileSizeInBytes);
    mWorkAvailableOrStop.notify_all();
}
|
|
|
|
void TaskProcessor::waitForPoolWorkersToComplete() {
|
|
std::unique_lock<std::mutex> lock(mQueueMutex);
|
|
// The predicate, i.e. the lambda, will make sure that
|
|
// we terminate even if the main thread calls this after
|
|
// mWorkIsFinished is signaled.
|
|
mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) {
|
|
return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
|
|
});
|
|
}
|
|
|
|
} // namespace renderscript
|
|
} // namespace android
|