@@ -27,6 +27,38 @@
namespace libcamera {
+/*
+ * Per-thread state and line-processing helpers for DebayerCpu.
+ *
+ * Each instance renders the lines [yStart_, yEnd_) of the debayer window and
+ * owns the line buffers used for the optional input memcpy, so that nothing
+ * mutable is shared between threads. The class derives from Object; the
+ * commit message states this is to allow invokeMethod() based thread re-use
+ * in follow-up patches.
+ */
+class DebayerCpuThread : public Object
+{
+public:
+	DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex,
+			 bool enableInputMemcpy);
+
+	/* Set the line range to render and (re)size the line buffers */
+	void configure(unsigned int yStart, unsigned int yEnd);
+	void setupInputMemcpy(const uint8_t *linePointers[]);
+	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
+	void memcpyNextLine(const uint8_t *linePointers[]);
+	/* Offsets src/dst to this thread's slice, then runs process2() or process4() */
+	void process(uint32_t frame, const uint8_t *src, uint8_t *dst);
+	void process2(uint32_t frame, const uint8_t *src, uint8_t *dst);
+	void process4(uint32_t frame, const uint8_t *src, uint8_t *dst);
+
+	/* Owning DebayerCpu, used to reach its configuration, window and stats */
+	DebayerCpu *debayer_;
+	/* Index of this thread, forwarded to the stats processLine*() calls */
+	unsigned int threadIndex_;
+	/*
+	 * The members below are only assigned by configure() and by the
+	 * input memcpy helpers. Give them defined values so that an object
+	 * which has not been configured yet never holds indeterminate state.
+	 */
+	unsigned int yStart_ = 0;
+	unsigned int yEnd_ = 0;
+	unsigned int lineBufferLength_ = 0;
+	unsigned int lineBufferPadding_ = 0;
+	unsigned int lineBufferIndex_ = 0;
+	std::vector<uint8_t> lineBuffers_[DebayerCpu::kMaxLineBuffers];
+	bool enableInputMemcpy_;
+};
+
+/*
+ * Bind the thread object to its owning DebayerCpu and record its index and
+ * whether input lines are to be copied into local line buffers. The line
+ * range and line buffer sizes are set up afterwards by configure().
+ */
+DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex,
+				   bool enableInputMemcpy)
+	: debayer_{ debayer },
+	  threadIndex_{ threadIndex },
+	  enableInputMemcpy_{ enableInputMemcpy }
+{
+}
+
/**
* \class DebayerCpu
* \brief Class for debayering on the CPU
@@ -53,8 +85,14 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
* \todo Make memcpy automatic based on runtime detection of platform
* capabilities.
*/
- enableInputMemcpy_ =
+ bool enableInputMemcpy =
configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
+
+ /* Just one thread object for now, which will be called inline rather than async */
+ threads_.resize(1);
+
+ for (unsigned int i = 0; i < threads_.size(); i++)
+ threads_[i] = new DebayerCpuThread(this, i, enableInputMemcpy);
}
-DebayerCpu::~DebayerCpu() = default;
+/*
+ * The DebayerCpuThread objects stored in threads_ are allocated with new in
+ * the constructor and held as raw pointers, so a defaulted destructor would
+ * leak them. Free them explicitly here.
+ */
+DebayerCpu::~DebayerCpu()
+{
+	for (DebayerCpuThread *thread : threads_)
+		delete thread;
+}
@@ -484,7 +522,7 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg,
if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0)
return -EINVAL;
- if (stats_->configure(inputCfg) != 0)
+ if (stats_->configure(inputCfg, threads_.size()) != 0)
return -EINVAL;
const Size &statsPatternSize = stats_->patternSize();
@@ -548,17 +586,36 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg,
*/
stats_->setWindow(Rectangle(window_.size()));
+ unsigned int yStart = 0;
+ unsigned int linesPerThread = (window_.height / threads_.size()) &
+ ~(inputConfig_.patternSize.height - 1);
+ unsigned int i;
+
+ for (i = 0; i < (threads_.size() - 1); i++) {
+ threads_[i]->configure(yStart, yStart + linesPerThread);
+ yStart += linesPerThread;
+ }
+ threads_[i]->configure(yStart, window_.height);
+
+ return 0;
+}
+
+void DebayerCpuThread::configure(unsigned int yStart, unsigned int yEnd)
+{
+ Debayer::DebayerInputConfig &inputConfig = debayer_->inputConfig_;
+
+ yStart_ = yStart;
+ yEnd_ = yEnd;
+
/* pad with patternSize.Width on both left and right side */
- lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8;
- lineBufferLength_ = window_.width * inputConfig_.bpp / 8 +
+ lineBufferPadding_ = inputConfig.patternSize.width * inputConfig.bpp / 8;
+ lineBufferLength_ = debayer_->window_.width * inputConfig.bpp / 8 +
2 * lineBufferPadding_;
if (enableInputMemcpy_) {
- for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++)
+ for (unsigned int i = 0; i <= inputConfig.patternSize.height; i++)
lineBuffers_[i].resize(lineBufferLength_);
}
-
- return 0;
}
/*
@@ -599,9 +656,9 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
return std::make_tuple(stride, stride * size.height);
}
-void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
+void DebayerCpuThread::setupInputMemcpy(const uint8_t *linePointers[])
{
- const unsigned int patternHeight = inputConfig_.patternSize.height;
+ const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
if (!enableInputMemcpy_)
return;
@@ -617,20 +674,20 @@ void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
lineBufferIndex_ = patternHeight;
}
-void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
+void DebayerCpuThread::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
{
- const unsigned int patternHeight = inputConfig_.patternSize.height;
+ const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
for (unsigned int i = 0; i < patternHeight; i++)
linePointers[i] = linePointers[i + 1];
- linePointers[patternHeight] = src +
- (patternHeight / 2) * (int)inputConfig_.stride;
+ linePointers[patternHeight] =
+ src + (patternHeight / 2) * (int)debayer_->inputConfig_.stride;
}
-void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
+void DebayerCpuThread::memcpyNextLine(const uint8_t *linePointers[])
{
- const unsigned int patternHeight = inputConfig_.patternSize.height;
+ const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
if (!enableInputMemcpy_)
return;
@@ -643,23 +700,42 @@ void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1);
}
-void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
+/*
+ * Render this thread's slice of the window for one frame.
+ *
+ * src and dst are the start of the input and output frame data. Both are
+ * advanced to the first line of the slice (and src also to the left edge of
+ * the window) before the work is handed to process2() or process4(),
+ * depending on the height of the bayer pattern.
+ */
+void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst)
{
- unsigned int yEnd = window_.height;
+	const auto &inputCfg = debayer_->inputConfig_;
+	const Rectangle &win = debayer_->window_;
+
+	/* Offset of the top left corner of the window, yStart_ lines down */
+	const auto srcOffset = (win.y + yStart_) * inputCfg.stride +
+			       win.x * inputCfg.bpp / 8;
+	/* The output buffer only holds the window, so only yStart_ applies */
+	const auto dstOffset = yStart_ * debayer_->outputConfig_.stride;
+
+	src += srcOffset;
+	dst += dstOffset;
+
+	if (inputCfg.patternSize.height != 2) {
+		process4(frame, src, dst);
+		return;
+	}
+
+	process2(frame, src, dst);
+}
+
+void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
+{
+ unsigned int outputStride = debayer_->outputConfig_.stride;
+ unsigned int inputStride = debayer_->inputConfig_.stride;
+ Rectangle &window = debayer_->window_;
+ unsigned int yEnd = yEnd_;
/* Holds [0] previous- [1] current- [2] next-line */
const uint8_t *linePointers[3];
- /* Adjust src to top left corner of the window */
- src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
-
/* [x] becomes [x - 1] after initial shiftLinePointers() call */
- if (window_.y) {
- linePointers[1] = src - inputConfig_.stride; /* previous-line */
+ if (window.y + yStart_) {
+ linePointers[1] = src - inputStride; /* previous-line */
linePointers[2] = src;
} else {
- /* window_.y == 0, use the next line as prev line */
- linePointers[1] = src + inputConfig_.stride;
+ /* Top line, use the next line as prev line */
+ linePointers[1] = src + inputStride;
linePointers[2] = src;
+ }
+
+ if (window.y == 0 && yEnd_ == window.height) {
/*
* Last 2 lines also need special handling.
* (And configure() ensures that yEnd >= 2.)
@@ -669,83 +745,93 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
setupInputMemcpy(linePointers);
- for (unsigned int y = 0; y < yEnd; y += 2) {
+ /*
+ * Note y is the line-number *inside* the window, since stats_' window
+ * is the stats window inside/relative to the debayer window. IOW for
+ * single thread rendering y goes from 0 to window.height.
+ */
+ for (unsigned int y = yStart_; y < yEnd; y += 2) {
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
- stats_->processLine0(frame, y, linePointers);
- (this->*debayer0_)(dst, linePointers);
- src += inputConfig_.stride;
- dst += outputConfig_.stride;
+ debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_);
+ debayer_->debayer0(dst, linePointers);
+ src += inputStride;
+ dst += outputStride;
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
- (this->*debayer1_)(dst, linePointers);
- src += inputConfig_.stride;
- dst += outputConfig_.stride;
+ debayer_->debayer1(dst, linePointers);
+ src += inputStride;
+ dst += outputStride;
}
- if (window_.y == 0) {
+ if (window.y == 0 && yEnd_ == window.height) {
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
- stats_->processLine0(frame, yEnd, linePointers);
- (this->*debayer0_)(dst, linePointers);
- src += inputConfig_.stride;
- dst += outputConfig_.stride;
+ debayer_->stats_->processLine0(frame, yEnd, linePointers, threadIndex_);
+ debayer_->debayer0(dst, linePointers);
+ src += inputStride;
+ dst += outputStride;
shiftLinePointers(linePointers, src);
/* next line may point outside of src, use prev. */
linePointers[2] = linePointers[0];
- (this->*debayer1_)(dst, linePointers);
- src += inputConfig_.stride;
- dst += outputConfig_.stride;
+ debayer_->debayer1(dst, linePointers);
+ src += inputStride;
+ dst += outputStride;
}
}
-void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
+void DebayerCpuThread::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
{
+ unsigned int outputStride = debayer_->outputConfig_.stride;
+ unsigned int inputStride = debayer_->inputConfig_.stride;
+
/*
* This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line
* [3] 1-line-down [4] 2-lines-down.
*/
const uint8_t *linePointers[5];
- /* Adjust src to top left corner of the window */
- src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
-
/* [x] becomes [x - 1] after initial shiftLinePointers() call */
- linePointers[1] = src - 2 * inputConfig_.stride;
- linePointers[2] = src - inputConfig_.stride;
+ linePointers[1] = src - 2 * inputStride;
+ linePointers[2] = src - inputStride;
linePointers[3] = src;
- linePointers[4] = src + inputConfig_.stride;
+ linePointers[4] = src + inputStride;
setupInputMemcpy(linePointers);
- for (unsigned int y = 0; y < window_.height; y += 4) {
+ /*
+ * Note y is the line-number *inside* the window, since stats_' window
+ * is the stats window inside/relative to the debayer window. IOW for
+ * single thread rendering y goes from 0 to window.height.
+ */
+ for (unsigned int y = yStart_; y < yEnd_; y += 4) {
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
- stats_->processLine0(frame, y, linePointers);
- (this->*debayer0_)(dst, linePointers);
- src += inputConfig_.stride;
- dst += outputConfig_.stride;
+ debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_);
+ debayer_->debayer0(dst, linePointers);
+ src += inputStride;
+ dst += outputStride;
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
- (this->*debayer1_)(dst, linePointers);
- src += inputConfig_.stride;
- dst += outputConfig_.stride;
+ debayer_->debayer1(dst, linePointers);
+ src += inputStride;
+ dst += outputStride;
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
- stats_->processLine2(frame, y, linePointers);
- (this->*debayer2_)(dst, linePointers);
- src += inputConfig_.stride;
- dst += outputConfig_.stride;
+ debayer_->stats_->processLine2(frame, y, linePointers, threadIndex_);
+ debayer_->debayer2(dst, linePointers);
+ src += inputStride;
+ dst += outputStride;
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
- (this->*debayer3_)(dst, linePointers);
- src += inputConfig_.stride;
- dst += outputConfig_.stride;
+ debayer_->debayer3(dst, linePointers);
+ src += inputStride;
+ dst += outputStride;
}
}
@@ -867,10 +953,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
stats_->startFrame(frame);
- if (inputConfig_.patternSize.height == 2)
- process2(frame, in.planes()[0].data(), out.planes()[0].data());
- else
- process4(frame, in.planes()[0].data(), out.planes()[0].data());
+ threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data());
metadata.planes()[0].bytesused = out.planes()[0].size();
@@ -26,6 +26,7 @@
namespace libcamera {
+class DebayerCpuThread;
class DebayerCpu : public Debayer
{
public:
@@ -44,6 +45,8 @@ public:
const SharedFD &getStatsFD() { return stats_->getStatsFD(); }
private:
+ friend class DebayerCpuThread;
+
/**
* \brief Called to debayer 1 line of Bayer input data to output format
* \param[out] dst Pointer to the start of the output line to write
@@ -74,6 +77,11 @@ private:
*/
using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
+	/*
+	 * Forwarders for the debayerN_ member-function pointers, called by
+	 * DebayerCpuThread to debayer one line with the selected handler.
+	 */
+	void debayer0(uint8_t *dst, const uint8_t *src[])
+	{
+		(this->*debayer0_)(dst, src);
+	}
+	void debayer1(uint8_t *dst, const uint8_t *src[])
+	{
+		(this->*debayer1_)(dst, src);
+	}
+	void debayer2(uint8_t *dst, const uint8_t *src[])
+	{
+		(this->*debayer2_)(dst, src);
+	}
+	void debayer3(uint8_t *dst, const uint8_t *src[])
+	{
+		(this->*debayer3_)(dst, src);
+	}
+
/* 8-bit raw bayer format */
template<bool addAlphaByte, bool ccmEnabled>
void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
@@ -105,11 +113,6 @@ private:
int setDebayerFunctions(PixelFormat inputFormat,
PixelFormat outputFormat,
bool ccmEnabled);
- void setupInputMemcpy(const uint8_t *linePointers[]);
- void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
- void memcpyNextLine(const uint8_t *linePointers[]);
- void process2(uint32_t frame, const uint8_t *src, uint8_t *dst);
- void process4(uint32_t frame, const uint8_t *src, uint8_t *dst);
void updateGammaTable(const DebayerParams ¶ms);
void updateLookupTables(const DebayerParams ¶ms);
@@ -142,12 +145,9 @@ private:
debayerFn debayer3_;
Rectangle window_;
std::unique_ptr<SwStatsCpu> stats_;
- std::vector<uint8_t> lineBuffers_[kMaxLineBuffers];
- unsigned int lineBufferLength_;
- unsigned int lineBufferPadding_;
- unsigned int lineBufferIndex_;
unsigned int xShift_; /* Offset of 0/1 applied to window_.x */
- bool enableInputMemcpy_;
+
+ std::vector<DebayerCpuThread *>threads_;
};
} /* namespace libcamera */
Add a DebayerCpuThread class and use this in the inner render loop. This contains data which needs to be separate per thread. This is a preparation patch for making DebayerCpu support multi-threading. Benchmarking on the Arduino Uno-Q, whose weak CPU makes it a good target for performance testing, shows 146-147 ms per 3272x2464 frame both before and after this change, with things maybe being 0.5 ms slower after this change. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- Changes in v2: - Replace the DebayerCpuThreadData struct from v1 with a DebayerCpuThread class, derived from Object to allow calling invokeMethod for thread re-use in follow-up patches - As part of this also move a bunch of methods which primarily deal with per-thread data: setupInputMemcpy(), shiftLinePointers(), memcpyNextLine(), process*() to the new DebayerCpuThread class --- src/libcamera/software_isp/debayer_cpu.cpp | 215 ++++++++++++++------- src/libcamera/software_isp/debayer_cpu.h | 20 +- 2 files changed, 159 insertions(+), 76 deletions(-)