| Message ID | 20260223160930.27913-3-johannes.goede@oss.qualcomm.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series |
|
| Related | show |
Hi 2026. 02. 23. 17:09 keltezéssel, Hans de Goede írta: > Add a DebayerCpuThread class and use this in the inner render loop. > This contains data which needs to be separate per thread. > > This is a preparation patch for making DebayerCpu support multi-threading. > > Benchmarking on the Arduino Uno-Q with a weak CPU which is good for > performance testing, shows 146-147ms per 3272x2464 frame both before and > after this change, with things maybe being 0.5 ms slower after this change. > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > Changes in v2: > - Replace the DebayerCpuThreadData struct from v1 with a DebayerCpuThread > class, derived from Object to allow calling invokeMethod for thread re-use > in followup patches > - As part of this also move a bunch of methods which primarily deal with > per thread data: setupInputMemcpy(), shiftLinePointers(), memcpyNextLine(), > process*() to the new DebayerCpuThread class > --- > src/libcamera/software_isp/debayer_cpu.cpp | 215 ++++++++++++++------- > src/libcamera/software_isp/debayer_cpu.h | 20 +- > 2 files changed, 159 insertions(+), 76 deletions(-) > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > index e7b012105..122bfbb05 100644 > --- a/src/libcamera/software_isp/debayer_cpu.cpp > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > @@ -27,6 +27,38 @@ > [...] 
> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > index 7a6517462..7196dcdd0 100644 > --- a/src/libcamera/software_isp/debayer_cpu.h > +++ b/src/libcamera/software_isp/debayer_cpu.h > @@ -26,6 +26,7 @@ > > namespace libcamera { > > +class DebayerCpuThread; > class DebayerCpu : public Debayer > { > public: > @@ -44,6 +45,8 @@ public: > const SharedFD &getStatsFD() { return stats_->getStatsFD(); } > > private: > + friend class DebayerCpuThread; > + > /** > * \brief Called to debayer 1 line of Bayer input data to output format > * \param[out] dst Pointer to the start of the output line to write > @@ -74,6 +77,11 @@ private: > */ > using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); > > + void debayer0(uint8_t *dst, const uint8_t *src[]) { (this->*debayer0_)(dst, src); } > + void debayer1(uint8_t *dst, const uint8_t *src[]) { (this->*debayer1_)(dst, src); } > + void debayer2(uint8_t *dst, const uint8_t *src[]) { (this->*debayer2_)(dst, src); } > + void debayer3(uint8_t *dst, const uint8_t *src[]) { (this->*debayer3_)(dst, src); } > + > /* 8-bit raw bayer format */ > template<bool addAlphaByte, bool ccmEnabled> > void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]); > @@ -105,11 +113,6 @@ private: > int setDebayerFunctions(PixelFormat inputFormat, > PixelFormat outputFormat, > bool ccmEnabled); > - void setupInputMemcpy(const uint8_t *linePointers[]); > - void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); > - void memcpyNextLine(const uint8_t *linePointers[]); > - void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); > - void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); > void updateGammaTable(const DebayerParams ¶ms); > void updateLookupTables(const DebayerParams ¶ms); > > @@ -142,12 +145,9 @@ private: > debayerFn debayer3_; > Rectangle window_; > std::unique_ptr<SwStatsCpu> stats_; > - std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; 
> - unsigned int lineBufferLength_; > - unsigned int lineBufferPadding_; > - unsigned int lineBufferIndex_; > unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ > - bool enableInputMemcpy_; > + > + std::vector<DebayerCpuThread *>threads_; This should be `std::unique_ptr<>` or similar. > }; > > } /* namespace libcamera */
Hi Hans, thank you for the update. Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > Add a DebayerCpuThread class and use this in the inner render loop. > This contains data which needs to be separate per thread. > > This is a preparation patch for making DebayerCpu support multi-threading. > > Benchmarking on the Arduino Uno-Q with a weak CPU which is good for > performance testing, shows 146-147ms per 3272x2464 frame both before and > after this change, with things maybe being 0.5 ms slower after this change. > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > Changes in v2: > - Replace the DebayerCpuThreadData struct from v1 with a DebayerCpuThread > class, derived from Object to allow calling invokeMethod for thread re-use > in followup patches > - As part of this also move a bunch of methods which primarily deal with > per thread data: setupInputMemcpy(), shiftLinePointers(), memcpyNextLine(), > process*() to the new DebayerCpuThread class > --- > src/libcamera/software_isp/debayer_cpu.cpp | 215 ++++++++++++++------- > src/libcamera/software_isp/debayer_cpu.h | 20 +- > 2 files changed, 159 insertions(+), 76 deletions(-) > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > index e7b012105..122bfbb05 100644 > --- a/src/libcamera/software_isp/debayer_cpu.cpp > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > @@ -27,6 +27,38 @@ > > namespace libcamera { > > +class DebayerCpuThread : public Object Building it complains about this class and its members not being documented. 
> +{ > +public: > + DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, > + bool enableInputMemcpy); > + > + void configure(unsigned int yStart, unsigned int yEnd); > + void setupInputMemcpy(const uint8_t *linePointers[]); > + void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); > + void memcpyNextLine(const uint8_t *linePointers[]); > + void process(uint32_t frame, const uint8_t *src, uint8_t *dst); > + void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); > + void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); > + > + DebayerCpu *debayer_; > + unsigned int threadIndex_; > + unsigned int yStart_; > + unsigned int yEnd_; > + unsigned int lineBufferLength_; > + unsigned int lineBufferPadding_; > + unsigned int lineBufferIndex_; > + std::vector<uint8_t> lineBuffers_[DebayerCpu::kMaxLineBuffers]; > + bool enableInputMemcpy_; > +}; > + > +DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, > + bool enableInputMemcpy) > + : debayer_(debayer), threadIndex_(threadIndex), > + enableInputMemcpy_(enableInputMemcpy) > +{ > +} > + > /** > * \class DebayerCpu > * \brief Class for debayering on the CPU > @@ -53,8 +85,14 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat > * \todo Make memcpy automatic based on runtime detection of platform > * capabilities. 
> */ > - enableInputMemcpy_ = > + bool enableInputMemcpy = > configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); > + > + /* Just one thread object for now, which will be called inline rather than async */ > + threads_.resize(1); > + > + for (unsigned int i = 0; i < threads_.size(); i++) > + threads_[i] = new DebayerCpuThread(this, i, enableInputMemcpy); > } > > DebayerCpu::~DebayerCpu() = default; > @@ -484,7 +522,7 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, > if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0) > return -EINVAL; > > - if (stats_->configure(inputCfg) != 0) > + if (stats_->configure(inputCfg, threads_.size()) != 0) > return -EINVAL; > > const Size &statsPatternSize = stats_->patternSize(); > @@ -548,17 +586,36 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, > */ > stats_->setWindow(Rectangle(window_.size())); > > + unsigned int yStart = 0; > + unsigned int linesPerThread = (window_.height / threads_.size()) & > + ~(inputConfig_.patternSize.height - 1); > + unsigned int i; > + > + for (i = 0; i < (threads_.size() - 1); i++) { > + threads_[i]->configure(yStart, yStart + linesPerThread); > + yStart += linesPerThread; > + } > + threads_[i]->configure(yStart, window_.height); > + > + return 0; > +} > + > +void DebayerCpuThread::configure(unsigned int yStart, unsigned int yEnd) > +{ > + Debayer::DebayerInputConfig &inputConfig = debayer_->inputConfig_; > + > + yStart_ = yStart; > + yEnd_ = yEnd; > + > /* pad with patternSize.Width on both left and right side */ > - lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8; > - lineBufferLength_ = window_.width * inputConfig_.bpp / 8 + > + lineBufferPadding_ = inputConfig.patternSize.width * inputConfig.bpp / 8; > + lineBufferLength_ = debayer_->window_.width * inputConfig.bpp / 8 + > 2 * lineBufferPadding_; > > if (enableInputMemcpy_) { > - for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++) > 
+ for (unsigned int i = 0; i <= inputConfig.patternSize.height; i++) > lineBuffers_[i].resize(lineBufferLength_); > } > - > - return 0; > } > > /* > @@ -599,9 +656,9 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size > return std::make_tuple(stride, stride * size.height); > } > > -void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) > +void DebayerCpuThread::setupInputMemcpy(const uint8_t *linePointers[]) > { > - const unsigned int patternHeight = inputConfig_.patternSize.height; > + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; > > if (!enableInputMemcpy_) > return; > @@ -617,20 +674,20 @@ void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) > lineBufferIndex_ = patternHeight; > } > > -void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src) > +void DebayerCpuThread::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src) > { > - const unsigned int patternHeight = inputConfig_.patternSize.height; > + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; > > for (unsigned int i = 0; i < patternHeight; i++) > linePointers[i] = linePointers[i + 1]; > > - linePointers[patternHeight] = src + > - (patternHeight / 2) * (int)inputConfig_.stride; > + linePointers[patternHeight] = > + src + (patternHeight / 2) * (int)debayer_->inputConfig_.stride; > } > > -void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[]) > +void DebayerCpuThread::memcpyNextLine(const uint8_t *linePointers[]) > { > - const unsigned int patternHeight = inputConfig_.patternSize.height; > + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; > > if (!enableInputMemcpy_) > return; > @@ -643,23 +700,42 @@ void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[]) > lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1); > } > > -void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > 
+void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) > { > - unsigned int yEnd = window_.height; > + Rectangle &window = debayer_->window_; > + > + /* Adjust src to top left corner of the window */ > + src += (window.y + yStart_) * debayer_->inputConfig_.stride + > + window.x * debayer_->inputConfig_.bpp / 8; > + /* Adjust dst for yStart_ */ > + dst += yStart_ * debayer_->outputConfig_.stride; > + > + if (debayer_->inputConfig_.patternSize.height == 2) > + process2(frame, src, dst); > + else > + process4(frame, src, dst); > +} > + > +void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > +{ > + unsigned int outputStride = debayer_->outputConfig_.stride; > + unsigned int inputStride = debayer_->inputConfig_.stride; > + Rectangle &window = debayer_->window_; > + unsigned int yEnd = yEnd_; > /* Holds [0] previous- [1] current- [2] next-line */ > const uint8_t *linePointers[3]; > > - /* Adjust src to top left corner of the window */ > - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; > - > /* [x] becomes [x - 1] after initial shiftLinePointers() call */ > - if (window_.y) { > - linePointers[1] = src - inputConfig_.stride; /* previous-line */ > + if (window.y + yStart_) { > + linePointers[1] = src - inputStride; /* previous-line */ > linePointers[2] = src; > } else { > - /* window_.y == 0, use the next line as prev line */ > - linePointers[1] = src + inputConfig_.stride; > + /* Top line, use the next line as prev line */ > + linePointers[1] = src + inputStride; > linePointers[2] = src; > + } > + > + if (window.y == 0 && yEnd_ == window.height) { > /* > * Last 2 lines also need special handling. > * (And configure() ensures that yEnd >= 2.) 
> @@ -669,83 +745,93 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > > setupInputMemcpy(linePointers); > > - for (unsigned int y = 0; y < yEnd; y += 2) { > + /* > + * Note y is the line-number *inside* the window, since stats_' window > + * is the stats window inside/relative to the debayer window. IOW for > + * single thread rendering y goes from 0 to window.height. > + */ > + for (unsigned int y = yStart_; y < yEnd; y += 2) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - stats_->processLine0(frame, y, linePointers); > - (this->*debayer0_)(dst, linePointers); > - src += inputConfig_.stride; > - dst += outputConfig_.stride; > + debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_); > + debayer_->debayer0(dst, linePointers); > + src += inputStride; > + dst += outputStride; > > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - (this->*debayer1_)(dst, linePointers); > - src += inputConfig_.stride; > - dst += outputConfig_.stride; > + debayer_->debayer1(dst, linePointers); > + src += inputStride; > + dst += outputStride; > } > > - if (window_.y == 0) { > + if (window.y == 0 && yEnd_ == window.height) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - stats_->processLine0(frame, yEnd, linePointers); > - (this->*debayer0_)(dst, linePointers); > - src += inputConfig_.stride; > - dst += outputConfig_.stride; > + debayer_->stats_->processLine0(frame, yEnd, linePointers, threadIndex_); > + debayer_->debayer0(dst, linePointers); > + src += inputStride; > + dst += outputStride; > > shiftLinePointers(linePointers, src); > /* next line may point outside of src, use prev. 
*/ > linePointers[2] = linePointers[0]; > - (this->*debayer1_)(dst, linePointers); > - src += inputConfig_.stride; > - dst += outputConfig_.stride; > + debayer_->debayer1(dst, linePointers); > + src += inputStride; > + dst += outputStride; > } > } > > -void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > +void DebayerCpuThread::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > { > + unsigned int outputStride = debayer_->outputConfig_.stride; > + unsigned int inputStride = debayer_->inputConfig_.stride; > + > /* > * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line > * [3] 1-line-down [4] 2-lines-down. > */ > const uint8_t *linePointers[5]; > > - /* Adjust src to top left corner of the window */ > - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; > - > /* [x] becomes [x - 1] after initial shiftLinePointers() call */ > - linePointers[1] = src - 2 * inputConfig_.stride; > - linePointers[2] = src - inputConfig_.stride; > + linePointers[1] = src - 2 * inputStride; > + linePointers[2] = src - inputStride; > linePointers[3] = src; > - linePointers[4] = src + inputConfig_.stride; > + linePointers[4] = src + inputStride; > > setupInputMemcpy(linePointers); > > - for (unsigned int y = 0; y < window_.height; y += 4) { > + /* > + * Note y is the line-number *inside* the window, since stats_' window > + * is the stats window inside/relative to the debayer window. IOW for > + * single thread rendering y goes from 0 to window.height. 
> + */ > + for (unsigned int y = yStart_; y < yEnd_; y += 4) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - stats_->processLine0(frame, y, linePointers); > - (this->*debayer0_)(dst, linePointers); > - src += inputConfig_.stride; > - dst += outputConfig_.stride; > + debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_); > + debayer_->debayer0(dst, linePointers); > + src += inputStride; > + dst += outputStride; > > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - (this->*debayer1_)(dst, linePointers); > - src += inputConfig_.stride; > - dst += outputConfig_.stride; > + debayer_->debayer1(dst, linePointers); > + src += inputStride; > + dst += outputStride; > > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - stats_->processLine2(frame, y, linePointers); > - (this->*debayer2_)(dst, linePointers); > - src += inputConfig_.stride; > - dst += outputConfig_.stride; > + debayer_->stats_->processLine2(frame, y, linePointers, threadIndex_); > + debayer_->debayer2(dst, linePointers); > + src += inputStride; > + dst += outputStride; > > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - (this->*debayer3_)(dst, linePointers); > - src += inputConfig_.stride; > - dst += outputConfig_.stride; > + debayer_->debayer3(dst, linePointers); > + src += inputStride; > + dst += outputStride; > } > } > > @@ -867,10 +953,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > > stats_->startFrame(frame); > > - if (inputConfig_.patternSize.height == 2) > - process2(frame, in.planes()[0].data(), out.planes()[0].data()); > - else > - process4(frame, in.planes()[0].data(), out.planes()[0].data()); > + threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); > > metadata.planes()[0].bytesused = out.planes()[0].size(); > > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > index 
7a6517462..7196dcdd0 100644 > --- a/src/libcamera/software_isp/debayer_cpu.h > +++ b/src/libcamera/software_isp/debayer_cpu.h > @@ -26,6 +26,7 @@ > > namespace libcamera { > > +class DebayerCpuThread; > class DebayerCpu : public Debayer > { > public: > @@ -44,6 +45,8 @@ public: > const SharedFD &getStatsFD() { return stats_->getStatsFD(); } > > private: > + friend class DebayerCpuThread; > + > /** > * \brief Called to debayer 1 line of Bayer input data to output format > * \param[out] dst Pointer to the start of the output line to write > @@ -74,6 +77,11 @@ private: > */ > using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); > > + void debayer0(uint8_t *dst, const uint8_t *src[]) { (this->*debayer0_)(dst, src); } > + void debayer1(uint8_t *dst, const uint8_t *src[]) { (this->*debayer1_)(dst, src); } > + void debayer2(uint8_t *dst, const uint8_t *src[]) { (this->*debayer2_)(dst, src); } > + void debayer3(uint8_t *dst, const uint8_t *src[]) { (this->*debayer3_)(dst, src); } > + > /* 8-bit raw bayer format */ > template<bool addAlphaByte, bool ccmEnabled> > void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]); > @@ -105,11 +113,6 @@ private: > int setDebayerFunctions(PixelFormat inputFormat, > PixelFormat outputFormat, > bool ccmEnabled); > - void setupInputMemcpy(const uint8_t *linePointers[]); > - void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); > - void memcpyNextLine(const uint8_t *linePointers[]); > - void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); > - void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); > void updateGammaTable(const DebayerParams &params); > void updateLookupTables(const DebayerParams &params); > > @@ -142,12 +145,9 @@ private: > debayerFn debayer3_; > Rectangle window_; > std::unique_ptr<SwStatsCpu> stats_; > - std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; > - unsigned int lineBufferLength_; > - unsigned int lineBufferPadding_; > - unsigned int 
lineBufferIndex_; > unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ > - bool enableInputMemcpy_; > + > + std::vector<DebayerCpuThread *>threads_; > }; > > } /* namespace libcamera */
diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp index e7b012105..122bfbb05 100644 --- a/src/libcamera/software_isp/debayer_cpu.cpp +++ b/src/libcamera/software_isp/debayer_cpu.cpp @@ -27,6 +27,38 @@ namespace libcamera { +class DebayerCpuThread : public Object +{ +public: + DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, + bool enableInputMemcpy); + + void configure(unsigned int yStart, unsigned int yEnd); + void setupInputMemcpy(const uint8_t *linePointers[]); + void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); + void memcpyNextLine(const uint8_t *linePointers[]); + void process(uint32_t frame, const uint8_t *src, uint8_t *dst); + void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); + void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); + + DebayerCpu *debayer_; + unsigned int threadIndex_; + unsigned int yStart_; + unsigned int yEnd_; + unsigned int lineBufferLength_; + unsigned int lineBufferPadding_; + unsigned int lineBufferIndex_; + std::vector<uint8_t> lineBuffers_[DebayerCpu::kMaxLineBuffers]; + bool enableInputMemcpy_; +}; + +DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, + bool enableInputMemcpy) + : debayer_(debayer), threadIndex_(threadIndex), + enableInputMemcpy_(enableInputMemcpy) +{ +} + /** * \class DebayerCpu * \brief Class for debayering on the CPU @@ -53,8 +85,14 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat * \todo Make memcpy automatic based on runtime detection of platform * capabilities. 
*/ - enableInputMemcpy_ = + bool enableInputMemcpy = configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); + + /* Just one thread object for now, which will be called inline rather than async */ + threads_.resize(1); + + for (unsigned int i = 0; i < threads_.size(); i++) + threads_[i] = new DebayerCpuThread(this, i, enableInputMemcpy); } DebayerCpu::~DebayerCpu() = default; @@ -484,7 +522,7 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0) return -EINVAL; - if (stats_->configure(inputCfg) != 0) + if (stats_->configure(inputCfg, threads_.size()) != 0) return -EINVAL; const Size &statsPatternSize = stats_->patternSize(); @@ -548,17 +586,36 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, */ stats_->setWindow(Rectangle(window_.size())); + unsigned int yStart = 0; + unsigned int linesPerThread = (window_.height / threads_.size()) & + ~(inputConfig_.patternSize.height - 1); + unsigned int i; + + for (i = 0; i < (threads_.size() - 1); i++) { + threads_[i]->configure(yStart, yStart + linesPerThread); + yStart += linesPerThread; + } + threads_[i]->configure(yStart, window_.height); + + return 0; +} + +void DebayerCpuThread::configure(unsigned int yStart, unsigned int yEnd) +{ + Debayer::DebayerInputConfig &inputConfig = debayer_->inputConfig_; + + yStart_ = yStart; + yEnd_ = yEnd; + /* pad with patternSize.Width on both left and right side */ - lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8; - lineBufferLength_ = window_.width * inputConfig_.bpp / 8 + + lineBufferPadding_ = inputConfig.patternSize.width * inputConfig.bpp / 8; + lineBufferLength_ = debayer_->window_.width * inputConfig.bpp / 8 + 2 * lineBufferPadding_; if (enableInputMemcpy_) { - for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++) + for (unsigned int i = 0; i <= inputConfig.patternSize.height; i++) lineBuffers_[i].resize(lineBufferLength_); } 
- - return 0; } /* @@ -599,9 +656,9 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size return std::make_tuple(stride, stride * size.height); } -void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) +void DebayerCpuThread::setupInputMemcpy(const uint8_t *linePointers[]) { - const unsigned int patternHeight = inputConfig_.patternSize.height; + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; if (!enableInputMemcpy_) return; @@ -617,20 +674,20 @@ void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) lineBufferIndex_ = patternHeight; } -void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src) +void DebayerCpuThread::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src) { - const unsigned int patternHeight = inputConfig_.patternSize.height; + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; for (unsigned int i = 0; i < patternHeight; i++) linePointers[i] = linePointers[i + 1]; - linePointers[patternHeight] = src + - (patternHeight / 2) * (int)inputConfig_.stride; + linePointers[patternHeight] = + src + (patternHeight / 2) * (int)debayer_->inputConfig_.stride; } -void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[]) +void DebayerCpuThread::memcpyNextLine(const uint8_t *linePointers[]) { - const unsigned int patternHeight = inputConfig_.patternSize.height; + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; if (!enableInputMemcpy_) return; @@ -643,23 +700,42 @@ void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[]) lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1); } -void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) +void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) { - unsigned int yEnd = window_.height; + Rectangle &window = debayer_->window_; + + /* Adjust src to top left corner of the window */ + 
src += (window.y + yStart_) * debayer_->inputConfig_.stride + + window.x * debayer_->inputConfig_.bpp / 8; + /* Adjust dst for yStart_ */ + dst += yStart_ * debayer_->outputConfig_.stride; + + if (debayer_->inputConfig_.patternSize.height == 2) + process2(frame, src, dst); + else + process4(frame, src, dst); +} + +void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) +{ + unsigned int outputStride = debayer_->outputConfig_.stride; + unsigned int inputStride = debayer_->inputConfig_.stride; + Rectangle &window = debayer_->window_; + unsigned int yEnd = yEnd_; /* Holds [0] previous- [1] current- [2] next-line */ const uint8_t *linePointers[3]; - /* Adjust src to top left corner of the window */ - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; - /* [x] becomes [x - 1] after initial shiftLinePointers() call */ - if (window_.y) { - linePointers[1] = src - inputConfig_.stride; /* previous-line */ + if (window.y + yStart_) { + linePointers[1] = src - inputStride; /* previous-line */ linePointers[2] = src; } else { - /* window_.y == 0, use the next line as prev line */ - linePointers[1] = src + inputConfig_.stride; + /* Top line, use the next line as prev line */ + linePointers[1] = src + inputStride; linePointers[2] = src; + } + + if (window.y == 0 && yEnd_ == window.height) { /* * Last 2 lines also need special handling. * (And configure() ensures that yEnd >= 2.) @@ -669,83 +745,93 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) setupInputMemcpy(linePointers); - for (unsigned int y = 0; y < yEnd; y += 2) { + /* + * Note y is the line-number *inside* the window, since stats_' window + * is the stats window inside/relative to the debayer window. IOW for + * single thread rendering y goes from 0 to window.height. 
+ */ + for (unsigned int y = yStart_; y < yEnd; y += 2) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, y, linePointers); - (this->*debayer0_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_); + debayer_->debayer0(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - (this->*debayer1_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->debayer1(dst, linePointers); + src += inputStride; + dst += outputStride; } - if (window_.y == 0) { + if (window.y == 0 && yEnd_ == window.height) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, yEnd, linePointers); - (this->*debayer0_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->stats_->processLine0(frame, yEnd, linePointers, threadIndex_); + debayer_->debayer0(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); /* next line may point outside of src, use prev. */ linePointers[2] = linePointers[0]; - (this->*debayer1_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->debayer1(dst, linePointers); + src += inputStride; + dst += outputStride; } } -void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) +void DebayerCpuThread::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) { + unsigned int outputStride = debayer_->outputConfig_.stride; + unsigned int inputStride = debayer_->inputConfig_.stride; + /* * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line * [3] 1-line-down [4] 2-lines-down. 
*/ const uint8_t *linePointers[5]; - /* Adjust src to top left corner of the window */ - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; - /* [x] becomes [x - 1] after initial shiftLinePointers() call */ - linePointers[1] = src - 2 * inputConfig_.stride; - linePointers[2] = src - inputConfig_.stride; + linePointers[1] = src - 2 * inputStride; + linePointers[2] = src - inputStride; linePointers[3] = src; - linePointers[4] = src + inputConfig_.stride; + linePointers[4] = src + inputStride; setupInputMemcpy(linePointers); - for (unsigned int y = 0; y < window_.height; y += 4) { + /* + * Note y is the line-number *inside* the window, since stats_' window + * is the stats window inside/relative to the debayer window. IOW for + * single thread rendering y goes from 0 to window.height. + */ + for (unsigned int y = yStart_; y < yEnd_; y += 4) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, y, linePointers); - (this->*debayer0_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_); + debayer_->debayer0(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - (this->*debayer1_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->debayer1(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine2(frame, y, linePointers); - (this->*debayer2_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->stats_->processLine2(frame, y, linePointers, threadIndex_); + debayer_->debayer2(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - (this->*debayer3_)(dst, linePointers); - src 
+= inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->debayer3(dst, linePointers); + src += inputStride; + dst += outputStride; } } @@ -867,10 +953,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output stats_->startFrame(frame); - if (inputConfig_.patternSize.height == 2) - process2(frame, in.planes()[0].data(), out.planes()[0].data()); - else - process4(frame, in.planes()[0].data(), out.planes()[0].data()); + threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); metadata.planes()[0].bytesused = out.planes()[0].size(); diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h index 7a6517462..7196dcdd0 100644 --- a/src/libcamera/software_isp/debayer_cpu.h +++ b/src/libcamera/software_isp/debayer_cpu.h @@ -26,6 +26,7 @@ namespace libcamera { +class DebayerCpuThread; class DebayerCpu : public Debayer { public: @@ -44,6 +45,8 @@ public: const SharedFD &getStatsFD() { return stats_->getStatsFD(); } private: + friend class DebayerCpuThread; + /** * \brief Called to debayer 1 line of Bayer input data to output format * \param[out] dst Pointer to the start of the output line to write @@ -74,6 +77,11 @@ private: */ using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); + void debayer0(uint8_t *dst, const uint8_t *src[]) { (this->*debayer0_)(dst, src); } + void debayer1(uint8_t *dst, const uint8_t *src[]) { (this->*debayer1_)(dst, src); } + void debayer2(uint8_t *dst, const uint8_t *src[]) { (this->*debayer2_)(dst, src); } + void debayer3(uint8_t *dst, const uint8_t *src[]) { (this->*debayer3_)(dst, src); } + /* 8-bit raw bayer format */ template<bool addAlphaByte, bool ccmEnabled> void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]); @@ -105,11 +113,6 @@ private: int setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputFormat, bool ccmEnabled); - void setupInputMemcpy(const uint8_t *linePointers[]); - void shiftLinePointers(const 
uint8_t *linePointers[], const uint8_t *src); - void memcpyNextLine(const uint8_t *linePointers[]); - void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); - void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); void updateGammaTable(const DebayerParams &params); void updateLookupTables(const DebayerParams &params); @@ -142,12 +145,9 @@ private: debayerFn debayer3_; Rectangle window_; std::unique_ptr<SwStatsCpu> stats_; - std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; - unsigned int lineBufferLength_; - unsigned int lineBufferPadding_; - unsigned int lineBufferIndex_; unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ - bool enableInputMemcpy_; + + std::vector<DebayerCpuThread *>threads_; }; } /* namespace libcamera */
Add a DebayerCpuThread class and use this in the inner render loop. This contains data which needs to be separate per thread. This is a preparation patch for making DebayerCpu support multi-threading. Benchmarking on the Arduino Uno-Q with a weak CPU which is good for performance testing, shows 146-147ms per 3272x2464 frame both before and after this change, with things maybe being 0.5 ms slower after this change. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- Changes in v2: - Replace the DebayerCpuThreadData struct from v1 with a DebayerCpuThread class, derived from Object to allow calling invokeMethod for thread re-use in followup patches - As part of this also move a bunch of methods which primarily deal with per thread data: setupInputMemcpy(), shiftLinePointers(), memcpyNextLine(), process*() to the new DebayerCpuThread class --- src/libcamera/software_isp/debayer_cpu.cpp | 215 ++++++++++++++------- src/libcamera/software_isp/debayer_cpu.h | 20 +- 2 files changed, 159 insertions(+), 76 deletions(-)