| Message ID | 20260216190204.106922-3-johannes.goede@oss.qualcomm.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series |
|
| Related | show |
Hi Hans, thank you for the patch. Looks basically correct to me, some comments below. Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > Add a DebayerCpuThreadData data struct and use this in the inner render > loop. This contains data which needs to be separate per thread. > > This is a preparation patch for making DebayerCpu support multi-threading. > > Note this passed the DebayerCpuThreadData with a pointer rather then by s/then/than/ > reference, because passing by reference is not supported for functions > passed as the thread function to std::thread(). > > Benchmarking on the Uno-Q with a weak CPU which is good for performance > testing, shows 146-147ms per 3272x2464 frame both before and after this > change, with things maybe being 0.5 ms slower after this change. > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > src/libcamera/software_isp/debayer_cpu.cpp | 90 ++++++++++++++-------- > src/libcamera/software_isp/debayer_cpu.h | 30 +++++--- > 2 files changed, 77 insertions(+), 43 deletions(-) > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > index 97c1959a..e1d3c164 100644 > --- a/src/libcamera/software_isp/debayer_cpu.cpp > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > @@ -41,7 +41,7 @@ namespace libcamera { > * \param[in] configuration The global configuration > */ > DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration) > - : Debayer(configuration), stats_(std::move(stats)) > + : Debayer(configuration), stats_(std::move(stats)), threadCount_(1) > { > /* > * Reading from uncached buffers may be very slow. > @@ -555,8 +555,9 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, > 2 * lineBufferPadding_; > > if (enableInputMemcpy_) { > - for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++) > - lineBuffers_[i].resize(lineBufferLength_); > + for (unsigned int i = 0; i < threadCount_; i++) > + for (unsigned int j = 0; j <= inputConfig_.patternSize.height; j++) > + threadData_[i].lineBuffers[j].resize(lineBufferLength_); > } > > return 0; > @@ -600,7 +601,8 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size > return std::make_tuple(stride, stride * size.height); > } > > -void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) > +void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[], > + DebayerCpuThreadData *threadData) > { > const unsigned int patternHeight = inputConfig_.patternSize.height; > > @@ -608,14 +610,14 @@ void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) > return; > > for (unsigned int i = 0; i < patternHeight; i++) { > - memcpy(lineBuffers_[i].data(), > + memcpy(threadData->lineBuffers[i].data(), > linePointers[i + 1] - lineBufferPadding_, > lineBufferLength_); > - linePointers[i + 1] = lineBuffers_[i].data() + lineBufferPadding_; > + linePointers[i + 1] = threadData->lineBuffers[i].data() + lineBufferPadding_; > } > > /* Point lineBufferIndex_ to first unused lineBuffer */ > - lineBufferIndex_ = patternHeight; > + threadData->lineBufferIndex = patternHeight; > } > > void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src) > @@ -629,66 +631,78 @@ void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t > (patternHeight / 2) * (int)inputConfig_.stride; > } > > -void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[]) > +void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[], > + DebayerCpuThreadData *threadData) > { > const unsigned int patternHeight = inputConfig_.patternSize.height; > > if (!enableInputMemcpy_) > return; > > - memcpy(lineBuffers_[lineBufferIndex_].data(), > + memcpy(threadData->lineBuffers[threadData->lineBufferIndex].data(), > linePointers[patternHeight] - lineBufferPadding_, > lineBufferLength_); > - linePointers[patternHeight] = lineBuffers_[lineBufferIndex_].data() + lineBufferPadding_; > + linePointers[patternHeight] = threadData->lineBuffers[threadData->lineBufferIndex].data() + lineBufferPadding_; > > - lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1); > + threadData->lineBufferIndex = (threadData->lineBufferIndex + 1) % (patternHeight + 1); > } > > -void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > +void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, > + DebayerCpuThreadData *threadData) > { > - unsigned int yEnd = window_.height; > /* Holds [0] previous- [1] current- [2] next-line */ > const uint8_t *linePointers[3]; > > /* Adjust src to top left corner of the window */ > - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; > + src += (window_.y + threadData->yStart) * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; > > /* [x] becomes [x - 1] after initial shiftLinePointers() call */ > - if (window_.y) { > + if (window_.y + threadData->yStart) { > linePointers[1] = src - inputConfig_.stride; /* previous-line */ > linePointers[2] = src; > } else { > - /* window_.y == 0, use the next line as prev line */ > + /* Top line, use the next line as prev line */ > linePointers[1] = src + inputConfig_.stride; > linePointers[2] = src; > + } > + > + if (window_.y == 0 && threadData->yEnd == window_.height) { > /* > * Last 2 lines also need special handling. > * (And configure() ensures that yEnd >= 2.) > */ > - yEnd -= 2; > + threadData->yEnd -= 2; > + threadData->processLastLinesSeperately = true; s/Seperately/Separately/ > + } else { > + threadData->processLastLinesSeperately = false; > } > > - setupInputMemcpy(linePointers); > + setupInputMemcpy(linePointers, threadData); > > - for (unsigned int y = 0; y < yEnd; y += 2) { > + /* > + * Note y is the line-number *inside* the window, since stats_' window > + * is the stats window inside/relative to the debayer window. IOW for > + * single thread rendering y goes from 0 - window_.height. > + */ > + for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) { > shiftLinePointers(linePointers, src); > - memcpyNextLine(linePointers); > + memcpyNextLine(linePointers, threadData); > stats_->processLine0(frame, y, linePointers, &statsBuffer_); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > > shiftLinePointers(linePointers, src); > - memcpyNextLine(linePointers); > + memcpyNextLine(linePointers, threadData); > (this->*debayer1_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > } > > - if (window_.y == 0) { > + if (threadData->processLastLinesSeperately) { > shiftLinePointers(linePointers, src); > - memcpyNextLine(linePointers); > - stats_->processLine0(frame, yEnd, linePointers, &statsBuffer_); > + memcpyNextLine(linePointers, threadData); > + stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -702,7 +716,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > } > } > > -void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > +void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, > + DebayerCpuThreadData *threadData) > { > /* > * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line > @@ -711,7 +726,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > const uint8_t *linePointers[5]; > > /* Adjust src to top left corner of the window */ > - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; > + src += (window_.y + threadData->yStart) * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; > > /* [x] becomes [x - 1] after initial shiftLinePointers() call */ > linePointers[1] = src - 2 * inputConfig_.stride; > @@ -719,31 +734,36 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > linePointers[3] = src; > linePointers[4] = src + inputConfig_.stride; > > - setupInputMemcpy(linePointers); > + setupInputMemcpy(linePointers, threadData); > > - for (unsigned int y = 0; y < window_.height; y += 4) { > + /* > + * Note y is the line-number *inside* the window, since stats_' window > + * is the stats window inside/relative to the debayer window. IOW for > + * single thread rendering y goes from 0 - window_.height. s/-/to/ (The same in process4.) > + */ > + for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) { > shiftLinePointers(linePointers, src); > - memcpyNextLine(linePointers); > + memcpyNextLine(linePointers, threadData); > stats_->processLine0(frame, y, linePointers, &statsBuffer_); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > > shiftLinePointers(linePointers, src); > - memcpyNextLine(linePointers); > + memcpyNextLine(linePointers, threadData); > (this->*debayer1_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > > shiftLinePointers(linePointers, src); > - memcpyNextLine(linePointers); > + memcpyNextLine(linePointers, threadData); > stats_->processLine2(frame, y, linePointers, &statsBuffer_); > (this->*debayer2_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > > shiftLinePointers(linePointers, src); > - memcpyNextLine(linePointers); > + memcpyNextLine(linePointers, threadData); > (this->*debayer3_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -868,10 +888,12 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > > stats_->startFrame(frame, &statsBuffer_, 1); > > + threadData_[0].yStart = 0; > + threadData_[0].yEnd = window_.height; > if (inputConfig_.patternSize.height == 2) > - process2(frame, in.planes()[0].data(), out.planes()[0].data()); > + process2(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); > else > - process4(frame, in.planes()[0].data(), out.planes()[0].data()); > + process4(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); > > metadata.planes()[0].bytesused = out.planes()[0].size(); > > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > index 8abf5168..800b018c 100644 > --- a/src/libcamera/software_isp/debayer_cpu.h > +++ b/src/libcamera/software_isp/debayer_cpu.h > @@ -74,6 +74,19 @@ private: > */ > using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); > > + /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ Not a matter of this patch but I wonder why 5? > + static constexpr unsigned int kMaxLineBuffers = 5; > + > + /* Per render thread data */ > + struct DebayerCpuThreadData { > + unsigned int yStart; > + unsigned int yEnd; > + std::vector<uint8_t> lineBuffers[kMaxLineBuffers]; > + unsigned int lineBufferIndex; > + /* Stored here to avoid causing register pressure in inner loop */ What inner loop? > + bool processLastLinesSeperately; > + }; > + > /* 8-bit raw bayer format */ > template<bool addAlphaByte, bool ccmEnabled> > void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]); > @@ -105,17 +118,14 @@ private: > int setDebayerFunctions(PixelFormat inputFormat, > PixelFormat outputFormat, > bool ccmEnabled); > - void setupInputMemcpy(const uint8_t *linePointers[]); > + void setupInputMemcpy(const uint8_t *linePointers[], DebayerCpuThreadData *threadData); > void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); > - void memcpyNextLine(const uint8_t *linePointers[]); > - void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); > - void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); > + void memcpyNextLine(const uint8_t *linePointers[], DebayerCpuThreadData *threadData); > + void process2(uint32_t frame, const uint8_t *src, uint8_t *dst, DebayerCpuThreadData *threadData); > + void process4(uint32_t frame, const uint8_t *src, uint8_t *dst, DebayerCpuThreadData *threadData); > void updateGammaTable(const DebayerParams ¶ms); > void updateLookupTables(const DebayerParams ¶ms); > > - /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ > - static constexpr unsigned int kMaxLineBuffers = 5; > - > static constexpr unsigned int kRGBLookupSize = 256; > static constexpr unsigned int kGammaLookupSize = 1024; > struct CcmColumn { > @@ -143,12 +153,14 @@ private: > debayerFn debayer3_; > Rectangle window_; > std::unique_ptr<SwStatsCpu> stats_; > - std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; > unsigned int lineBufferLength_; > unsigned int lineBufferPadding_; > - unsigned int lineBufferIndex_; > unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ > bool enableInputMemcpy_; > + > + static constexpr unsigned int kMaxThreads = 4; > + struct DebayerCpuThreadData threadData_[kMaxThreads]; I think std::array is preferred. > + unsigned int threadCount_; > }; > > } /* namespace libcamera */
Hi, On 17-Feb-26 10:22 PM, Milan Zamazal wrote: > Hi Hans, > > thank you for the patch. > > Looks basically correct to me, some comments below. > > Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > >> Add a DebayerCpuThreadData data struct and use this in the inner render >> loop. This contains data which needs to be separate per thread. >> >> This is a preparation patch for making DebayerCpu support multi-threading. >> >> Note this passed the DebayerCpuThreadData with a pointer rather then by > > s/then/than/ No longer relevant for v2, but I did end up doing a couple other s/then/than/ fixes for v2. ... >> + if (window_.y == 0 && threadData->yEnd == window_.height) { >> /* >> * Last 2 lines also need special handling. >> * (And configure() ensures that yEnd >= 2.) >> */ >> - yEnd -= 2; >> + threadData->yEnd -= 2; >> + threadData->processLastLinesSeperately = true; > > s/Seperately/Separately/ No longer relevant for v2 (which is significantly reworked). ... >> - for (unsigned int y = 0; y < window_.height; y += 4) { >> + /* >> + * Note y is the line-number *inside* the window, since stats_' window >> + * is the stats window inside/relative to the debayer window. IOW for >> + * single thread rendering y goes from 0 - window_.height. > > s/-/to/ > > (The same in process4.) Ack both fixed for v2. ... >> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h >> index 8abf5168..800b018c 100644 >> --- a/src/libcamera/software_isp/debayer_cpu.h >> +++ b/src/libcamera/software_isp/debayer_cpu.h >> @@ -74,6 +74,19 @@ private: >> */ >> using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); >> >> + /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ > > Not a matter of this patch but I wonder why 5? this is for a mixed RGB-IR pattern which requires 2 lines above and below the current line to get all colors. ATM the whole process4() support is unused I've out of tree patches for this here: https://github.com/jwrdegoede/libcamera/commits/ov01a1s/ but those are waiting for the new bayer-pattern control to allow communicating the special IGIG_GBGR_IGIG_GRGB bayer pattern to userspace. ... >> + >> + static constexpr unsigned int kMaxThreads = 4; >> + struct DebayerCpuThreadData threadData_[kMaxThreads]; > > I think std::array is preferred. Ack. Regards, Hans
diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp index 97c1959a..e1d3c164 100644 --- a/src/libcamera/software_isp/debayer_cpu.cpp +++ b/src/libcamera/software_isp/debayer_cpu.cpp @@ -41,7 +41,7 @@ namespace libcamera { * \param[in] configuration The global configuration */ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration) - : Debayer(configuration), stats_(std::move(stats)) + : Debayer(configuration), stats_(std::move(stats)), threadCount_(1) { /* * Reading from uncached buffers may be very slow. @@ -555,8 +555,9 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, 2 * lineBufferPadding_; if (enableInputMemcpy_) { - for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++) - lineBuffers_[i].resize(lineBufferLength_); + for (unsigned int i = 0; i < threadCount_; i++) + for (unsigned int j = 0; j <= inputConfig_.patternSize.height; j++) + threadData_[i].lineBuffers[j].resize(lineBufferLength_); } return 0; @@ -600,7 +601,8 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size return std::make_tuple(stride, stride * size.height); } -void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) +void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[], + DebayerCpuThreadData *threadData) { const unsigned int patternHeight = inputConfig_.patternSize.height; @@ -608,14 +610,14 @@ void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) return; for (unsigned int i = 0; i < patternHeight; i++) { - memcpy(lineBuffers_[i].data(), + memcpy(threadData->lineBuffers[i].data(), linePointers[i + 1] - lineBufferPadding_, lineBufferLength_); - linePointers[i + 1] = lineBuffers_[i].data() + lineBufferPadding_; + linePointers[i + 1] = threadData->lineBuffers[i].data() + lineBufferPadding_; } /* Point lineBufferIndex_ to first unused lineBuffer */ - lineBufferIndex_ = patternHeight; + threadData->lineBufferIndex = patternHeight; } void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src) @@ -629,66 +631,78 @@ void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t (patternHeight / 2) * (int)inputConfig_.stride; } -void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[]) +void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[], + DebayerCpuThreadData *threadData) { const unsigned int patternHeight = inputConfig_.patternSize.height; if (!enableInputMemcpy_) return; - memcpy(lineBuffers_[lineBufferIndex_].data(), + memcpy(threadData->lineBuffers[threadData->lineBufferIndex].data(), linePointers[patternHeight] - lineBufferPadding_, lineBufferLength_); - linePointers[patternHeight] = lineBuffers_[lineBufferIndex_].data() + lineBufferPadding_; + linePointers[patternHeight] = threadData->lineBuffers[threadData->lineBufferIndex].data() + lineBufferPadding_; - lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1); + threadData->lineBufferIndex = (threadData->lineBufferIndex + 1) % (patternHeight + 1); } -void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) +void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, + DebayerCpuThreadData *threadData) { - unsigned int yEnd = window_.height; /* Holds [0] previous- [1] current- [2] next-line */ const uint8_t *linePointers[3]; /* Adjust src to top left corner of the window */ - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; + src += (window_.y + threadData->yStart) * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; /* [x] becomes [x - 1] after initial shiftLinePointers() call */ - if (window_.y) { + if (window_.y + threadData->yStart) { linePointers[1] = src - inputConfig_.stride; /* previous-line */ linePointers[2] = src; } else { - /* window_.y == 0, use the next line as prev line */ + /* Top line, use the next line as prev line */ linePointers[1] = src + inputConfig_.stride; linePointers[2] = src; + } + + if (window_.y == 0 && threadData->yEnd == window_.height) { /* * Last 2 lines also need special handling. * (And configure() ensures that yEnd >= 2.) */ - yEnd -= 2; + threadData->yEnd -= 2; + threadData->processLastLinesSeperately = true; + } else { + threadData->processLastLinesSeperately = false; } - setupInputMemcpy(linePointers); + setupInputMemcpy(linePointers, threadData); - for (unsigned int y = 0; y < yEnd; y += 2) { + /* + * Note y is the line-number *inside* the window, since stats_' window + * is the stats window inside/relative to the debayer window. IOW for + * single thread rendering y goes from 0 - window_.height. + */ + for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) { shiftLinePointers(linePointers, src); - memcpyNextLine(linePointers); + memcpyNextLine(linePointers, threadData); stats_->processLine0(frame, y, linePointers, &statsBuffer_); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; shiftLinePointers(linePointers, src); - memcpyNextLine(linePointers); + memcpyNextLine(linePointers, threadData); (this->*debayer1_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; } - if (window_.y == 0) { + if (threadData->processLastLinesSeperately) { shiftLinePointers(linePointers, src); - memcpyNextLine(linePointers); - stats_->processLine0(frame, yEnd, linePointers, &statsBuffer_); + memcpyNextLine(linePointers, threadData); + stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -702,7 +716,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) } } -void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) +void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, + DebayerCpuThreadData *threadData) { /* * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line @@ -711,7 +726,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) const uint8_t *linePointers[5]; /* Adjust src to top left corner of the window */ - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; + src += (window_.y + threadData->yStart) * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; /* [x] becomes [x - 1] after initial shiftLinePointers() call */ linePointers[1] = src - 2 * inputConfig_.stride; @@ -719,31 +734,36 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) linePointers[3] = src; linePointers[4] = src + inputConfig_.stride; - setupInputMemcpy(linePointers); + setupInputMemcpy(linePointers, threadData); - for (unsigned int y = 0; y < window_.height; y += 4) { + /* + * Note y is the line-number *inside* the window, since stats_' window + * is the stats window inside/relative to the debayer window. IOW for + * single thread rendering y goes from 0 - window_.height. + */ + for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) { shiftLinePointers(linePointers, src); - memcpyNextLine(linePointers); + memcpyNextLine(linePointers, threadData); stats_->processLine0(frame, y, linePointers, &statsBuffer_); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; shiftLinePointers(linePointers, src); - memcpyNextLine(linePointers); + memcpyNextLine(linePointers, threadData); (this->*debayer1_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; shiftLinePointers(linePointers, src); - memcpyNextLine(linePointers); + memcpyNextLine(linePointers, threadData); stats_->processLine2(frame, y, linePointers, &statsBuffer_); (this->*debayer2_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; shiftLinePointers(linePointers, src); - memcpyNextLine(linePointers); + memcpyNextLine(linePointers, threadData); (this->*debayer3_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -868,10 +888,12 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output stats_->startFrame(frame, &statsBuffer_, 1); + threadData_[0].yStart = 0; + threadData_[0].yEnd = window_.height; if (inputConfig_.patternSize.height == 2) - process2(frame, in.planes()[0].data(), out.planes()[0].data()); + process2(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); else - process4(frame, in.planes()[0].data(), out.planes()[0].data()); + process4(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); metadata.planes()[0].bytesused = out.planes()[0].size(); diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h index 8abf5168..800b018c 100644 --- a/src/libcamera/software_isp/debayer_cpu.h +++ b/src/libcamera/software_isp/debayer_cpu.h @@ -74,6 +74,19 @@ private: */ using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); + /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ + static constexpr unsigned int kMaxLineBuffers = 5; + + /* Per render thread data */ + struct DebayerCpuThreadData { + unsigned int yStart; + unsigned int yEnd; + std::vector<uint8_t> lineBuffers[kMaxLineBuffers]; + unsigned int lineBufferIndex; + /* Stored here to avoid causing register pressure in inner loop */ + bool processLastLinesSeperately; + }; + /* 8-bit raw bayer format */ template<bool addAlphaByte, bool ccmEnabled> void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]); @@ -105,17 +118,14 @@ private: int setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputFormat, bool ccmEnabled); - void setupInputMemcpy(const uint8_t *linePointers[]); + void setupInputMemcpy(const uint8_t *linePointers[], DebayerCpuThreadData *threadData); void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); - void memcpyNextLine(const uint8_t *linePointers[]); - void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); - void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); + void memcpyNextLine(const uint8_t *linePointers[], DebayerCpuThreadData *threadData); + void process2(uint32_t frame, const uint8_t *src, uint8_t *dst, DebayerCpuThreadData *threadData); + void process4(uint32_t frame, const uint8_t *src, uint8_t *dst, DebayerCpuThreadData *threadData); void updateGammaTable(const DebayerParams ¶ms); void updateLookupTables(const DebayerParams ¶ms); - /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ - static constexpr unsigned int kMaxLineBuffers = 5; - static constexpr unsigned int kRGBLookupSize = 256; static constexpr unsigned int kGammaLookupSize = 1024; struct CcmColumn { @@ -143,12 +153,14 @@ private: debayerFn debayer3_; Rectangle window_; std::unique_ptr<SwStatsCpu> stats_; - std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; unsigned int lineBufferLength_; unsigned int lineBufferPadding_; - unsigned int lineBufferIndex_; unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ bool enableInputMemcpy_; + + static constexpr unsigned int kMaxThreads = 4; + struct DebayerCpuThreadData threadData_[kMaxThreads]; + unsigned int threadCount_; }; } /* namespace libcamera */
Add a DebayerCpuThreadData data struct and use this in the inner render loop. This contains data which needs to be separate per thread. This is a preparation patch for making DebayerCpu support multi-threading. Note this passed the DebayerCpuThreadData with a pointer rather then by reference, because passing by reference is not supported for functions passed as the thread function to std::thread(). Benchmarking on the Uno-Q with a weak CPU which is good for performance testing, shows 146-147ms per 3272x2464 frame both before and after this change, with things maybe being 0.5 ms slower after this change. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- src/libcamera/software_isp/debayer_cpu.cpp | 90 ++++++++++++++-------- src/libcamera/software_isp/debayer_cpu.h | 30 +++++--- 2 files changed, 77 insertions(+), 43 deletions(-)