| Message ID | 20260304075052.11599-3-johannes.goede@oss.qualcomm.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series |
|
| Related | show |
Hi 2026. 03. 04. 8:50 keltezéssel, Hans de Goede írta: > Add a DebayerCpuThread class and use this in the inner render loop. > This contains data which needs to be separate per thread. > > This is a preparation patch for making DebayerCpu support multi-threading. > > Benchmarking on the Arduino Uno-Q with a weak CPU which is good for > performance testing, shows 146-147ms per 3272x2464 frame both before and > after this change, with things maybe being 0.5 ms slower after this change. > > Reviewed-by: Milan Zamazal <mzamazal@redhat.com> > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > Changes in v4: > - Move kMaxLineBuffers constant to DebayerCpuThread class > - Add Milan's Reviewed-by > > Changes in v3: > - Use std::unique_ptr for the DebayerCpuThread pointers > - Document new DebayerCpuThread class > - Make DebayerCpuThread inherit from both Thread and Object > > Changes in v2: > - Replace the DebayerCpuThreadData struct from v1 with a DebayerCpuThread > class, derived from Object to allow calling invokeMethod for thread re-use > in followup patches > - As part of this also move a bunch of methods which primarily deal with > per thread data: setupInputMemcpy(), shiftLinePointers(), memcpyNextLine(), > process*() to the new DebayerCpuThread class > --- > src/libcamera/software_isp/debayer_cpu.cpp | 247 +++++++++++++++------ > src/libcamera/software_isp/debayer_cpu.h | 23 +- > 2 files changed, 191 insertions(+), 79 deletions(-) > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > index e7b012105..d57d640df 100644 > --- a/src/libcamera/software_isp/debayer_cpu.cpp > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > @@ -18,6 +18,8 @@ > > #include <linux/dma-buf.h> > > +#include <libcamera/base/thread.h> > + > #include <libcamera/formats.h> > > #include "libcamera/internal/bayer_format.h" > @@ -27,6 +29,55 @@ > > namespace libcamera { > > +/** > + * \brief Class representing one CPU 
debayering thread > + * > + * Implementation for CPU based debayering threads. > + */ > +class DebayerCpuThread : public Thread, public Object > +{ > +public: > + DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, > + bool enableInputMemcpy); > + > + void configure(unsigned int yStart, unsigned int yEnd); > + void process(uint32_t frame, const uint8_t *src, uint8_t *dst); > + > +private: > + void setupInputMemcpy(const uint8_t *linePointers[]); > + void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); > + void memcpyNextLine(const uint8_t *linePointers[]); > + void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); > + void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); > + > + /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ > + static constexpr unsigned int kMaxLineBuffers = 5; > + > + DebayerCpu *debayer_; > + unsigned int threadIndex_; > + unsigned int yStart_; > + unsigned int yEnd_; > + unsigned int lineBufferLength_; > + unsigned int lineBufferPadding_; > + unsigned int lineBufferIndex_; > + std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; > + bool enableInputMemcpy_; > +}; > + > +/** > + * \brief Construct a DebayerCpuThread object > + * \param[in] debayer pointer back to the DebayerCpuObject this thread belongs to > + * \param[in] threadIndex 0 .. 
n thread-index value for the thread > + * \param[in] enableInputMemcpy when set copy input data to a heap buffer before use > + */ > +DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, > + bool enableInputMemcpy) > + : Thread("DebayerCpu:" + std::to_string(threadIndex)), > + debayer_(debayer), threadIndex_(threadIndex), > + enableInputMemcpy_(enableInputMemcpy) > +{ > +} > + > /** > * \class DebayerCpu > * \brief Class for debayering on the CPU > @@ -53,8 +104,14 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat > * \todo Make memcpy automatic based on runtime detection of platform > * capabilities. > */ > - enableInputMemcpy_ = > + bool enableInputMemcpy = > configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); > + > + /* Just one thread object for now, which will be called inline rather than async */ > + threads_.resize(1); > + > + for (unsigned int i = 0; i < threads_.size(); i++) > + threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy); > } > > DebayerCpu::~DebayerCpu() = default; > @@ -484,7 +541,7 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, > if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0) > return -EINVAL; > > - if (stats_->configure(inputCfg) != 0) > + if (stats_->configure(inputCfg, threads_.size()) != 0) > return -EINVAL; > > const Size &statsPatternSize = stats_->patternSize(); > @@ -548,17 +605,43 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, > */ > stats_->setWindow(Rectangle(window_.size())); > > + unsigned int yStart = 0; > + unsigned int linesPerThread = (window_.height / threads_.size()) & > + ~(inputConfig_.patternSize.height - 1); > + unsigned int i; > + > + for (i = 0; i < (threads_.size() - 1); i++) { > + threads_[i]->configure(yStart, yStart + linesPerThread); > + yStart += linesPerThread; > + } > + threads_[i]->configure(yStart, window_.height); Or possibly 
`threads_.back()->configure(...` > + > + return 0; > +} > + > +/** > + * \brief Configure thread to process a specific part of the image > + * \param[in] yStart y coordinate of first line to process > + * \param[in] yEnd y coordinate of the line at which to stop processing > + * > + * Configure the thread to process lines yStart - (yEnd - 1). This is a bit confusing in my opinion because `-` means two different things in the same sentence. Maybe "lines from yStart to yEnd - 1", or maybe "lines [yStart, yEnd)". > + */ > [...] > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > index 7a6517462..780576090 100644 > --- a/src/libcamera/software_isp/debayer_cpu.h > +++ b/src/libcamera/software_isp/debayer_cpu.h > @@ -26,6 +26,7 @@ > > namespace libcamera { > > +class DebayerCpuThread; > class DebayerCpu : public Debayer > { > public: > @@ -44,6 +45,8 @@ public: > const SharedFD &getStatsFD() { return stats_->getStatsFD(); } > > private: > + friend class DebayerCpuThread; > + > /** > * \brief Called to debayer 1 line of Bayer input data to output format > * \param[out] dst Pointer to the start of the output line to write > @@ -74,6 +77,11 @@ private: > */ > using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); > > + void debayer0(uint8_t *dst, const uint8_t *src[]) { (this->*debayer0_)(dst, src); } > + void debayer1(uint8_t *dst, const uint8_t *src[]) { (this->*debayer1_)(dst, src); } > + void debayer2(uint8_t *dst, const uint8_t *src[]) { (this->*debayer2_)(dst, src); } > + void debayer3(uint8_t *dst, const uint8_t *src[]) { (this->*debayer3_)(dst, src); } > + > /* 8-bit raw bayer format */ > template<bool addAlphaByte, bool ccmEnabled> > void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]); > @@ -105,17 +113,9 @@ private: > int setDebayerFunctions(PixelFormat inputFormat, > PixelFormat outputFormat, > bool ccmEnabled); > - void setupInputMemcpy(const uint8_t *linePointers[]); > - void 
shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); > - void memcpyNextLine(const uint8_t *linePointers[]); > - void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); > - void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); > void updateGammaTable(const DebayerParams &params); > void updateLookupTables(const DebayerParams &params); > > - /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ > - static constexpr unsigned int kMaxLineBuffers = 5; > - > static constexpr unsigned int kRGBLookupSize = 256; > static constexpr unsigned int kGammaLookupSize = 1024; > struct CcmColumn { > @@ -142,12 +142,9 @@ private: > debayerFn debayer3_; > Rectangle window_; > std::unique_ptr<SwStatsCpu> stats_; > - std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; > - unsigned int lineBufferLength_; > - unsigned int lineBufferPadding_; > - unsigned int lineBufferIndex_; > unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ > - bool enableInputMemcpy_; > + > + std::vector<std::unique_ptr<DebayerCpuThread>>threads_; Missing space before the name. Tested-by: Barnabás Pőcze <barnabas.pocze@ideasonboard.com> # ThinkPad X1 Yoga Gen 7 + ov2740 > }; > > } /* namespace libcamera */
Hi, On 4-Mar-26 14:47, Barnabás Pőcze wrote: > Hi > > 2026. 03. 04. 8:50 keltezéssel, Hans de Goede írta: >> Add a DebayerCpuThread class and use this in the inner render loop. >> This contains data which needs to be separate per thread. >> >> This is a preparation patch for making DebayerCpu support multi-threading. >> >> Benchmarking on the Arduino Uno-Q with a weak CPU which is good for >> performance testing, shows 146-147ms per 3272x2464 frame both before and >> after this change, with things maybe being 0.5 ms slower after this change. >> >> Reviewed-by: Milan Zamazal <mzamazal@redhat.com> >> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> >> --- >> Changes in v4: >> - Move kMaxLineBuffers constant to DebayerCpuThread class >> - Add Milan's Reviewed-by >> >> Changes in v3: >> - Use std::unique_ptr for the DebayerCpuThread pointers >> - Document new DebayerCpuThread class >> - Make DebayerCpuThread inherit from both Thread and Object >> >> Changes in v2: >> - Replace the DebayerCpuThreadData struct from v1 with a DebayerCpuThread >> class, derived from Object to allow calling invokeMethod for thread re-use >> in followup patches >> - As part of this also move a bunch of methods which primarily deal with >> per thread data: setupInputMemcpy(), shiftLinePointers(), memcpyNextLine(), >> process*() to the new DebayerCpuThread class >> --- >> src/libcamera/software_isp/debayer_cpu.cpp | 247 +++++++++++++++------ >> src/libcamera/software_isp/debayer_cpu.h | 23 +- >> 2 files changed, 191 insertions(+), 79 deletions(-) >> >> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp >> index e7b012105..d57d640df 100644 >> --- a/src/libcamera/software_isp/debayer_cpu.cpp >> +++ b/src/libcamera/software_isp/debayer_cpu.cpp >> @@ -18,6 +18,8 @@ >> #include <linux/dma-buf.h> >> +#include <libcamera/base/thread.h> >> + >> #include <libcamera/formats.h> >> #include "libcamera/internal/bayer_format.h" >> @@ -27,6 
+29,55 @@ >> namespace libcamera { >> +/** >> + * \brief Class representing one CPU debayering thread >> + * >> + * Implementation for CPU based debayering threads. >> + */ >> +class DebayerCpuThread : public Thread, public Object >> +{ >> +public: >> + DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, >> + bool enableInputMemcpy); >> + >> + void configure(unsigned int yStart, unsigned int yEnd); >> + void process(uint32_t frame, const uint8_t *src, uint8_t *dst); >> + >> +private: >> + void setupInputMemcpy(const uint8_t *linePointers[]); >> + void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); >> + void memcpyNextLine(const uint8_t *linePointers[]); >> + void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); >> + void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); >> + >> + /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ >> + static constexpr unsigned int kMaxLineBuffers = 5; >> + >> + DebayerCpu *debayer_; >> + unsigned int threadIndex_; >> + unsigned int yStart_; >> + unsigned int yEnd_; >> + unsigned int lineBufferLength_; >> + unsigned int lineBufferPadding_; >> + unsigned int lineBufferIndex_; >> + std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; >> + bool enableInputMemcpy_; >> +}; >> + >> +/** >> + * \brief Construct a DebayerCpuThread object >> + * \param[in] debayer pointer back to the DebayerCpuObject this thread belongs to >> + * \param[in] threadIndex 0 .. 
n thread-index value for the thread >> + * \param[in] enableInputMemcpy when set copy input data to a heap buffer before use >> + */ >> +DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, >> + bool enableInputMemcpy) >> + : Thread("DebayerCpu:" + std::to_string(threadIndex)), >> + debayer_(debayer), threadIndex_(threadIndex), >> + enableInputMemcpy_(enableInputMemcpy) >> +{ >> +} >> + >> /** >> * \class DebayerCpu >> * \brief Class for debayering on the CPU >> @@ -53,8 +104,14 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat >> * \todo Make memcpy automatic based on runtime detection of platform >> * capabilities. >> */ >> - enableInputMemcpy_ = >> + bool enableInputMemcpy = >> configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); >> + >> + /* Just one thread object for now, which will be called inline rather than async */ >> + threads_.resize(1); >> + >> + for (unsigned int i = 0; i < threads_.size(); i++) >> + threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy); >> } >> DebayerCpu::~DebayerCpu() = default; >> @@ -484,7 +541,7 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, >> if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0) >> return -EINVAL; >> - if (stats_->configure(inputCfg) != 0) >> + if (stats_->configure(inputCfg, threads_.size()) != 0) >> return -EINVAL; >> const Size &statsPatternSize = stats_->patternSize(); >> @@ -548,17 +605,43 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, >> */ >> stats_->setWindow(Rectangle(window_.size())); >> + unsigned int yStart = 0; >> + unsigned int linesPerThread = (window_.height / threads_.size()) & >> + ~(inputConfig_.patternSize.height - 1); >> + unsigned int i; >> + >> + for (i = 0; i < (threads_.size() - 1); i++) { >> + threads_[i]->configure(yStart, yStart + linesPerThread); >> + yStart += linesPerThread; >> + } >> + threads_[i]->configure(yStart, 
window_.height); > > Or possibly `threads_.back()->configure(...` I think that sticking with [i] to match the configure() calls inside the for loop is more consistent so I'm going to keep this as is. >> + >> + return 0; >> +} >> + >> +/** >> + * \brief Configure thread to process a specific part of the image >> + * \param[in] yStart y coordinate of first line to process >> + * \param[in] yEnd y coordinate of the line at which to stop processing >> + * >> + * Configure the thread to process lines yStart - (yEnd - 1). > > This is a bit confusing in my opinion because `-` means two different things > in the same sentence. Maybe "lines from yStart to yEnd - 1", or maybe > "lines [yStart, yEnd)". Ack, will fix for the next version. >> + */ >> [...] >> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h >> index 7a6517462..780576090 100644 >> --- a/src/libcamera/software_isp/debayer_cpu.h >> +++ b/src/libcamera/software_isp/debayer_cpu.h >> @@ -26,6 +26,7 @@ >> namespace libcamera { >> +class DebayerCpuThread; >> class DebayerCpu : public Debayer >> { >> public: >> @@ -44,6 +45,8 @@ public: >> const SharedFD &getStatsFD() { return stats_->getStatsFD(); } >> private: >> + friend class DebayerCpuThread; >> + >> /** >> * \brief Called to debayer 1 line of Bayer input data to output format >> * \param[out] dst Pointer to the start of the output line to write >> @@ -74,6 +77,11 @@ private: >> */ >> using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); >> + void debayer0(uint8_t *dst, const uint8_t *src[]) { (this->*debayer0_)(dst, src); } >> + void debayer1(uint8_t *dst, const uint8_t *src[]) { (this->*debayer1_)(dst, src); } >> + void debayer2(uint8_t *dst, const uint8_t *src[]) { (this->*debayer2_)(dst, src); } >> + void debayer3(uint8_t *dst, const uint8_t *src[]) { (this->*debayer3_)(dst, src); } >> + >> /* 8-bit raw bayer format */ >> template<bool addAlphaByte, bool ccmEnabled> >> void 
debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]); >> @@ -105,17 +113,9 @@ private: >> int setDebayerFunctions(PixelFormat inputFormat, >> PixelFormat outputFormat, >> bool ccmEnabled); >> - void setupInputMemcpy(const uint8_t *linePointers[]); >> - void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); >> - void memcpyNextLine(const uint8_t *linePointers[]); >> - void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); >> - void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); >> void updateGammaTable(const DebayerParams &params); >> void updateLookupTables(const DebayerParams &params); >> - /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ >> - static constexpr unsigned int kMaxLineBuffers = 5; >> - >> static constexpr unsigned int kRGBLookupSize = 256; >> static constexpr unsigned int kGammaLookupSize = 1024; >> struct CcmColumn { >> @@ -142,12 +142,9 @@ private: >> debayerFn debayer3_; >> Rectangle window_; >> std::unique_ptr<SwStatsCpu> stats_; >> - std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; >> - unsigned int lineBufferLength_; >> - unsigned int lineBufferPadding_; >> - unsigned int lineBufferIndex_; >> unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ >> - bool enableInputMemcpy_; >> + >> + std::vector<std::unique_ptr<DebayerCpuThread>>threads_; > > Missing space before the name. Ack. > Tested-by: Barnabás Pőcze <barnabas.pocze@ideasonboard.com> # ThinkPad X1 Yoga Gen 7 + ov2740 Thanks. Regards, Hans
diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp index e7b012105..d57d640df 100644 --- a/src/libcamera/software_isp/debayer_cpu.cpp +++ b/src/libcamera/software_isp/debayer_cpu.cpp @@ -18,6 +18,8 @@ #include <linux/dma-buf.h> +#include <libcamera/base/thread.h> + #include <libcamera/formats.h> #include "libcamera/internal/bayer_format.h" @@ -27,6 +29,55 @@ namespace libcamera { +/** + * \brief Class representing one CPU debayering thread + * + * Implementation for CPU based debayering threads. + */ +class DebayerCpuThread : public Thread, public Object +{ +public: + DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, + bool enableInputMemcpy); + + void configure(unsigned int yStart, unsigned int yEnd); + void process(uint32_t frame, const uint8_t *src, uint8_t *dst); + +private: + void setupInputMemcpy(const uint8_t *linePointers[]); + void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); + void memcpyNextLine(const uint8_t *linePointers[]); + void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); + void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); + + /* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */ + static constexpr unsigned int kMaxLineBuffers = 5; + + DebayerCpu *debayer_; + unsigned int threadIndex_; + unsigned int yStart_; + unsigned int yEnd_; + unsigned int lineBufferLength_; + unsigned int lineBufferPadding_; + unsigned int lineBufferIndex_; + std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; + bool enableInputMemcpy_; +}; + +/** + * \brief Construct a DebayerCpuThread object + * \param[in] debayer pointer back to the DebayerCpuObject this thread belongs to + * \param[in] threadIndex 0 .. 
n thread-index value for the thread + * \param[in] enableInputMemcpy when set copy input data to a heap buffer before use + */ +DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, + bool enableInputMemcpy) + : Thread("DebayerCpu:" + std::to_string(threadIndex)), + debayer_(debayer), threadIndex_(threadIndex), + enableInputMemcpy_(enableInputMemcpy) +{ +} + /** * \class DebayerCpu * \brief Class for debayering on the CPU @@ -53,8 +104,14 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat * \todo Make memcpy automatic based on runtime detection of platform * capabilities. */ - enableInputMemcpy_ = + bool enableInputMemcpy = configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); + + /* Just one thread object for now, which will be called inline rather than async */ + threads_.resize(1); + + for (unsigned int i = 0; i < threads_.size(); i++) + threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy); } DebayerCpu::~DebayerCpu() = default; @@ -484,7 +541,7 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0) return -EINVAL; - if (stats_->configure(inputCfg) != 0) + if (stats_->configure(inputCfg, threads_.size()) != 0) return -EINVAL; const Size &statsPatternSize = stats_->patternSize(); @@ -548,17 +605,43 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg, */ stats_->setWindow(Rectangle(window_.size())); + unsigned int yStart = 0; + unsigned int linesPerThread = (window_.height / threads_.size()) & + ~(inputConfig_.patternSize.height - 1); + unsigned int i; + + for (i = 0; i < (threads_.size() - 1); i++) { + threads_[i]->configure(yStart, yStart + linesPerThread); + yStart += linesPerThread; + } + threads_[i]->configure(yStart, window_.height); + + return 0; +} + +/** + * \brief Configure thread to process a specific part of the image + * \param[in] yStart y coordinate of 
first line to process + * \param[in] yEnd y coordinate of the line at which to stop processing + * + * Configure the thread to process lines yStart - (yEnd - 1). + */ +void DebayerCpuThread::configure(unsigned int yStart, unsigned int yEnd) +{ + Debayer::DebayerInputConfig &inputConfig = debayer_->inputConfig_; + + yStart_ = yStart; + yEnd_ = yEnd; + /* pad with patternSize.Width on both left and right side */ - lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8; - lineBufferLength_ = window_.width * inputConfig_.bpp / 8 + + lineBufferPadding_ = inputConfig.patternSize.width * inputConfig.bpp / 8; + lineBufferLength_ = debayer_->window_.width * inputConfig.bpp / 8 + 2 * lineBufferPadding_; if (enableInputMemcpy_) { - for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++) + for (unsigned int i = 0; i <= inputConfig.patternSize.height; i++) lineBuffers_[i].resize(lineBufferLength_); } - - return 0; } /* @@ -599,9 +682,9 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size return std::make_tuple(stride, stride * size.height); } -void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) +void DebayerCpuThread::setupInputMemcpy(const uint8_t *linePointers[]) { - const unsigned int patternHeight = inputConfig_.patternSize.height; + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; if (!enableInputMemcpy_) return; @@ -617,20 +700,20 @@ void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[]) lineBufferIndex_ = patternHeight; } -void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src) +void DebayerCpuThread::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src) { - const unsigned int patternHeight = inputConfig_.patternSize.height; + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; for (unsigned int i = 0; i < patternHeight; i++) linePointers[i] = linePointers[i + 1]; - 
linePointers[patternHeight] = src + - (patternHeight / 2) * (int)inputConfig_.stride; + linePointers[patternHeight] = + src + (patternHeight / 2) * (int)debayer_->inputConfig_.stride; } -void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[]) +void DebayerCpuThread::memcpyNextLine(const uint8_t *linePointers[]) { - const unsigned int patternHeight = inputConfig_.patternSize.height; + const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height; if (!enableInputMemcpy_) return; @@ -643,23 +726,48 @@ void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[]) lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1); } -void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) +/** + * \brief Process part of the image assigned to this debayer thread + * \param[in] frame The frame number + * \param[in] src The source buffer + * \param[in] dst The destination buffer + */ +void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) { - unsigned int yEnd = window_.height; + Rectangle &window = debayer_->window_; + + /* Adjust src to top left corner of the window */ + src += (window.y + yStart_) * debayer_->inputConfig_.stride + + window.x * debayer_->inputConfig_.bpp / 8; + /* Adjust dst for yStart_ */ + dst += yStart_ * debayer_->outputConfig_.stride; + + if (debayer_->inputConfig_.patternSize.height == 2) + process2(frame, src, dst); + else + process4(frame, src, dst); +} + +void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) +{ + unsigned int outputStride = debayer_->outputConfig_.stride; + unsigned int inputStride = debayer_->inputConfig_.stride; + Rectangle &window = debayer_->window_; + unsigned int yEnd = yEnd_; /* Holds [0] previous- [1] current- [2] next-line */ const uint8_t *linePointers[3]; - /* Adjust src to top left corner of the window */ - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; - /* [x] becomes [x - 1] after initial 
shiftLinePointers() call */ - if (window_.y) { - linePointers[1] = src - inputConfig_.stride; /* previous-line */ + if (window.y + yStart_) { + linePointers[1] = src - inputStride; /* previous-line */ linePointers[2] = src; } else { - /* window_.y == 0, use the next line as prev line */ - linePointers[1] = src + inputConfig_.stride; + /* Top line, use the next line as prev line */ + linePointers[1] = src + inputStride; linePointers[2] = src; + } + + if (window.y == 0 && yEnd_ == window.height) { /* * Last 2 lines also need special handling. * (And configure() ensures that yEnd >= 2.) @@ -669,83 +777,93 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) setupInputMemcpy(linePointers); - for (unsigned int y = 0; y < yEnd; y += 2) { + /* + * Note y is the line-number *inside* the window, since stats_' window + * is the stats window inside/relative to the debayer window. IOW for + * single thread rendering y goes from 0 to window.height. + */ + for (unsigned int y = yStart_; y < yEnd; y += 2) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, y, linePointers); - (this->*debayer0_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_); + debayer_->debayer0(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - (this->*debayer1_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->debayer1(dst, linePointers); + src += inputStride; + dst += outputStride; } - if (window_.y == 0) { + if (window.y == 0 && yEnd_ == window.height) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, yEnd, linePointers); - (this->*debayer0_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + 
debayer_->stats_->processLine0(frame, yEnd, linePointers, threadIndex_); + debayer_->debayer0(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); /* next line may point outside of src, use prev. */ linePointers[2] = linePointers[0]; - (this->*debayer1_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->debayer1(dst, linePointers); + src += inputStride; + dst += outputStride; } } -void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) +void DebayerCpuThread::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) { + unsigned int outputStride = debayer_->outputConfig_.stride; + unsigned int inputStride = debayer_->inputConfig_.stride; + /* * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line * [3] 1-line-down [4] 2-lines-down. */ const uint8_t *linePointers[5]; - /* Adjust src to top left corner of the window */ - src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8; - /* [x] becomes [x - 1] after initial shiftLinePointers() call */ - linePointers[1] = src - 2 * inputConfig_.stride; - linePointers[2] = src - inputConfig_.stride; + linePointers[1] = src - 2 * inputStride; + linePointers[2] = src - inputStride; linePointers[3] = src; - linePointers[4] = src + inputConfig_.stride; + linePointers[4] = src + inputStride; setupInputMemcpy(linePointers); - for (unsigned int y = 0; y < window_.height; y += 4) { + /* + * Note y is the line-number *inside* the window, since stats_' window + * is the stats window inside/relative to the debayer window. IOW for + * single thread rendering y goes from 0 to window.height. 
+ */ + for (unsigned int y = yStart_; y < yEnd_; y += 4) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, y, linePointers); - (this->*debayer0_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_); + debayer_->debayer0(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - (this->*debayer1_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->debayer1(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine2(frame, y, linePointers); - (this->*debayer2_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->stats_->processLine2(frame, y, linePointers, threadIndex_); + debayer_->debayer2(dst, linePointers); + src += inputStride; + dst += outputStride; shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - (this->*debayer3_)(dst, linePointers); - src += inputConfig_.stride; - dst += outputConfig_.stride; + debayer_->debayer3(dst, linePointers); + src += inputStride; + dst += outputStride; } } @@ -867,10 +985,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output stats_->startFrame(frame); - if (inputConfig_.patternSize.height == 2) - process2(frame, in.planes()[0].data(), out.planes()[0].data()); - else - process4(frame, in.planes()[0].data(), out.planes()[0].data()); + threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); metadata.planes()[0].bytesused = out.planes()[0].size(); diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h index 7a6517462..780576090 100644 --- a/src/libcamera/software_isp/debayer_cpu.h +++ b/src/libcamera/software_isp/debayer_cpu.h 
@@ -26,6 +26,7 @@ namespace libcamera { +class DebayerCpuThread; class DebayerCpu : public Debayer { public: @@ -44,6 +45,8 @@ public: const SharedFD &getStatsFD() { return stats_->getStatsFD(); } private: + friend class DebayerCpuThread; + /** * \brief Called to debayer 1 line of Bayer input data to output format * \param[out] dst Pointer to the start of the output line to write @@ -74,6 +77,11 @@ private: */ using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]); + void debayer0(uint8_t *dst, const uint8_t *src[]) { (this->*debayer0_)(dst, src); } + void debayer1(uint8_t *dst, const uint8_t *src[]) { (this->*debayer1_)(dst, src); } + void debayer2(uint8_t *dst, const uint8_t *src[]) { (this->*debayer2_)(dst, src); } + void debayer3(uint8_t *dst, const uint8_t *src[]) { (this->*debayer3_)(dst, src); } + /* 8-bit raw bayer format */ template<bool addAlphaByte, bool ccmEnabled> void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]); @@ -105,17 +113,9 @@ private: int setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputFormat, bool ccmEnabled); - void setupInputMemcpy(const uint8_t *linePointers[]); - void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src); - void memcpyNextLine(const uint8_t *linePointers[]); - void process2(uint32_t frame, const uint8_t *src, uint8_t *dst); - void process4(uint32_t frame, const uint8_t *src, uint8_t *dst); void updateGammaTable(const DebayerParams &params); void updateLookupTables(const DebayerParams &params); - /* Max. 
supported Bayer pattern height is 4, debayering this requires 5 lines */ - static constexpr unsigned int kMaxLineBuffers = 5; - static constexpr unsigned int kRGBLookupSize = 256; static constexpr unsigned int kGammaLookupSize = 1024; struct CcmColumn { @@ -142,12 +142,9 @@ private: debayerFn debayer3_; Rectangle window_; std::unique_ptr<SwStatsCpu> stats_; - std::vector<uint8_t> lineBuffers_[kMaxLineBuffers]; - unsigned int lineBufferLength_; - unsigned int lineBufferPadding_; - unsigned int lineBufferIndex_; unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ - bool enableInputMemcpy_; + + std::vector<std::unique_ptr<DebayerCpuThread>>threads_; }; } /* namespace libcamera */