| Message ID | 20260216190204.106922-6-johannes.goede@oss.qualcomm.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series |
|
| Related | show |
Hi 2026. 02. 16. 20:02 keltezéssel, Hans de Goede írta: > Add CPU soft ISP multi-threading support. > > Benchmark results for the Uno-Q with a weak CPU which is good for > performance testing, all numbers with an IMX219 running at > 3280x2464 -> 3272x2464: > > 1 thread : 147ms / frame, ~6.5 fps > 2 threads: 81ms / frame, ~12 fps > 3 threads: 66ms / frame, ~14.5 fps > > Adding a 4th thread does not improve performance. > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++----- > src/libcamera/software_isp/debayer_cpu.h | 2 +- > 2 files changed, 40 insertions(+), 11 deletions(-) > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > index 5e168554..c4b6c5b8 100644 > --- a/src/libcamera/software_isp/debayer_cpu.cpp > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > @@ -14,6 +14,7 @@ > #include <algorithm> > #include <stdlib.h> > #include <sys/ioctl.h> > +#include <thread> > #include <time.h> > #include <utility> > > @@ -41,7 +42,7 @@ namespace libcamera { > * \param[in] configuration The global configuration > */ > DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration) > - : Debayer(configuration), stats_(std::move(stats)), threadCount_(1) > + : Debayer(configuration), stats_(std::move(stats)) > { > /* > * Reading from uncached buffers may be very slow. > @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat > */ > enableInputMemcpy_ = > configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); > + threadCount_ = > + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3); > + threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads); > } > > DebayerCpu::~DebayerCpu() = default; > @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, > for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers, threadData); > - stats_->processLine0(frame, y, linePointers, &statsBuffer_); > + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, > if (threadData->processLastLinesSeperately) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers, threadData); > - stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_); > + stats_->processLine0(frame, threadData->yEnd, linePointers, > + threadData->statsBuffer); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, > for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers, threadData); > - stats_->processLine0(frame, y, linePointers, &statsBuffer_); > + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, > > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers, threadData); > - stats_->processLine2(frame, y, linePointers, &statsBuffer_); > + stats_->processLine2(frame, y, linePointers, threadData->statsBuffer); > (this->*debayer2_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams ¶ms) > > void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms) > { > + std::unique_ptr<std::thread> threads[threadCount_ - 1]; No need for the unique_ptr here, just `std::array<std::thread, kMaxThread - 1>` should be sufficient. > + SwIspStats statsBuffer[threadCount_]; > + unsigned int i; > + > bench_.startFrame(); > > std::vector<DmaSyncer> dmaSyncers; > @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > return; > } > > - stats_->startFrame(frame, &statsBuffer_, 1); > + stats_->startFrame(frame, statsBuffer, threadCount_); > > - threadData_[0].yStart = 0; > - threadData_[0].yEnd = window_.height; > - (this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); > + unsigned int yStart = 0; > + unsigned int linesPerThread = (window_.height / threadCount_) & > + ~(inputConfig_.patternSize.width - 1); > + for (i = 0; i < (threadCount_ - 1); i++) { > + threadData_[i].yStart = yStart; > + threadData_[i].yEnd = yStart + linesPerThread; > + threadData_[i].statsBuffer = &statsBuffer[i]; > + threads[i] = std::make_unique<std::thread>( > + processInner_, this, frame, > + in.planes()[0].data(), > + out.planes()[0].data() + yStart * outputConfig_.stride, > + &threadData_[i]); > + yStart += linesPerThread; > + } > + threadData_[i].yStart = yStart; > + threadData_[i].yEnd = window_.height; > + threadData_[i].statsBuffer = &statsBuffer[i]; > + (this->*processInner_)(frame, in.planes()[0].data(), > + out.planes()[0].data() + yStart * outputConfig_.stride, > + &threadData_[i]); > + > + for (i = 0; i < (threadCount_ - 1); i++) > + threads[i]->join(); This is creating and then joining x threads *per frame*, right? I feel like that could be improved. Have you looked at having a set of long-running threads? (Even something very simple, e.g. just threads with a mutex and condvar, started in start() and stopped in stop()?) > > metadata.planes()[0].bytesused = out.planes()[0].size(); > > @@ -909,7 +938,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > * > * \todo Pass real bufferId once stats buffer passing is changed. > */ > - stats_->finishFrame(frame, 0, &statsBuffer_, 1); > + stats_->finishFrame(frame, 0, statsBuffer, threadCount_); > outputBufferReady.emit(output); > inputBufferReady.emit(input); > } > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > index b85dd11c..63fa7710 100644 > --- a/src/libcamera/software_isp/debayer_cpu.h > +++ b/src/libcamera/software_isp/debayer_cpu.h > @@ -85,6 +85,7 @@ private: > unsigned int lineBufferIndex; > /* Stored here to avoid causing register pressure in inner loop */ > bool processLastLinesSeperately; > + SwIspStats *statsBuffer; > }; > > using processFn = void (DebayerCpu::*)(uint32_t frame, const uint8_t *src, uint8_t *dst, > @@ -150,7 +151,6 @@ private: > Rectangle window_; > > /* Variables used every line */ > - SwIspStats statsBuffer_; > debayerFn debayer0_; > debayerFn debayer1_; > debayerFn debayer2_;
On Tue, Feb 17, 2026 at 10:07:26AM +0100, Barnabás Pőcze wrote: > Hi > > 2026. 02. 16. 20:02 keltezéssel, Hans de Goede írta: > > Add CPU soft ISP multi-threading support. > > > > Benchmark results for the Uno-Q with a weak CPU which is good for > > performance testing, all numbers with an IMX219 running at > > 3280x2464 -> 3272x2464: > > > > 1 thread : 147ms / frame, ~6.5 fps > > 2 threads: 81ms / frame, ~12 fps > > 3 threads: 66ms / frame, ~14.5 fps > > > > Adding a 4th thread does not improve performance. > > > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > > --- > > src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++----- > > src/libcamera/software_isp/debayer_cpu.h | 2 +- > > 2 files changed, 40 insertions(+), 11 deletions(-) > > > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > > index 5e168554..c4b6c5b8 100644 > > --- a/src/libcamera/software_isp/debayer_cpu.cpp > > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > > @@ -14,6 +14,7 @@ > > #include <algorithm> > > #include <stdlib.h> > > #include <sys/ioctl.h> > > +#include <thread> > > #include <time.h> > > #include <utility> > > > > @@ -41,7 +42,7 @@ namespace libcamera { > > * \param[in] configuration The global configuration > > */ > > DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration) > > - : Debayer(configuration), stats_(std::move(stats)), threadCount_(1) > > + : Debayer(configuration), stats_(std::move(stats)) > > { > > /* > > * Reading from uncached buffers may be very slow. > > @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat > > */ > > enableInputMemcpy_ = > > configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); > > + threadCount_ = > > + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3); > > + threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads); > > } > > > > DebayerCpu::~DebayerCpu() = default; > > @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, > > for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) { > > shiftLinePointers(linePointers, src); > > memcpyNextLine(linePointers, threadData); > > - stats_->processLine0(frame, y, linePointers, &statsBuffer_); > > + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); > > (this->*debayer0_)(dst, linePointers); > > src += inputConfig_.stride; > > dst += outputConfig_.stride; > > @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, > > if (threadData->processLastLinesSeperately) { > > shiftLinePointers(linePointers, src); > > memcpyNextLine(linePointers, threadData); > > - stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_); > > + stats_->processLine0(frame, threadData->yEnd, linePointers, > > + threadData->statsBuffer); > > (this->*debayer0_)(dst, linePointers); > > src += inputConfig_.stride; > > dst += outputConfig_.stride; > > @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, > > for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) { > > shiftLinePointers(linePointers, src); > > memcpyNextLine(linePointers, threadData); > > - stats_->processLine0(frame, y, linePointers, &statsBuffer_); > > + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); > > (this->*debayer0_)(dst, linePointers); > > src += inputConfig_.stride; > > dst += outputConfig_.stride; > > @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, > > > > shiftLinePointers(linePointers, src); > > memcpyNextLine(linePointers, threadData); > > - stats_->processLine2(frame, y, linePointers, &statsBuffer_); > > + stats_->processLine2(frame, y, linePointers, threadData->statsBuffer); > > (this->*debayer2_)(dst, linePointers); > > src += inputConfig_.stride; > > dst += outputConfig_.stride; > > @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams ¶ms) > > > > void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms) > > { > > + std::unique_ptr<std::thread> threads[threadCount_ - 1]; > > No need for the unique_ptr here, just `std::array<std::thread, kMaxThread - 1>` should be sufficient. > > > > + SwIspStats statsBuffer[threadCount_]; > > + unsigned int i; > > + > > bench_.startFrame(); > > > > std::vector<DmaSyncer> dmaSyncers; > > @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > > return; > > } > > > > - stats_->startFrame(frame, &statsBuffer_, 1); > > + stats_->startFrame(frame, statsBuffer, threadCount_); > > > > - threadData_[0].yStart = 0; > > - threadData_[0].yEnd = window_.height; > > - (this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); > > + unsigned int yStart = 0; > > + unsigned int linesPerThread = (window_.height / threadCount_) & > > + ~(inputConfig_.patternSize.width - 1); > > + for (i = 0; i < (threadCount_ - 1); i++) { > > + threadData_[i].yStart = yStart; > > + threadData_[i].yEnd = yStart + linesPerThread; > > + threadData_[i].statsBuffer = &statsBuffer[i]; > > + threads[i] = std::make_unique<std::thread>( > > + processInner_, this, frame, > > + in.planes()[0].data(), > > + out.planes()[0].data() + yStart * outputConfig_.stride, > > + &threadData_[i]); > > + yStart += linesPerThread; > > + } > > + threadData_[i].yStart = yStart; > > + threadData_[i].yEnd = window_.height; > > + threadData_[i].statsBuffer = &statsBuffer[i]; > > + (this->*processInner_)(frame, in.planes()[0].data(), > > + out.planes()[0].data() + yStart * outputConfig_.stride, > > + &threadData_[i]); > > + > > + for (i = 0; i < (threadCount_ - 1); i++) > > + threads[i]->join(); > > This is creating and then joining x threads *per frame*, right? I feel like that > could be improved. Have you looked at having a set of long-running threads? (Even > something very simple, e.g. just threads with a mutex and condvar, started in start() > and stopped in stop()?) Furthermore, this should use the Thread class instead of std::thread, and it should subclass Thread to stored the per-thread data. > > > > metadata.planes()[0].bytesused = out.planes()[0].size(); > > > > @@ -909,7 +938,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > > * > > * \todo Pass real bufferId once stats buffer passing is changed. > > */ > > - stats_->finishFrame(frame, 0, &statsBuffer_, 1); > > + stats_->finishFrame(frame, 0, statsBuffer, threadCount_); > > outputBufferReady.emit(output); > > inputBufferReady.emit(input); > > } > > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > > index b85dd11c..63fa7710 100644 > > --- a/src/libcamera/software_isp/debayer_cpu.h > > +++ b/src/libcamera/software_isp/debayer_cpu.h > > @@ -85,6 +85,7 @@ private: > > unsigned int lineBufferIndex; > > /* Stored here to avoid causing register pressure in inner loop */ > > bool processLastLinesSeperately; > > + SwIspStats *statsBuffer; > > }; > > > > using processFn = void (DebayerCpu::*)(uint32_t frame, const uint8_t *src, uint8_t *dst, > > @@ -150,7 +151,6 @@ private: > > Rectangle window_; > > > > /* Variables used every line */ > > - SwIspStats statsBuffer_; > > debayerFn debayer0_; > > debayerFn debayer1_; > > debayerFn debayer2_;
Hi Laurent, On 19-Feb-26 15:15, Laurent Pinchart wrote: > On Tue, Feb 17, 2026 at 10:07:26AM +0100, Barnabás Pőcze wrote: >> Hi >> >> 2026. 02. 16. 20:02 keltezéssel, Hans de Goede írta: >>> Add CPU soft ISP multi-threading support. >>> >>> Benchmark results for the Uno-Q with a weak CPU which is good for >>> performance testing, all numbers with an IMX219 running at >>> 3280x2464 -> 3272x2464: >>> >>> 1 thread : 147ms / frame, ~6.5 fps >>> 2 threads: 81ms / frame, ~12 fps >>> 3 threads: 66ms / frame, ~14.5 fps >>> >>> Adding a 4th thread does not improve performance. >>> >>> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> >>> --- >>> src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++----- >>> src/libcamera/software_isp/debayer_cpu.h | 2 +- >>> 2 files changed, 40 insertions(+), 11 deletions(-) >>> >>> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp >>> index 5e168554..c4b6c5b8 100644 >>> --- a/src/libcamera/software_isp/debayer_cpu.cpp >>> +++ b/src/libcamera/software_isp/debayer_cpu.cpp >>> @@ -14,6 +14,7 @@ >>> #include <algorithm> >>> #include <stdlib.h> >>> #include <sys/ioctl.h> >>> +#include <thread> >>> #include <time.h> >>> #include <utility> >>> >>> @@ -41,7 +42,7 @@ namespace libcamera { >>> * \param[in] configuration The global configuration >>> */ >>> DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration) >>> - : Debayer(configuration), stats_(std::move(stats)), threadCount_(1) >>> + : Debayer(configuration), stats_(std::move(stats)) >>> { >>> /* >>> * Reading from uncached buffers may be very slow. >>> @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat >>> */ >>> enableInputMemcpy_ = >>> configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); >>> + threadCount_ = >>> + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3); >>> + threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads); >>> } >>> >>> DebayerCpu::~DebayerCpu() = default; >>> @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, >>> for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) { >>> shiftLinePointers(linePointers, src); >>> memcpyNextLine(linePointers, threadData); >>> - stats_->processLine0(frame, y, linePointers, &statsBuffer_); >>> + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); >>> (this->*debayer0_)(dst, linePointers); >>> src += inputConfig_.stride; >>> dst += outputConfig_.stride; >>> @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, >>> if (threadData->processLastLinesSeperately) { >>> shiftLinePointers(linePointers, src); >>> memcpyNextLine(linePointers, threadData); >>> - stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_); >>> + stats_->processLine0(frame, threadData->yEnd, linePointers, >>> + threadData->statsBuffer); >>> (this->*debayer0_)(dst, linePointers); >>> src += inputConfig_.stride; >>> dst += outputConfig_.stride; >>> @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, >>> for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) { >>> shiftLinePointers(linePointers, src); >>> memcpyNextLine(linePointers, threadData); >>> - stats_->processLine0(frame, y, linePointers, &statsBuffer_); >>> + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); >>> (this->*debayer0_)(dst, linePointers); >>> src += inputConfig_.stride; >>> dst += outputConfig_.stride; >>> @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, >>> >>> shiftLinePointers(linePointers, src); >>> memcpyNextLine(linePointers, threadData); >>> - stats_->processLine2(frame, y, linePointers, &statsBuffer_); >>> + stats_->processLine2(frame, y, linePointers, threadData->statsBuffer); >>> (this->*debayer2_)(dst, linePointers); >>> src += inputConfig_.stride; >>> dst += outputConfig_.stride; >>> @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams ¶ms) >>> >>> void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms) >>> { >>> + std::unique_ptr<std::thread> threads[threadCount_ - 1]; >> >> No need for the unique_ptr here, just `std::array<std::thread, kMaxThread - 1>` should be sufficient. >> >> >>> + SwIspStats statsBuffer[threadCount_]; >>> + unsigned int i; >>> + >>> bench_.startFrame(); >>> >>> std::vector<DmaSyncer> dmaSyncers; >>> @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output >>> return; >>> } >>> >>> - stats_->startFrame(frame, &statsBuffer_, 1); >>> + stats_->startFrame(frame, statsBuffer, threadCount_); >>> >>> - threadData_[0].yStart = 0; >>> - threadData_[0].yEnd = window_.height; >>> - (this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); >>> + unsigned int yStart = 0; >>> + unsigned int linesPerThread = (window_.height / threadCount_) & >>> + ~(inputConfig_.patternSize.width - 1); >>> + for (i = 0; i < (threadCount_ - 1); i++) { >>> + threadData_[i].yStart = yStart; >>> + threadData_[i].yEnd = yStart + linesPerThread; >>> + threadData_[i].statsBuffer = &statsBuffer[i]; >>> + threads[i] = std::make_unique<std::thread>( >>> + processInner_, this, frame, >>> + in.planes()[0].data(), >>> + out.planes()[0].data() + yStart * outputConfig_.stride, >>> + &threadData_[i]); >>> + yStart += linesPerThread; >>> + } >>> + threadData_[i].yStart = yStart; >>> + threadData_[i].yEnd = window_.height; >>> + threadData_[i].statsBuffer = &statsBuffer[i]; >>> + (this->*processInner_)(frame, in.planes()[0].data(), >>> + out.planes()[0].data() + yStart * outputConfig_.stride, >>> + &threadData_[i]); >>> + >>> + for (i = 0; i < (threadCount_ - 1); i++) >>> + threads[i]->join(); >> >> This is creating and then joining x threads *per frame*, right? I feel like that >> could be improved. Have you looked at having a set of long-running threads? (Even >> something very simple, e.g. just threads with a mutex and condvar, started in start() >> and stopped in stop()?) Ack, I agree that only creating N threads once and re-using them would be better. I started with the KISS approach of just firing of a new thread every frame for v1, because C++'s stdlib still lacks any kind of ThreadPool mechanism which is kinda sad in 2026. > Furthermore, this should use the Thread class instead of std::thread, > and it should subclass Thread to stored the per-thread data. I looked into using the libcamera Thread class for this, but it seems unsuitable for this purpose. AFAICT is designed to have methods called on an object which was moved to thread, which is not what is happening here. I did consider introducing a new DebayerCpuThread object which would then contain the per thread-data. This would require that class to have a copy of the minimum set of other vars (of which there are not much) needed for the actual inner-loop processing. Then I can move those objects to threads and use ConnectionTypeQueued calls for the actual process() call. But I do not see any way to wait for queued calls to finish on an Object type ? The whole Thread / Object / BoundMethod trio seems to be designed around message dispatching / async calling methods. But here I want a worker thread to repeatedly do the same thing. What I'm envisioning for this is using a std::thread with a main func like this: thread_main(int threadIdx) { while (1) { workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx)); if (stop_) break; process(); mutex_.lock() pendingWork &= ~(1 << threadIdx); mutex_.unlock() workDoneCondvar_.notify_all(); } } Which is something which the Thread class does not let me do, AFAICT the Thread class does not let me specify my own thread function, instead forcing use of the message mechanism. So I think sticking with std::thread is better in this case. Again please advise how you want to proceed. I was about to rework things to something like the above based on Barnabás' comments. But I'll hold of on doing a v2 until we've some agreement on how to handle the threading otherwise I'll just be wasting my time. Regards, Hans
On Thu, Feb 19, 2026 at 06:07:28PM +0100, Hans de Goede wrote: > On 19-Feb-26 15:15, Laurent Pinchart wrote: > > On Tue, Feb 17, 2026 at 10:07:26AM +0100, Barnabás Pőcze wrote: > >> 2026. 02. 16. 20:02 keltezéssel, Hans de Goede írta: > >>> Add CPU soft ISP multi-threading support. > >>> > >>> Benchmark results for the Uno-Q with a weak CPU which is good for > >>> performance testing, all numbers with an IMX219 running at > >>> 3280x2464 -> 3272x2464: > >>> > >>> 1 thread : 147ms / frame, ~6.5 fps > >>> 2 threads: 81ms / frame, ~12 fps > >>> 3 threads: 66ms / frame, ~14.5 fps > >>> > >>> Adding a 4th thread does not improve performance. > >>> > >>> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > >>> --- > >>> src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++----- > >>> src/libcamera/software_isp/debayer_cpu.h | 2 +- > >>> 2 files changed, 40 insertions(+), 11 deletions(-) > >>> > >>> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > >>> index 5e168554..c4b6c5b8 100644 > >>> --- a/src/libcamera/software_isp/debayer_cpu.cpp > >>> +++ b/src/libcamera/software_isp/debayer_cpu.cpp > >>> @@ -14,6 +14,7 @@ > >>> #include <algorithm> > >>> #include <stdlib.h> > >>> #include <sys/ioctl.h> > >>> +#include <thread> > >>> #include <time.h> > >>> #include <utility> > >>> > >>> @@ -41,7 +42,7 @@ namespace libcamera { > >>> * \param[in] configuration The global configuration > >>> */ > >>> DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration) > >>> - : Debayer(configuration), stats_(std::move(stats)), threadCount_(1) > >>> + : Debayer(configuration), stats_(std::move(stats)) > >>> { > >>> /* > >>> * Reading from uncached buffers may be very slow. > >>> @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat > >>> */ > >>> enableInputMemcpy_ = > >>> configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); > >>> + threadCount_ = > >>> + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3); > >>> + threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads); > >>> } > >>> > >>> DebayerCpu::~DebayerCpu() = default; > >>> @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, > >>> for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) { > >>> shiftLinePointers(linePointers, src); > >>> memcpyNextLine(linePointers, threadData); > >>> - stats_->processLine0(frame, y, linePointers, &statsBuffer_); > >>> + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); > >>> (this->*debayer0_)(dst, linePointers); > >>> src += inputConfig_.stride; > >>> dst += outputConfig_.stride; > >>> @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, > >>> if (threadData->processLastLinesSeperately) { > >>> shiftLinePointers(linePointers, src); > >>> memcpyNextLine(linePointers, threadData); > >>> - stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_); > >>> + stats_->processLine0(frame, threadData->yEnd, linePointers, > >>> + threadData->statsBuffer); > >>> (this->*debayer0_)(dst, linePointers); > >>> src += inputConfig_.stride; > >>> dst += outputConfig_.stride; > >>> @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, > >>> for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) { > >>> shiftLinePointers(linePointers, src); > >>> memcpyNextLine(linePointers, threadData); > >>> - stats_->processLine0(frame, y, linePointers, &statsBuffer_); > >>> + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); > >>> (this->*debayer0_)(dst, linePointers); > >>> src += inputConfig_.stride; > >>> dst += outputConfig_.stride; > >>> @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, > >>> > >>> shiftLinePointers(linePointers, src); > >>> memcpyNextLine(linePointers, threadData); > >>> - stats_->processLine2(frame, y, linePointers, &statsBuffer_); > >>> + stats_->processLine2(frame, y, linePointers, threadData->statsBuffer); > >>> (this->*debayer2_)(dst, linePointers); > >>> src += inputConfig_.stride; > >>> dst += outputConfig_.stride; > >>> @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams ¶ms) > >>> > >>> void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms) > >>> { > >>> + std::unique_ptr<std::thread> threads[threadCount_ - 1]; > >> > >> No need for the unique_ptr here, just `std::array<std::thread, kMaxThread - 1>` should be sufficient. > >> > >> > >>> + SwIspStats statsBuffer[threadCount_]; > >>> + unsigned int i; > >>> + > >>> bench_.startFrame(); > >>> > >>> std::vector<DmaSyncer> dmaSyncers; > >>> @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > >>> return; > >>> } > >>> > >>> - stats_->startFrame(frame, &statsBuffer_, 1); > >>> + stats_->startFrame(frame, statsBuffer, threadCount_); > >>> > >>> - threadData_[0].yStart = 0; > >>> - threadData_[0].yEnd = window_.height; > >>> - (this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); > >>> + unsigned int yStart = 0; > >>> + unsigned int linesPerThread = (window_.height / threadCount_) & > >>> + ~(inputConfig_.patternSize.width - 1); > >>> + for (i = 0; i < (threadCount_ - 1); i++) { > >>> + threadData_[i].yStart = yStart; > >>> + threadData_[i].yEnd = yStart + linesPerThread; > >>> + threadData_[i].statsBuffer = &statsBuffer[i]; > >>> + threads[i] = std::make_unique<std::thread>( > >>> + processInner_, this, frame, > >>> + in.planes()[0].data(), > >>> + out.planes()[0].data() + yStart * outputConfig_.stride, > >>> + &threadData_[i]); > >>> + yStart += linesPerThread; > >>> + } > >>> + threadData_[i].yStart = yStart; > >>> + threadData_[i].yEnd = window_.height; > >>> + threadData_[i].statsBuffer = &statsBuffer[i]; > >>> + (this->*processInner_)(frame, in.planes()[0].data(), > >>> + out.planes()[0].data() + yStart * outputConfig_.stride, > >>> + &threadData_[i]); > >>> + > >>> + for (i = 0; i < (threadCount_ - 1); i++) > >>> + threads[i]->join(); > >> > >> This is creating and then joining x threads *per frame*, right? I feel like that > >> could be improved. Have you looked at having a set of long-running threads? (Even > >> something very simple, e.g. just threads with a mutex and condvar, started in start() > >> and stopped in stop()?) > > Ack, I agree that only creating N threads once and re-using them > would be better. > > I started with the KISS approach of just firing of a new thread > every frame for v1, because C++'s stdlib still lacks any kind > of ThreadPool mechanism which is kinda sad in 2026. > > > Furthermore, this should use the Thread class instead of std::thread, > > and it should subclass Thread to stored the per-thread data. > > I looked into using the libcamera Thread class for this, > but it seems unsuitable for this purpose. AFAICT is designed > to have methods called on an object which was moved to thread, > which is not what is happening here. > > I did consider introducing a new DebayerCpuThread object which > would then contain the per thread-data. This would require that > class to have a copy of the minimum set of other vars (of which > there are not much) needed for the actual inner-loop processing. > > Then I can move those objects to threads and use ConnectionTypeQueued > calls for the actual process() call. > > But I do not see any way to wait for queued calls to finish > on an Object type ? > > The whole Thread / Object / BoundMethod trio seems to be designed > around message dispatching / async calling methods. But here > I want a worker thread to repeatedly do the same thing. > > What I'm envisioning for this is using a std::thread with > a main func like this: > > > thread_main(int threadIdx) > { > while (1) { > workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx)); > > if (stop_) > break; > > process(); > > mutex_.lock() > pendingWork &= ~(1 << threadIdx); > mutex_.unlock() > > workDoneCondvar_.notify_all(); > } > } > > Which is something which the Thread class does not let me > do, AFAICT the Thread class does not let me specify my > own thread function, instead forcing use of the message > mechanism. You don't have to use messages. By default the Thread class will instantiate an event loop, but you can override the run() function to provide your own thread loop. Have you considered that ? Note that if you have a fixed number of threads, each processing a pre-defined part of the image, messages can still be useful as a communication primitive. You can use invokeMethod() with queued delivery, and benefit from the message queue to deliver the work to the thread asynchronously. If you want to implement a thread pool where each thread would automatically pick the next work item when it goes back to its idle loop then you will need your own run() function. > So I think sticking with std::thread is better in this case. > > Again please advise how you want to proceed. I was about to > rework things to something like the above based on Barnabás' > comments. But I'll hold of on doing a v2 until we've some > agreement on how to handle the threading otherwise I'll just > be wasting my time.
Hi Laurent, On 19-Feb-26 18:22, Laurent Pinchart wrote: <snip> >>>> This is creating and then joining x threads *per frame*, right? I feel like that >>>> could be improved. Have you looked at having a set of long-running threads? (Even >>>> something very simple, e.g. just threads with a mutex and condvar, started in start() >>>> and stopped in stop()?) >> >> Ack, I agree that only creating N threads once and re-using them >> would be better. >> >> I started with the KISS approach of just firing of a new thread >> every frame for v1, because C++'s stdlib still lacks any kind >> of ThreadPool mechanism which is kinda sad in 2026. >> >>> Furthermore, this should use the Thread class instead of std::thread, >>> and it should subclass Thread to stored the per-thread data. >> >> I looked into using the libcamera Thread class for this, >> but it seems unsuitable for this purpose. AFAICT is designed >> to have methods called on an object which was moved to thread, >> which is not what is happening here. >> >> I did consider introducing a new DebayerCpuThread object which >> would then contain the per thread-data. This would require that >> class to have a copy of the minimum set of other vars (of which >> there are not much) needed for the actual inner-loop processing. >> >> Then I can move those objects to threads and use ConnectionTypeQueued >> calls for the actual process() call. >> >> But I do not see any way to wait for queued calls to finish >> on an Object type ? >> >> The whole Thread / Object / BoundMethod trio seems to be designed >> around message dispatching / async calling methods. But here >> I want a worker thread to repeatedly do the same thing. >> >> What I'm envisioning for this is using a std::thread with >> a main func like this: >> >> >> thread_main(int threadIdx) >> { >> while (1) { >> workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx)); >> >> if (stop_) >> break; >> >> process(); >> >> mutex_.lock() >> pendingWork &= ~(1 << threadIdx); >> mutex_.unlock() >> >> workDoneCondvar_.notify_all(); >> } >> } >> >> Which is something which the Thread class does not let me >> do, AFAICT the Thread class does not let me specify my >> own thread function, instead forcing use of the message >> mechanism. > > You don't have to use messages. By default the Thread class will > instantiate an event loop, but you can override the run() function to > provide your own thread loop. Have you considered that ? I did not consider that before. I've just looked into this a bit, I think I can make this work. > Note that if you have a fixed number of threads, each processing a > pre-defined part of the image, messages can still be useful as a > communication primitive. You can use invokeMethod() with queued > delivery, and benefit from the message queue to deliver the work to the > thread asynchronously. I do plan to have a fixed (configurable in config file) number of threads, like in this v1 patch. and being able to use invokeMethod does sound useful, the question is how do I make the "main" process() method wait for all (1 per debayerCpuThread object) invokeMethod calls to complete ? Should I add a ConditionVariable for this and have the run() override function notify this ConditionVariable, or is their already some mechanism for this which I'm missing ? Regards, Hans
On Thu, Feb 19, 2026 at 07:20:41PM +0100, Hans de Goede wrote: > Hi Laurent, > > On 19-Feb-26 18:22, Laurent Pinchart wrote: > > <snip> > > >>>> This is creating and then joining x threads *per frame*, right? I feel like that > >>>> could be improved. Have you looked at having a set of long-running threads? (Even > >>>> something very simple, e.g. just threads with a mutex and condvar, started in start() > >>>> and stopped in stop()?) > >> > >> Ack, I agree that only creating N threads once and re-using them > >> would be better. > >> > >> I started with the KISS approach of just firing of a new thread > >> every frame for v1, because C++'s stdlib still lacks any kind > >> of ThreadPool mechanism which is kinda sad in 2026. > >> > >>> Furthermore, this should use the Thread class instead of std::thread, > >>> and it should subclass Thread to stored the per-thread data. > >> > >> I looked into using the libcamera Thread class for this, > >> but it seems unsuitable for this purpose. AFAICT is designed > >> to have methods called on an object which was moved to thread, > >> which is not what is happening here. > >> > >> I did consider introducing a new DebayerCpuThread object which > >> would then contain the per thread-data. This would require that > >> class to have a copy of the minimum set of other vars (of which > >> there are not much) needed for the actual inner-loop processing. > >> > >> Then I can move those objects to threads and use ConnectionTypeQueued > >> calls for the actual process() call. > >> > >> But I do not see any way to wait for queued calls to finish > >> on an Object type ? > >> > >> The whole Thread / Object / BoundMethod trio seems to be designed > >> around message dispatching / async calling methods. But here > >> I want a worker thread to repeatedly do the same thing. > >> > >> What I'm envisioning for this is using a std::thread with > >> a main func like this: > >> > >> > >> thread_main(int threadIdx) > >> { > >> while (1) { > >> workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx)); > >> > >> if (stop_) > >> break; > >> > >> process(); > >> > >> mutex_.lock() > >> pendingWork &= ~(1 << threadIdx); > >> mutex_.unlock() > >> > >> workDoneCondvar_.notify_all(); > >> } > >> } > >> > >> Which is something which the Thread class does not let me > >> do, AFAICT the Thread class does not let me specify my > >> own thread function, instead forcing use of the message > >> mechanism. > > > > You don't have to use messages. By default the Thread class will > > instantiate an event loop, but you can override the run() function to > > provide your own thread loop. Have you considered that ? > > I did not consider that before. I've just looked into this a bit, > I think I can make this work. > > > Note that if you have a fixed number of threads, each processing a > > pre-defined part of the image, messages can still be useful as a > > communication primitive. You can use invokeMethod() with queued > > delivery, and benefit from the message queue to deliver the work to the > > thread asynchronously. > > I do plan to have a fixed (configurable in config file) number > of threads, like in this v1 patch. > > and being able to use invokeMethod does sound useful, Note that invokeMethod() requires the receiving object to be bound to a thread that runs an event loop, so you won't be able to override run() in that case (or, to be precise, you can override run() but you will have to call exec() from your implementation, so you can't run your own thread loop). > the question > is how do I make the "main" process() method wait for all > (1 per debayerCpuThread object) invokeMethod calls to complete ? > > Should I add a ConditionVariable for this and have the run() > override function notify this ConditionVariable, or is their > already some mechanism for this which I'm missing ? That would work, but I think the design could be improved. In this series you introduce n-1 new threads, with the existing software ISP thread acting as the n'th thread. I think it would be nicer to have n identical instances of a class to handle the threads, with SoftwareIsp::process() handling the dispatching (with invokeMethod()). Completion would be signalled through signals, the same way it is done now with inputBufferReady and outputBufferReady. The SoftwareIsp::inputReady() and SoftwareIsp::outputReady() functions would be called once per thread, and would handle the required accounting to emit the SoftwareIsp completion signals only after all threads complete their jobs. I haven't investigated in details to see if there would be dragons lurking in dark corners, but my gut feeling is that a design where no thread is special would be cleaner.
Hi, On 20-Feb-26 5:09 PM, Laurent Pinchart wrote: > On Thu, Feb 19, 2026 at 07:20:41PM +0100, Hans de Goede wrote: >> Hi Laurent, >> >> On 19-Feb-26 18:22, Laurent Pinchart wrote: >> >> <snip> >> >>>>>> This is creating and then joining x threads *per frame*, right? I feel like that >>>>>> could be improved. Have you looked at having a set of long-running threads? (Even >>>>>> something very simple, e.g. just threads with a mutex and condvar, started in start() >>>>>> and stopped in stop()?) >>>> >>>> Ack, I agree that only creating N threads once and re-using them >>>> would be better. >>>> >>>> I started with the KISS approach of just firing of a new thread >>>> every frame for v1, because C++'s stdlib still lacks any kind >>>> of ThreadPool mechanism which is kinda sad in 2026. >>>> >>>>> Furthermore, this should use the Thread class instead of std::thread, >>>>> and it should subclass Thread to stored the per-thread data. >>>> >>>> I looked into using the libcamera Thread class for this, >>>> but it seems unsuitable for this purpose. AFAICT is designed >>>> to have methods called on an object which was moved to thread, >>>> which is not what is happening here. >>>> >>>> I did consider introducing a new DebayerCpuThread object which >>>> would then contain the per thread-data. This would require that >>>> class to have a copy of the minimum set of other vars (of which >>>> there are not much) needed for the actual inner-loop processing. >>>> >>>> Then I can move those objects to threads and use ConnectionTypeQueued >>>> calls for the actual process() call. >>>> >>>> But I do not see any way to wait for queued calls to finish >>>> on an Object type ? >>>> >>>> The whole Thread / Object / BoundMethod trio seems to be designed >>>> around message dispatching / async calling methods. But here >>>> I want a worker thread to repeatedly do the same thing. >>>> >>>> What I'm envisioning for this is using a std::thread with >>>> a main func like this: >>>> >>>> >>>> thread_main(int threadIdx) >>>> { >>>> while (1) { >>>> workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx)); >>>> >>>> if (stop_) >>>> break; >>>> >>>> process(); >>>> >>>> mutex_.lock() >>>> pendingWork &= ~(1 << threadIdx); >>>> mutex_.unlock() >>>> >>>> workDoneCondvar_.notify_all(); >>>> } >>>> } >>>> >>>> Which is something which the Thread class does not let me >>>> do, AFAICT the Thread class does not let me specify my >>>> own thread function, instead forcing use of the message >>>> mechanism. >>> >>> You don't have to use messages. By default the Thread class will >>> instantiate an event loop, but you can override the run() function to >>> provide your own thread loop. Have you considered that ? >> >> I did not consider that before. I've just looked into this a bit, >> I think I can make this work. >> >>> Note that if you have a fixed number of threads, each processing a >>> pre-defined part of the image, messages can still be useful as a >>> communication primitive. You can use invokeMethod() with queued >>> delivery, and benefit from the message queue to deliver the work to the >>> thread asynchronously. >> >> I do plan to have a fixed (configurable in config file) number >> of threads, like in this v1 patch. >> >> and being able to use invokeMethod does sound useful, > > Note that invokeMethod() requires the receiving object to be bound to a > thread that runs an event loop, so you won't be able to override run() > in that case (or, to be precise, you can override run() but you will > have to call exec() from your implementation, so you can't run your own > thread loop). > >> the question >> is how do I make the "main" process() method wait for all >> (1 per debayerCpuThread object) invokeMethod calls to complete ? >> >> Should I add a ConditionVariable for this and have the run() >> override function notify this ConditionVariable, or is their >> already some mechanism for this which I'm missing ? > > That would work, but I think the design could be improved. > > In this series you introduce n-1 new threads, with the existing software > ISP thread acting as the n'th thread. I think it would be nicer to have > n identical instances of a class to handle the threads, with > SoftwareIsp::process() handling the dispatching (with invokeMethod()). > Completion would be signalled through signals, the same way it is done > now with inputBufferReady and outputBufferReady. The > SoftwareIsp::inputReady() and SoftwareIsp::outputReady() functions would > be called once per thread, and would handle the required accounting to > emit the SoftwareIsp completion signals only after all threads complete > their jobs. > > I haven't investigated in details to see if there would be dragons > lurking in dark corners, but my gut feeling is that a design where no > thread is special would be cleaner. The signal approach is interesting, I'll look into that. Regards, Hans
Hi, On 20-Feb-26 7:51 PM, Hans de Goede wrote: > On 20-Feb-26 5:09 PM, Laurent Pinchart wrote: <snip> >> In this series you introduce n-1 new threads, with the existing software >> ISP thread acting as the n'th thread. I think it would be nicer to have >> n identical instances of a class to handle the threads, with >> SoftwareIsp::process() handling the dispatching (with invokeMethod()). >> Completion would be signalled through signals, the same way it is done >> now with inputBufferReady and outputBufferReady. The >> SoftwareIsp::inputReady() and SoftwareIsp::outputReady() functions would >> be called once per thread, and would handle the required accounting to >> emit the SoftwareIsp completion signals only after all threads complete >> their jobs. >> >> I haven't investigated in details to see if there would be dragons >> lurking in dark corners, but my gut feeling is that a design where no >> thread is special would be cleaner. > > The signal approach is interesting, I'll look into that. Ok, so this does not work that well, for the CPU ISP we need to create mappings for the in and out buffer atm we use local MappedFrameBuffer objects relying on the destructor unmapping the buffers when the desctructors run when the objects run out of context. An other problem is that the emitted signals when processing is done take the input and output Framebuffers passed to the process() function. So if we move the completion to a signal handler then we need to temporarily store those Framebuffer pointers as class variables. So all in all just waiting in the process() function for all threads to complete with the threads signalling a conditional variable works better from design pov. So i'm going to go with that for v2. Regards, Hans > > Regards, > > Hans >
On Fri, Feb 20, 2026 at 09:41:01PM +0100, Hans de Goede wrote: > Hi, > > On 20-Feb-26 7:51 PM, Hans de Goede wrote: > > On 20-Feb-26 5:09 PM, Laurent Pinchart wrote: > > <snip> > > >> In this series you introduce n-1 new threads, with the existing software > >> ISP thread acting as the n'th thread. I think it would be nicer to have > >> n identical instances of a class to handle the threads, with > >> SoftwareIsp::process() handling the dispatching (with invokeMethod()). > >> Completion would be signalled through signals, the same way it is done > >> now with inputBufferReady and outputBufferReady. The > >> SoftwareIsp::inputReady() and SoftwareIsp::outputReady() functions would > >> be called once per thread, and would handle the required accounting to > >> emit the SoftwareIsp completion signals only after all threads complete > >> their jobs. > >> > >> I haven't investigated in details to see if there would be dragons > >> lurking in dark corners, but my gut feeling is that a design where no > >> thread is special would be cleaner. > > > > The signal approach is interesting, I'll look into that. > > Ok, so this does not work that well, for the CPU ISP we need to > create mappings for the in and out buffer atm we use local > MappedFrameBuffer objects relying on the destructor unmapping > the buffers when the desctructors run when the objects run out > of context. > > > An other problem is that the emitted signals when processing > is done take the input and output Framebuffers passed to > the process() function. So if we move the completion > to a signal handler then we need to temporarily store those > Framebuffer pointers as class variables. > > So all in all just waiting in the process() function for all threads > to complete with the threads signalling a conditional variable > works better from design pov. So i'm going to go with that for v2. I still feel a symmetrical design would be nicer, but we can get there incrementally. Let's focus on progressing the patch series.
diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp index 5e168554..c4b6c5b8 100644 --- a/src/libcamera/software_isp/debayer_cpu.cpp +++ b/src/libcamera/software_isp/debayer_cpu.cpp @@ -14,6 +14,7 @@ #include <algorithm> #include <stdlib.h> #include <sys/ioctl.h> +#include <thread> #include <time.h> #include <utility> @@ -41,7 +42,7 @@ namespace libcamera { * \param[in] configuration The global configuration */ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration) - : Debayer(configuration), stats_(std::move(stats)), threadCount_(1) + : Debayer(configuration), stats_(std::move(stats)) { /* * Reading from uncached buffers may be very slow. @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat */ enableInputMemcpy_ = configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); + threadCount_ = + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3); + threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads); } DebayerCpu::~DebayerCpu() = default; @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers, threadData); - stats_->processLine0(frame, y, linePointers, &statsBuffer_); + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst, if (threadData->processLastLinesSeperately) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers, threadData); - stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_); + stats_->processLine0(frame, threadData->yEnd, linePointers, + threadData->statsBuffer); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers, threadData); - stats_->processLine0(frame, y, linePointers, &statsBuffer_); + stats_->processLine0(frame, y, linePointers, threadData->statsBuffer); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst, shiftLinePointers(linePointers, src); memcpyNextLine(linePointers, threadData); - stats_->processLine2(frame, y, linePointers, &statsBuffer_); + stats_->processLine2(frame, y, linePointers, threadData->statsBuffer); (this->*debayer2_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams ¶ms) void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms) { + std::unique_ptr<std::thread> threads[threadCount_ - 1]; + SwIspStats statsBuffer[threadCount_]; + unsigned int i; + bench_.startFrame(); std::vector<DmaSyncer> dmaSyncers; @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output return; } - stats_->startFrame(frame, &statsBuffer_, 1); + stats_->startFrame(frame, statsBuffer, threadCount_); - threadData_[0].yStart = 0; - threadData_[0].yEnd = window_.height; - (this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]); + unsigned int yStart = 0; + unsigned int linesPerThread = (window_.height / threadCount_) & + ~(inputConfig_.patternSize.width - 1); + for (i = 0; i < (threadCount_ - 1); i++) { + threadData_[i].yStart = yStart; + threadData_[i].yEnd = yStart + linesPerThread; + threadData_[i].statsBuffer = &statsBuffer[i]; + threads[i] = std::make_unique<std::thread>( + processInner_, this, frame, + in.planes()[0].data(), + out.planes()[0].data() + yStart * outputConfig_.stride, + &threadData_[i]); + yStart += linesPerThread; + } + threadData_[i].yStart = yStart; + threadData_[i].yEnd = window_.height; + threadData_[i].statsBuffer = &statsBuffer[i]; + (this->*processInner_)(frame, in.planes()[0].data(), + out.planes()[0].data() + yStart * outputConfig_.stride, + &threadData_[i]); + + for (i = 0; i < (threadCount_ - 1); i++) + threads[i]->join(); metadata.planes()[0].bytesused = out.planes()[0].size(); @@ -909,7 +938,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output * * \todo Pass real bufferId once stats buffer passing is changed. */ - stats_->finishFrame(frame, 0, &statsBuffer_, 1); + stats_->finishFrame(frame, 0, statsBuffer, threadCount_); outputBufferReady.emit(output); inputBufferReady.emit(input); } diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h index b85dd11c..63fa7710 100644 --- a/src/libcamera/software_isp/debayer_cpu.h +++ b/src/libcamera/software_isp/debayer_cpu.h @@ -85,6 +85,7 @@ private: unsigned int lineBufferIndex; /* Stored here to avoid causing register pressure in inner loop */ bool processLastLinesSeperately; + SwIspStats *statsBuffer; }; using processFn = void (DebayerCpu::*)(uint32_t frame, const uint8_t *src, uint8_t *dst, @@ -150,7 +151,6 @@ private: Rectangle window_; /* Variables used every line */ - SwIspStats statsBuffer_; debayerFn debayer0_; debayerFn debayer1_; debayerFn debayer2_;
Add CPU soft ISP multi-threading support. Benchmark results for the Uno-Q with a weak CPU which is good for performance testing, all numbers with an IMX219 running at 3280x2464 -> 3272x2464: 1 thread : 147ms / frame, ~6.5 fps 2 threads: 81ms / frame, ~12 fps 3 threads: 66ms / frame, ~14.5 fps Adding a 4th thread does not improve performance. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++----- src/libcamera/software_isp/debayer_cpu.h | 2 +- 2 files changed, 40 insertions(+), 11 deletions(-)