[5/5] software_isp: debayer_cpu: Add multi-threading support
diff mbox series

Message ID 20260216190204.106922-6-johannes.goede@oss.qualcomm.com
State Superseded
Headers show
Series
  • software_isp: debayer_cpu: Add multi-threading support
Related show

Commit Message

Hans de Goede Feb. 16, 2026, 7:02 p.m. UTC
Add CPU soft ISP multi-threading support.

Benchmark results for the Uno-Q with a weak CPU which is good for
performance testing, all numbers with an IMX219 running at
3280x2464 -> 3272x2464:

1 thread : 147ms / frame, ~6.5 fps
2 threads:  81ms / frame, ~12 fps
3 threads:  66ms / frame, ~14.5 fps

Adding a 4th thread does not improve performance.

Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
---
 src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++-----
 src/libcamera/software_isp/debayer_cpu.h   |  2 +-
 2 files changed, 40 insertions(+), 11 deletions(-)

Comments

Barnabás Pőcze Feb. 17, 2026, 9:07 a.m. UTC | #1
Hi

2026. 02. 16. 20:02 keltezéssel, Hans de Goede írta:
> Add CPU soft ISP multi-threading support.
> 
> Benchmark results for the Uno-Q with a weak CPU which is good for
> performance testing, all numbers with an IMX219 running at
> 3280x2464 -> 3272x2464:
> 
> 1 thread : 147ms / frame, ~6.5 fps
> 2 threads:  81ms / frame, ~12 fps
> 3 threads:  66ms / frame, ~14.5 fps
> 
> Adding a 4th thread does not improve performance.
> 
> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
> ---
>   src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++-----
>   src/libcamera/software_isp/debayer_cpu.h   |  2 +-
>   2 files changed, 40 insertions(+), 11 deletions(-)
> 
> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> index 5e168554..c4b6c5b8 100644
> --- a/src/libcamera/software_isp/debayer_cpu.cpp
> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> @@ -14,6 +14,7 @@
>   #include <algorithm>
>   #include <stdlib.h>
>   #include <sys/ioctl.h>
> +#include <thread>
>   #include <time.h>
>   #include <utility>
>   
> @@ -41,7 +42,7 @@ namespace libcamera {
>    * \param[in] configuration The global configuration
>    */
>   DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration)
> -	: Debayer(configuration), stats_(std::move(stats)), threadCount_(1)
> +	: Debayer(configuration), stats_(std::move(stats))
>   {
>   	/*
>   	 * Reading from uncached buffers may be very slow.
> @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
>   	 */
>   	enableInputMemcpy_ =
>   		configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
> +	threadCount_ =
> +		configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3);
> +	threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads);
>   }
>   
>   DebayerCpu::~DebayerCpu() = default;
> @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
>   	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) {
>   		shiftLinePointers(linePointers, src);
>   		memcpyNextLine(linePointers, threadData);
> -		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
> +		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
>   		(this->*debayer0_)(dst, linePointers);
>   		src += inputConfig_.stride;
>   		dst += outputConfig_.stride;
> @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
>   	if (threadData->processLastLinesSeperately) {
>   		shiftLinePointers(linePointers, src);
>   		memcpyNextLine(linePointers, threadData);
> -		stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_);
> +		stats_->processLine0(frame, threadData->yEnd, linePointers,
> +				     threadData->statsBuffer);
>   		(this->*debayer0_)(dst, linePointers);
>   		src += inputConfig_.stride;
>   		dst += outputConfig_.stride;
> @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
>   	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) {
>   		shiftLinePointers(linePointers, src);
>   		memcpyNextLine(linePointers, threadData);
> -		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
> +		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
>   		(this->*debayer0_)(dst, linePointers);
>   		src += inputConfig_.stride;
>   		dst += outputConfig_.stride;
> @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
>   
>   		shiftLinePointers(linePointers, src);
>   		memcpyNextLine(linePointers, threadData);
> -		stats_->processLine2(frame, y, linePointers, &statsBuffer_);
> +		stats_->processLine2(frame, y, linePointers, threadData->statsBuffer);
>   		(this->*debayer2_)(dst, linePointers);
>   		src += inputConfig_.stride;
>   		dst += outputConfig_.stride;
> @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams &params)
>   
>   void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams &params)
>   {
> +	std::unique_ptr<std::thread> threads[threadCount_ - 1];

No need for the unique_ptr here, just `std::array<std::thread, kMaxThread - 1>` should be sufficient.


> +	SwIspStats statsBuffer[threadCount_];
> +	unsigned int i;
> +
>   	bench_.startFrame();
>   
>   	std::vector<DmaSyncer> dmaSyncers;
> @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
>   		return;
>   	}
>   
> -	stats_->startFrame(frame, &statsBuffer_, 1);
> +	stats_->startFrame(frame, statsBuffer, threadCount_);
>   
> -	threadData_[0].yStart = 0;
> -	threadData_[0].yEnd = window_.height;
> -	(this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
> +	unsigned int yStart = 0;
> +	unsigned int linesPerThread = (window_.height / threadCount_) &
> +				      ~(inputConfig_.patternSize.width - 1);
> +	for (i = 0; i < (threadCount_ - 1); i++) {
> +		threadData_[i].yStart = yStart;
> +		threadData_[i].yEnd = yStart + linesPerThread;
> +		threadData_[i].statsBuffer = &statsBuffer[i];
> +		threads[i] = std::make_unique<std::thread>(
> +				processInner_, this, frame,
> +				in.planes()[0].data(),
> +				out.planes()[0].data() + yStart * outputConfig_.stride,
> +				&threadData_[i]);
> +		yStart += linesPerThread;
> +	}
> +	threadData_[i].yStart = yStart;
> +	threadData_[i].yEnd = window_.height;
> +	threadData_[i].statsBuffer = &statsBuffer[i];
> +	(this->*processInner_)(frame, in.planes()[0].data(),
> +			       out.planes()[0].data() + yStart * outputConfig_.stride,
> +			       &threadData_[i]);
> +
> +	for (i = 0; i < (threadCount_ - 1); i++)
> +		threads[i]->join();

This is creating and then joining x threads *per frame*, right? I feel like that
could be improved. Have you looked at having a set of long-running threads? (Even
something very simple, e.g. just threads with a mutex and condvar, started in start()
and stopped in stop()?)


>   
>   	metadata.planes()[0].bytesused = out.planes()[0].size();
>   
> @@ -909,7 +938,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
>   	 *
>   	 * \todo Pass real bufferId once stats buffer passing is changed.
>   	 */
> -	stats_->finishFrame(frame, 0, &statsBuffer_, 1);
> +	stats_->finishFrame(frame, 0, statsBuffer, threadCount_);
>   	outputBufferReady.emit(output);
>   	inputBufferReady.emit(input);
>   }
> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> index b85dd11c..63fa7710 100644
> --- a/src/libcamera/software_isp/debayer_cpu.h
> +++ b/src/libcamera/software_isp/debayer_cpu.h
> @@ -85,6 +85,7 @@ private:
>   		unsigned int lineBufferIndex;
>   		/* Stored here to avoid causing register pressure in inner loop */
>   		bool processLastLinesSeperately;
> +		SwIspStats *statsBuffer;
>   	};
>   
>   	using processFn = void (DebayerCpu::*)(uint32_t frame, const uint8_t *src, uint8_t *dst,
> @@ -150,7 +151,6 @@ private:
>   	Rectangle window_;
>   
>   	/* Variables used every line */
> -	SwIspStats statsBuffer_;
>   	debayerFn debayer0_;
>   	debayerFn debayer1_;
>   	debayerFn debayer2_;
Laurent Pinchart Feb. 19, 2026, 2:15 p.m. UTC | #2
On Tue, Feb 17, 2026 at 10:07:26AM +0100, Barnabás Pőcze wrote:
> Hi
> 
> 2026. 02. 16. 20:02 keltezéssel, Hans de Goede írta:
> > Add CPU soft ISP multi-threading support.
> > 
> > Benchmark results for the Uno-Q with a weak CPU which is good for
> > performance testing, all numbers with an IMX219 running at
> > 3280x2464 -> 3272x2464:
> > 
> > 1 thread : 147ms / frame, ~6.5 fps
> > 2 threads:  81ms / frame, ~12 fps
> > 3 threads:  66ms / frame, ~14.5 fps
> > 
> > Adding a 4th thread does not improve performance.
> > 
> > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
> > ---
> >   src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++-----
> >   src/libcamera/software_isp/debayer_cpu.h   |  2 +-
> >   2 files changed, 40 insertions(+), 11 deletions(-)
> > 
> > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> > index 5e168554..c4b6c5b8 100644
> > --- a/src/libcamera/software_isp/debayer_cpu.cpp
> > +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> > @@ -14,6 +14,7 @@
> >   #include <algorithm>
> >   #include <stdlib.h>
> >   #include <sys/ioctl.h>
> > +#include <thread>
> >   #include <time.h>
> >   #include <utility>
> >   
> > @@ -41,7 +42,7 @@ namespace libcamera {
> >    * \param[in] configuration The global configuration
> >    */
> >   DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration)
> > -	: Debayer(configuration), stats_(std::move(stats)), threadCount_(1)
> > +	: Debayer(configuration), stats_(std::move(stats))
> >   {
> >   	/*
> >   	 * Reading from uncached buffers may be very slow.
> > @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
> >   	 */
> >   	enableInputMemcpy_ =
> >   		configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
> > +	threadCount_ =
> > +		configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3);
> > +	threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads);
> >   }
> >   
> >   DebayerCpu::~DebayerCpu() = default;
> > @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
> >   	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) {
> >   		shiftLinePointers(linePointers, src);
> >   		memcpyNextLine(linePointers, threadData);
> > -		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
> > +		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
> >   		(this->*debayer0_)(dst, linePointers);
> >   		src += inputConfig_.stride;
> >   		dst += outputConfig_.stride;
> > @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
> >   	if (threadData->processLastLinesSeperately) {
> >   		shiftLinePointers(linePointers, src);
> >   		memcpyNextLine(linePointers, threadData);
> > -		stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_);
> > +		stats_->processLine0(frame, threadData->yEnd, linePointers,
> > +				     threadData->statsBuffer);
> >   		(this->*debayer0_)(dst, linePointers);
> >   		src += inputConfig_.stride;
> >   		dst += outputConfig_.stride;
> > @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
> >   	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) {
> >   		shiftLinePointers(linePointers, src);
> >   		memcpyNextLine(linePointers, threadData);
> > -		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
> > +		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
> >   		(this->*debayer0_)(dst, linePointers);
> >   		src += inputConfig_.stride;
> >   		dst += outputConfig_.stride;
> > @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
> >   
> >   		shiftLinePointers(linePointers, src);
> >   		memcpyNextLine(linePointers, threadData);
> > -		stats_->processLine2(frame, y, linePointers, &statsBuffer_);
> > +		stats_->processLine2(frame, y, linePointers, threadData->statsBuffer);
> >   		(this->*debayer2_)(dst, linePointers);
> >   		src += inputConfig_.stride;
> >   		dst += outputConfig_.stride;
> > @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams &params)
> >   
> >   void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams &params)
> >   {
> > +	std::unique_ptr<std::thread> threads[threadCount_ - 1];
> 
> No need for the unique_ptr here, just `std::array<std::thread, kMaxThread - 1>` should be sufficient.
> 
> 
> > +	SwIspStats statsBuffer[threadCount_];
> > +	unsigned int i;
> > +
> >   	bench_.startFrame();
> >   
> >   	std::vector<DmaSyncer> dmaSyncers;
> > @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
> >   		return;
> >   	}
> >   
> > -	stats_->startFrame(frame, &statsBuffer_, 1);
> > +	stats_->startFrame(frame, statsBuffer, threadCount_);
> >   
> > -	threadData_[0].yStart = 0;
> > -	threadData_[0].yEnd = window_.height;
> > -	(this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
> > +	unsigned int yStart = 0;
> > +	unsigned int linesPerThread = (window_.height / threadCount_) &
> > +				      ~(inputConfig_.patternSize.width - 1);
> > +	for (i = 0; i < (threadCount_ - 1); i++) {
> > +		threadData_[i].yStart = yStart;
> > +		threadData_[i].yEnd = yStart + linesPerThread;
> > +		threadData_[i].statsBuffer = &statsBuffer[i];
> > +		threads[i] = std::make_unique<std::thread>(
> > +				processInner_, this, frame,
> > +				in.planes()[0].data(),
> > +				out.planes()[0].data() + yStart * outputConfig_.stride,
> > +				&threadData_[i]);
> > +		yStart += linesPerThread;
> > +	}
> > +	threadData_[i].yStart = yStart;
> > +	threadData_[i].yEnd = window_.height;
> > +	threadData_[i].statsBuffer = &statsBuffer[i];
> > +	(this->*processInner_)(frame, in.planes()[0].data(),
> > +			       out.planes()[0].data() + yStart * outputConfig_.stride,
> > +			       &threadData_[i]);
> > +
> > +	for (i = 0; i < (threadCount_ - 1); i++)
> > +		threads[i]->join();
> 
> This is creating and then joining x threads *per frame*, right? I feel like that
> could be improved. Have you looked at having a set of long-running threads? (Even
> something very simple, e.g. just threads with a mutex and condvar, started in start()
> and stopped in stop()?)

Furthermore, this should use the Thread class instead of std::thread,
and it should subclass Thread to stored the per-thread data.

> >   
> >   	metadata.planes()[0].bytesused = out.planes()[0].size();
> >   
> > @@ -909,7 +938,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
> >   	 *
> >   	 * \todo Pass real bufferId once stats buffer passing is changed.
> >   	 */
> > -	stats_->finishFrame(frame, 0, &statsBuffer_, 1);
> > +	stats_->finishFrame(frame, 0, statsBuffer, threadCount_);
> >   	outputBufferReady.emit(output);
> >   	inputBufferReady.emit(input);
> >   }
> > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> > index b85dd11c..63fa7710 100644
> > --- a/src/libcamera/software_isp/debayer_cpu.h
> > +++ b/src/libcamera/software_isp/debayer_cpu.h
> > @@ -85,6 +85,7 @@ private:
> >   		unsigned int lineBufferIndex;
> >   		/* Stored here to avoid causing register pressure in inner loop */
> >   		bool processLastLinesSeperately;
> > +		SwIspStats *statsBuffer;
> >   	};
> >   
> >   	using processFn = void (DebayerCpu::*)(uint32_t frame, const uint8_t *src, uint8_t *dst,
> > @@ -150,7 +151,6 @@ private:
> >   	Rectangle window_;
> >   
> >   	/* Variables used every line */
> > -	SwIspStats statsBuffer_;
> >   	debayerFn debayer0_;
> >   	debayerFn debayer1_;
> >   	debayerFn debayer2_;
Hans de Goede Feb. 19, 2026, 5:07 p.m. UTC | #3
Hi Laurent,

On 19-Feb-26 15:15, Laurent Pinchart wrote:
> On Tue, Feb 17, 2026 at 10:07:26AM +0100, Barnabás Pőcze wrote:
>> Hi
>>
>> 2026. 02. 16. 20:02 keltezéssel, Hans de Goede írta:
>>> Add CPU soft ISP multi-threading support.
>>>
>>> Benchmark results for the Uno-Q with a weak CPU which is good for
>>> performance testing, all numbers with an IMX219 running at
>>> 3280x2464 -> 3272x2464:
>>>
>>> 1 thread : 147ms / frame, ~6.5 fps
>>> 2 threads:  81ms / frame, ~12 fps
>>> 3 threads:  66ms / frame, ~14.5 fps
>>>
>>> Adding a 4th thread does not improve performance.
>>>
>>> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
>>> ---
>>>   src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++-----
>>>   src/libcamera/software_isp/debayer_cpu.h   |  2 +-
>>>   2 files changed, 40 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
>>> index 5e168554..c4b6c5b8 100644
>>> --- a/src/libcamera/software_isp/debayer_cpu.cpp
>>> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
>>> @@ -14,6 +14,7 @@
>>>   #include <algorithm>
>>>   #include <stdlib.h>
>>>   #include <sys/ioctl.h>
>>> +#include <thread>
>>>   #include <time.h>
>>>   #include <utility>
>>>   
>>> @@ -41,7 +42,7 @@ namespace libcamera {
>>>    * \param[in] configuration The global configuration
>>>    */
>>>   DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration)
>>> -	: Debayer(configuration), stats_(std::move(stats)), threadCount_(1)
>>> +	: Debayer(configuration), stats_(std::move(stats))
>>>   {
>>>   	/*
>>>   	 * Reading from uncached buffers may be very slow.
>>> @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
>>>   	 */
>>>   	enableInputMemcpy_ =
>>>   		configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
>>> +	threadCount_ =
>>> +		configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3);
>>> +	threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads);
>>>   }
>>>   
>>>   DebayerCpu::~DebayerCpu() = default;
>>> @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
>>>   	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) {
>>>   		shiftLinePointers(linePointers, src);
>>>   		memcpyNextLine(linePointers, threadData);
>>> -		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
>>> +		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
>>>   		(this->*debayer0_)(dst, linePointers);
>>>   		src += inputConfig_.stride;
>>>   		dst += outputConfig_.stride;
>>> @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
>>>   	if (threadData->processLastLinesSeperately) {
>>>   		shiftLinePointers(linePointers, src);
>>>   		memcpyNextLine(linePointers, threadData);
>>> -		stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_);
>>> +		stats_->processLine0(frame, threadData->yEnd, linePointers,
>>> +				     threadData->statsBuffer);
>>>   		(this->*debayer0_)(dst, linePointers);
>>>   		src += inputConfig_.stride;
>>>   		dst += outputConfig_.stride;
>>> @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
>>>   	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) {
>>>   		shiftLinePointers(linePointers, src);
>>>   		memcpyNextLine(linePointers, threadData);
>>> -		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
>>> +		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
>>>   		(this->*debayer0_)(dst, linePointers);
>>>   		src += inputConfig_.stride;
>>>   		dst += outputConfig_.stride;
>>> @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
>>>   
>>>   		shiftLinePointers(linePointers, src);
>>>   		memcpyNextLine(linePointers, threadData);
>>> -		stats_->processLine2(frame, y, linePointers, &statsBuffer_);
>>> +		stats_->processLine2(frame, y, linePointers, threadData->statsBuffer);
>>>   		(this->*debayer2_)(dst, linePointers);
>>>   		src += inputConfig_.stride;
>>>   		dst += outputConfig_.stride;
>>> @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams &params)
>>>   
>>>   void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams &params)
>>>   {
>>> +	std::unique_ptr<std::thread> threads[threadCount_ - 1];
>>
>> No need for the unique_ptr here, just `std::array<std::thread, kMaxThread - 1>` should be sufficient.
>>
>>
>>> +	SwIspStats statsBuffer[threadCount_];
>>> +	unsigned int i;
>>> +
>>>   	bench_.startFrame();
>>>   
>>>   	std::vector<DmaSyncer> dmaSyncers;
>>> @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
>>>   		return;
>>>   	}
>>>   
>>> -	stats_->startFrame(frame, &statsBuffer_, 1);
>>> +	stats_->startFrame(frame, statsBuffer, threadCount_);
>>>   
>>> -	threadData_[0].yStart = 0;
>>> -	threadData_[0].yEnd = window_.height;
>>> -	(this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
>>> +	unsigned int yStart = 0;
>>> +	unsigned int linesPerThread = (window_.height / threadCount_) &
>>> +				      ~(inputConfig_.patternSize.width - 1);
>>> +	for (i = 0; i < (threadCount_ - 1); i++) {
>>> +		threadData_[i].yStart = yStart;
>>> +		threadData_[i].yEnd = yStart + linesPerThread;
>>> +		threadData_[i].statsBuffer = &statsBuffer[i];
>>> +		threads[i] = std::make_unique<std::thread>(
>>> +				processInner_, this, frame,
>>> +				in.planes()[0].data(),
>>> +				out.planes()[0].data() + yStart * outputConfig_.stride,
>>> +				&threadData_[i]);
>>> +		yStart += linesPerThread;
>>> +	}
>>> +	threadData_[i].yStart = yStart;
>>> +	threadData_[i].yEnd = window_.height;
>>> +	threadData_[i].statsBuffer = &statsBuffer[i];
>>> +	(this->*processInner_)(frame, in.planes()[0].data(),
>>> +			       out.planes()[0].data() + yStart * outputConfig_.stride,
>>> +			       &threadData_[i]);
>>> +
>>> +	for (i = 0; i < (threadCount_ - 1); i++)
>>> +		threads[i]->join();
>>
>> This is creating and then joining x threads *per frame*, right? I feel like that
>> could be improved. Have you looked at having a set of long-running threads? (Even
>> something very simple, e.g. just threads with a mutex and condvar, started in start()
>> and stopped in stop()?)

Ack, I agree that only creating N threads once and re-using them
would be better.

I started with the KISS approach of just firing of a new thread
every frame for v1, because C++'s stdlib still lacks any kind
of ThreadPool mechanism which is kinda sad in 2026.

> Furthermore, this should use the Thread class instead of std::thread,
> and it should subclass Thread to stored the per-thread data.

I looked into using the libcamera Thread class for this,
but it seems unsuitable for this purpose. AFAICT is designed
to have methods called on an object which was moved to thread,
which is not what is happening here.

I did consider introducing a new DebayerCpuThread object which
would then contain the per thread-data. This would require that
class to have a copy of the minimum set of other vars (of which
there are not much) needed for the actual inner-loop processing.

Then I can move those objects to threads and use ConnectionTypeQueued
calls for the actual process() call.

But I do not see any way to wait for queued calls to finish
on an Object type ?

The whole Thread / Object / BoundMethod trio seems to be designed
around message dispatching / async calling methods. But here
I want a worker thread to repeatedly do the same thing.

What I'm envisioning for this is using a std::thread with
a main func like this:


thread_main(int threadIdx)
{
	while (1) {
		workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx));

		if (stop_)
			break;

		process();
		
		mutex_.lock()
		pendingWork &= ~(1 << threadIdx);
		mutex_.unlock()

		workDoneCondvar_.notify_all();
	}	
}

Which is something which the Thread class does not let me
do, AFAICT the Thread class does not let me specify my
own thread function, instead forcing use of the message
mechanism.

So I think sticking with std::thread is better in this case.

Again please advise how you want to proceed. I was about to
rework things to something like the above based on Barnabás'
comments. But I'll hold of on doing a v2 until we've some
agreement on how to handle the threading otherwise I'll just
be wasting my time.

Regards,

Hans
Laurent Pinchart Feb. 19, 2026, 5:22 p.m. UTC | #4
On Thu, Feb 19, 2026 at 06:07:28PM +0100, Hans de Goede wrote:
> On 19-Feb-26 15:15, Laurent Pinchart wrote:
> > On Tue, Feb 17, 2026 at 10:07:26AM +0100, Barnabás Pőcze wrote:
> >> 2026. 02. 16. 20:02 keltezéssel, Hans de Goede írta:
> >>> Add CPU soft ISP multi-threading support.
> >>>
> >>> Benchmark results for the Uno-Q with a weak CPU which is good for
> >>> performance testing, all numbers with an IMX219 running at
> >>> 3280x2464 -> 3272x2464:
> >>>
> >>> 1 thread : 147ms / frame, ~6.5 fps
> >>> 2 threads:  81ms / frame, ~12 fps
> >>> 3 threads:  66ms / frame, ~14.5 fps
> >>>
> >>> Adding a 4th thread does not improve performance.
> >>>
> >>> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
> >>> ---
> >>>   src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++-----
> >>>   src/libcamera/software_isp/debayer_cpu.h   |  2 +-
> >>>   2 files changed, 40 insertions(+), 11 deletions(-)
> >>>
> >>> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> >>> index 5e168554..c4b6c5b8 100644
> >>> --- a/src/libcamera/software_isp/debayer_cpu.cpp
> >>> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> >>> @@ -14,6 +14,7 @@
> >>>   #include <algorithm>
> >>>   #include <stdlib.h>
> >>>   #include <sys/ioctl.h>
> >>> +#include <thread>
> >>>   #include <time.h>
> >>>   #include <utility>
> >>>   
> >>> @@ -41,7 +42,7 @@ namespace libcamera {
> >>>    * \param[in] configuration The global configuration
> >>>    */
> >>>   DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration)
> >>> -	: Debayer(configuration), stats_(std::move(stats)), threadCount_(1)
> >>> +	: Debayer(configuration), stats_(std::move(stats))
> >>>   {
> >>>   	/*
> >>>   	 * Reading from uncached buffers may be very slow.
> >>> @@ -56,6 +57,9 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
> >>>   	 */
> >>>   	enableInputMemcpy_ =
> >>>   		configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
> >>> +	threadCount_ =
> >>> +		configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3);
> >>> +	threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads);
> >>>   }
> >>>   
> >>>   DebayerCpu::~DebayerCpu() = default;
> >>> @@ -692,7 +696,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
> >>>   	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) {
> >>>   		shiftLinePointers(linePointers, src);
> >>>   		memcpyNextLine(linePointers, threadData);
> >>> -		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
> >>> +		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
> >>>   		(this->*debayer0_)(dst, linePointers);
> >>>   		src += inputConfig_.stride;
> >>>   		dst += outputConfig_.stride;
> >>> @@ -707,7 +711,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
> >>>   	if (threadData->processLastLinesSeperately) {
> >>>   		shiftLinePointers(linePointers, src);
> >>>   		memcpyNextLine(linePointers, threadData);
> >>> -		stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_);
> >>> +		stats_->processLine0(frame, threadData->yEnd, linePointers,
> >>> +				     threadData->statsBuffer);
> >>>   		(this->*debayer0_)(dst, linePointers);
> >>>   		src += inputConfig_.stride;
> >>>   		dst += outputConfig_.stride;
> >>> @@ -749,7 +754,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
> >>>   	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) {
> >>>   		shiftLinePointers(linePointers, src);
> >>>   		memcpyNextLine(linePointers, threadData);
> >>> -		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
> >>> +		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
> >>>   		(this->*debayer0_)(dst, linePointers);
> >>>   		src += inputConfig_.stride;
> >>>   		dst += outputConfig_.stride;
> >>> @@ -762,7 +767,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
> >>>   
> >>>   		shiftLinePointers(linePointers, src);
> >>>   		memcpyNextLine(linePointers, threadData);
> >>> -		stats_->processLine2(frame, y, linePointers, &statsBuffer_);
> >>> +		stats_->processLine2(frame, y, linePointers, threadData->statsBuffer);
> >>>   		(this->*debayer2_)(dst, linePointers);
> >>>   		src += inputConfig_.stride;
> >>>   		dst += outputConfig_.stride;
> >>> @@ -869,6 +874,10 @@ void DebayerCpu::updateLookupTables(const DebayerParams &params)
> >>>   
> >>>   void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams &params)
> >>>   {
> >>> +	std::unique_ptr<std::thread> threads[threadCount_ - 1];
> >>
> >> No need for the unique_ptr here, just `std::array<std::thread, kMaxThread - 1>` should be sufficient.
> >>
> >>
> >>> +	SwIspStats statsBuffer[threadCount_];
> >>> +	unsigned int i;
> >>> +
> >>>   	bench_.startFrame();
> >>>   
> >>>   	std::vector<DmaSyncer> dmaSyncers;
> >>> @@ -891,11 +900,31 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
> >>>   		return;
> >>>   	}
> >>>   
> >>> -	stats_->startFrame(frame, &statsBuffer_, 1);
> >>> +	stats_->startFrame(frame, statsBuffer, threadCount_);
> >>>   
> >>> -	threadData_[0].yStart = 0;
> >>> -	threadData_[0].yEnd = window_.height;
> >>> -	(this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
> >>> +	unsigned int yStart = 0;
> >>> +	unsigned int linesPerThread = (window_.height / threadCount_) &
> >>> +				      ~(inputConfig_.patternSize.width - 1);
> >>> +	for (i = 0; i < (threadCount_ - 1); i++) {
> >>> +		threadData_[i].yStart = yStart;
> >>> +		threadData_[i].yEnd = yStart + linesPerThread;
> >>> +		threadData_[i].statsBuffer = &statsBuffer[i];
> >>> +		threads[i] = std::make_unique<std::thread>(
> >>> +				processInner_, this, frame,
> >>> +				in.planes()[0].data(),
> >>> +				out.planes()[0].data() + yStart * outputConfig_.stride,
> >>> +				&threadData_[i]);
> >>> +		yStart += linesPerThread;
> >>> +	}
> >>> +	threadData_[i].yStart = yStart;
> >>> +	threadData_[i].yEnd = window_.height;
> >>> +	threadData_[i].statsBuffer = &statsBuffer[i];
> >>> +	(this->*processInner_)(frame, in.planes()[0].data(),
> >>> +			       out.planes()[0].data() + yStart * outputConfig_.stride,
> >>> +			       &threadData_[i]);
> >>> +
> >>> +	for (i = 0; i < (threadCount_ - 1); i++)
> >>> +		threads[i]->join();
> >>
> >> This is creating and then joining x threads *per frame*, right? I feel like that
> >> could be improved. Have you looked at having a set of long-running threads? (Even
> >> something very simple, e.g. just threads with a mutex and condvar, started in start()
> >> and stopped in stop()?)
> 
> Ack, I agree that only creating N threads once and re-using them
> would be better.
> 
> I started with the KISS approach of just firing of a new thread
> every frame for v1, because C++'s stdlib still lacks any kind
> of ThreadPool mechanism which is kinda sad in 2026.
> 
> > Furthermore, this should use the Thread class instead of std::thread,
> > and it should subclass Thread to stored the per-thread data.
> 
> I looked into using the libcamera Thread class for this,
> but it seems unsuitable for this purpose. AFAICT is designed
> to have methods called on an object which was moved to thread,
> which is not what is happening here.
> 
> I did consider introducing a new DebayerCpuThread object which
> would then contain the per thread-data. This would require that
> class to have a copy of the minimum set of other vars (of which
> there are not much) needed for the actual inner-loop processing.
> 
> Then I can move those objects to threads and use ConnectionTypeQueued
> calls for the actual process() call.
> 
> But I do not see any way to wait for queued calls to finish
> on an Object type ?
> 
> The whole Thread / Object / BoundMethod trio seems to be designed
> around message dispatching / async calling methods. But here
> I want a worker thread to repeatedly do the same thing.
> 
> What I'm envisioning for this is using a std::thread with
> a main func like this:
> 
> 
> thread_main(int threadIdx)
> {
> 	while (1) {
> 		workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx));
> 
> 		if (stop_)
> 			break;
> 
> 		process();
> 		
> 		mutex_.lock()
> 		pendingWork &= ~(1 << threadIdx);
> 		mutex_.unlock()
> 
> 		workDoneCondvar_.notify_all();
> 	}	
> }
> 
> Which is something which the Thread class does not let me
> do, AFAICT the Thread class does not let me specify my
> own thread function, instead forcing use of the message
> mechanism.

You don't have to use messages. By default the Thread class will
instantiate an event loop, but you can override the run() function to
provide your own thread loop. Have you considered that ?

Note that if you have a fixed number of threads, each processing a
pre-defined part of the image, messages can still be useful as a
communication primitive. You can use invokeMethod() with queued
delivery, and benefit from the message queue to deliver the work to the
thread asynchronously. If you want to implement a thread pool where each
thread would automatically pick the next work item when it goes back to
its idle loop then you will need your own run() function.

> So I think sticking with std::thread is better in this case.
> 
> Again please advise how you want to proceed. I was about to
> rework things to something like the above based on Barnabás'
> comments. But I'll hold of on doing a v2 until we've some
> agreement on how to handle the threading otherwise I'll just
> be wasting my time.
Hans de Goede Feb. 19, 2026, 6:20 p.m. UTC | #5
Hi Laurent,

On 19-Feb-26 18:22, Laurent Pinchart wrote:

<snip>

>>>> This is creating and then joining x threads *per frame*, right? I feel like that
>>>> could be improved. Have you looked at having a set of long-running threads? (Even
>>>> something very simple, e.g. just threads with a mutex and condvar, started in start()
>>>> and stopped in stop()?)
>>
>> Ack, I agree that only creating N threads once and re-using them
>> would be better.
>>
>> I started with the KISS approach of just firing of a new thread
>> every frame for v1, because C++'s stdlib still lacks any kind
>> of ThreadPool mechanism which is kinda sad in 2026.
>>
>>> Furthermore, this should use the Thread class instead of std::thread,
>>> and it should subclass Thread to stored the per-thread data.
>>
>> I looked into using the libcamera Thread class for this,
>> but it seems unsuitable for this purpose. AFAICT is designed
>> to have methods called on an object which was moved to thread,
>> which is not what is happening here.
>>
>> I did consider introducing a new DebayerCpuThread object which
>> would then contain the per thread-data. This would require that
>> class to have a copy of the minimum set of other vars (of which
>> there are not much) needed for the actual inner-loop processing.
>>
>> Then I can move those objects to threads and use ConnectionTypeQueued
>> calls for the actual process() call.
>>
>> But I do not see any way to wait for queued calls to finish
>> on an Object type ?
>>
>> The whole Thread / Object / BoundMethod trio seems to be designed
>> around message dispatching / async calling methods. But here
>> I want a worker thread to repeatedly do the same thing.
>>
>> What I'm envisioning for this is using a std::thread with
>> a main func like this:
>>
>>
>> thread_main(int threadIdx)
>> {
>> 	while (1) {
>> 		workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx));
>>
>> 		if (stop_)
>> 			break;
>>
>> 		process();
>> 		
>> 		mutex_.lock()
>> 		pendingWork &= ~(1 << threadIdx);
>> 		mutex_.unlock()
>>
>> 		workDoneCondvar_.notify_all();
>> 	}	
>> }
>>
>> Which is something which the Thread class does not let me
>> do, AFAICT the Thread class does not let me specify my
>> own thread function, instead forcing use of the message
>> mechanism.
> 
> You don't have to use messages. By default the Thread class will
> instantiate an event loop, but you can override the run() function to
> provide your own thread loop. Have you considered that ?

I did not consider that before. I've just looked into this a bit,
I think I can make this work.

> Note that if you have a fixed number of threads, each processing a
> pre-defined part of the image, messages can still be useful as a
> communication primitive. You can use invokeMethod() with queued
> delivery, and benefit from the message queue to deliver the work to the
> thread asynchronously.

I do plan to have a fixed (configurable in config file) number
of threads, like in this v1 patch.

and being able to use invokeMethod does sound useful, the question
is how do I make the "main" process() method wait for all
(1 per debayerCpuThread object) invokeMethod calls to complete ?

Should I add a ConditionVariable for this and have the run()
override function notify this ConditionVariable, or is their
already some mechanism for this which I'm missing ?

Regards,

Hans
Laurent Pinchart Feb. 20, 2026, 4:09 p.m. UTC | #6
On Thu, Feb 19, 2026 at 07:20:41PM +0100, Hans de Goede wrote:
> Hi Laurent,
> 
> On 19-Feb-26 18:22, Laurent Pinchart wrote:
> 
> <snip>
> 
> >>>> This is creating and then joining x threads *per frame*, right? I feel like that
> >>>> could be improved. Have you looked at having a set of long-running threads? (Even
> >>>> something very simple, e.g. just threads with a mutex and condvar, started in start()
> >>>> and stopped in stop()?)
> >>
> >> Ack, I agree that only creating N threads once and re-using them
> >> would be better.
> >>
> >> I started with the KISS approach of just firing of a new thread
> >> every frame for v1, because C++'s stdlib still lacks any kind
> >> of ThreadPool mechanism which is kinda sad in 2026.
> >>
> >>> Furthermore, this should use the Thread class instead of std::thread,
> >>> and it should subclass Thread to stored the per-thread data.
> >>
> >> I looked into using the libcamera Thread class for this,
> >> but it seems unsuitable for this purpose. AFAICT is designed
> >> to have methods called on an object which was moved to thread,
> >> which is not what is happening here.
> >>
> >> I did consider introducing a new DebayerCpuThread object which
> >> would then contain the per thread-data. This would require that
> >> class to have a copy of the minimum set of other vars (of which
> >> there are not much) needed for the actual inner-loop processing.
> >>
> >> Then I can move those objects to threads and use ConnectionTypeQueued
> >> calls for the actual process() call.
> >>
> >> But I do not see any way to wait for queued calls to finish
> >> on an Object type ?
> >>
> >> The whole Thread / Object / BoundMethod trio seems to be designed
> >> around message dispatching / async calling methods. But here
> >> I want a worker thread to repeatedly do the same thing.
> >>
> >> What I'm envisioning for this is using a std::thread with
> >> a main func like this:
> >>
> >>
> >> thread_main(int threadIdx)
> >> {
> >> 	while (1) {
> >> 		workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx));
> >>
> >> 		if (stop_)
> >> 			break;
> >>
> >> 		process();
> >> 		
> >> 		mutex_.lock()
> >> 		pendingWork &= ~(1 << threadIdx);
> >> 		mutex_.unlock()
> >>
> >> 		workDoneCondvar_.notify_all();
> >> 	}	
> >> }
> >>
> >> Which is something which the Thread class does not let me
> >> do, AFAICT the Thread class does not let me specify my
> >> own thread function, instead forcing use of the message
> >> mechanism.
> > 
> > You don't have to use messages. By default the Thread class will
> > instantiate an event loop, but you can override the run() function to
> > provide your own thread loop. Have you considered that ?
> 
> I did not consider that before. I've just looked into this a bit,
> I think I can make this work.
> 
> > Note that if you have a fixed number of threads, each processing a
> > pre-defined part of the image, messages can still be useful as a
> > communication primitive. You can use invokeMethod() with queued
> > delivery, and benefit from the message queue to deliver the work to the
> > thread asynchronously.
> 
> I do plan to have a fixed (configurable in config file) number
> of threads, like in this v1 patch.
> 
> and being able to use invokeMethod does sound useful,

Note that invokeMethod() requires the receiving object to be bound to a
thread that runs an event loop, so you won't be able to override run()
in that case (or, to be precise, you can override run() but you will
have to call exec() from your implementation, so you can't run your own
thread loop).

> the question
> is how do I make the "main" process() method wait for all
> (1 per debayerCpuThread object) invokeMethod calls to complete ?
> 
> Should I add a ConditionVariable for this and have the run()
> override function notify this ConditionVariable, or is their
> already some mechanism for this which I'm missing ?

That would work, but I think the design could be improved.

In this series you introduce n-1 new threads, with the existing software
ISP thread acting as the n'th thread. I think it would be nicer to have
n identical instances of a class to handle the threads, with
SoftwareIsp::process() handling the dispatching (with invokeMethod()).
Completion would be signalled through signals, the same way it is done
now with inputBufferReady and outputBufferReady. The
SoftwareIsp::inputReady() and SoftwareIsp::outputReady() functions would
be called once per thread, and would handle the required accounting to
emit the SoftwareIsp completion signals only after all threads complete
their jobs.

I haven't investigated in details to see if there would be dragons
lurking in dark corners, but my gut feeling is that a design where no
thread is special would be cleaner.
Hans de Goede Feb. 20, 2026, 6:51 p.m. UTC | #7
Hi,

On 20-Feb-26 5:09 PM, Laurent Pinchart wrote:
> On Thu, Feb 19, 2026 at 07:20:41PM +0100, Hans de Goede wrote:
>> Hi Laurent,
>>
>> On 19-Feb-26 18:22, Laurent Pinchart wrote:
>>
>> <snip>
>>
>>>>>> This is creating and then joining x threads *per frame*, right? I feel like that
>>>>>> could be improved. Have you looked at having a set of long-running threads? (Even
>>>>>> something very simple, e.g. just threads with a mutex and condvar, started in start()
>>>>>> and stopped in stop()?)
>>>>
>>>> Ack, I agree that only creating N threads once and re-using them
>>>> would be better.
>>>>
>>>> I started with the KISS approach of just firing of a new thread
>>>> every frame for v1, because C++'s stdlib still lacks any kind
>>>> of ThreadPool mechanism which is kinda sad in 2026.
>>>>
>>>>> Furthermore, this should use the Thread class instead of std::thread,
>>>>> and it should subclass Thread to stored the per-thread data.
>>>>
>>>> I looked into using the libcamera Thread class for this,
>>>> but it seems unsuitable for this purpose. AFAICT is designed
>>>> to have methods called on an object which was moved to thread,
>>>> which is not what is happening here.
>>>>
>>>> I did consider introducing a new DebayerCpuThread object which
>>>> would then contain the per thread-data. This would require that
>>>> class to have a copy of the minimum set of other vars (of which
>>>> there are not much) needed for the actual inner-loop processing.
>>>>
>>>> Then I can move those objects to threads and use ConnectionTypeQueued
>>>> calls for the actual process() call.
>>>>
>>>> But I do not see any way to wait for queued calls to finish
>>>> on an Object type ?
>>>>
>>>> The whole Thread / Object / BoundMethod trio seems to be designed
>>>> around message dispatching / async calling methods. But here
>>>> I want a worker thread to repeatedly do the same thing.
>>>>
>>>> What I'm envisioning for this is using a std::thread with
>>>> a main func like this:
>>>>
>>>>
>>>> thread_main(int threadIdx)
>>>> {
>>>> 	while (1) {
>>>> 		workPendingCondvar_.wait(mutex_, workPendingOrStop(threadIdx));
>>>>
>>>> 		if (stop_)
>>>> 			break;
>>>>
>>>> 		process();
>>>> 		
>>>> 		mutex_.lock()
>>>> 		pendingWork &= ~(1 << threadIdx);
>>>> 		mutex_.unlock()
>>>>
>>>> 		workDoneCondvar_.notify_all();
>>>> 	}	
>>>> }
>>>>
>>>> Which is something which the Thread class does not let me
>>>> do, AFAICT the Thread class does not let me specify my
>>>> own thread function, instead forcing use of the message
>>>> mechanism.
>>>
>>> You don't have to use messages. By default the Thread class will
>>> instantiate an event loop, but you can override the run() function to
>>> provide your own thread loop. Have you considered that ?
>>
>> I did not consider that before. I've just looked into this a bit,
>> I think I can make this work.
>>
>>> Note that if you have a fixed number of threads, each processing a
>>> pre-defined part of the image, messages can still be useful as a
>>> communication primitive. You can use invokeMethod() with queued
>>> delivery, and benefit from the message queue to deliver the work to the
>>> thread asynchronously.
>>
>> I do plan to have a fixed (configurable in config file) number
>> of threads, like in this v1 patch.
>>
>> and being able to use invokeMethod does sound useful,
> 
> Note that invokeMethod() requires the receiving object to be bound to a
> thread that runs an event loop, so you won't be able to override run()
> in that case (or, to be precise, you can override run() but you will
> have to call exec() from your implementation, so you can't run your own
> thread loop).
> 
>> the question
>> is how do I make the "main" process() method wait for all
>> (1 per debayerCpuThread object) invokeMethod calls to complete ?
>>
>> Should I add a ConditionVariable for this and have the run()
>> override function notify this ConditionVariable, or is their
>> already some mechanism for this which I'm missing ?
> 
> That would work, but I think the design could be improved.
> 
> In this series you introduce n-1 new threads, with the existing software
> ISP thread acting as the n'th thread. I think it would be nicer to have
> n identical instances of a class to handle the threads, with
> SoftwareIsp::process() handling the dispatching (with invokeMethod()).
> Completion would be signalled through signals, the same way it is done
> now with inputBufferReady and outputBufferReady. The
> SoftwareIsp::inputReady() and SoftwareIsp::outputReady() functions would
> be called once per thread, and would handle the required accounting to
> emit the SoftwareIsp completion signals only after all threads complete
> their jobs.
> 
> I haven't investigated in details to see if there would be dragons
> lurking in dark corners, but my gut feeling is that a design where no
> thread is special would be cleaner.

The signal approach is interesting, I'll look into that.

Regards,

Hans
Hans de Goede Feb. 20, 2026, 8:41 p.m. UTC | #8
Hi,

On 20-Feb-26 7:51 PM, Hans de Goede wrote:
> On 20-Feb-26 5:09 PM, Laurent Pinchart wrote:

<snip>

>> In this series you introduce n-1 new threads, with the existing software
>> ISP thread acting as the n'th thread. I think it would be nicer to have
>> n identical instances of a class to handle the threads, with
>> SoftwareIsp::process() handling the dispatching (with invokeMethod()).
>> Completion would be signalled through signals, the same way it is done
>> now with inputBufferReady and outputBufferReady. The
>> SoftwareIsp::inputReady() and SoftwareIsp::outputReady() functions would
>> be called once per thread, and would handle the required accounting to
>> emit the SoftwareIsp completion signals only after all threads complete
>> their jobs.
>>
>> I haven't investigated in details to see if there would be dragons
>> lurking in dark corners, but my gut feeling is that a design where no
>> thread is special would be cleaner.
> 
> The signal approach is interesting, I'll look into that.

Ok, so this does not work that well, for the CPU ISP we need to
create mappings for the in and out buffer atm we use local
MappedFrameBuffer objects relying on the destructor unmapping
the buffers when the desctructors run when the objects run out
of context.


An other problem is that the emitted signals when processing
is done take the input and output Framebuffers passed to
the process() function. So if we move the completion
to a signal handler then we need to temporarily store those
Framebuffer pointers as class variables.

So all in all just waiting in the process() function for all threads
to complete with the threads signalling a conditional variable
works better from design pov. So i'm going to go with that for v2.

Regards,

Hans







> 
> Regards,
> 
> Hans
>
Laurent Pinchart Feb. 25, 2026, 10 p.m. UTC | #9
On Fri, Feb 20, 2026 at 09:41:01PM +0100, Hans de Goede wrote:
> Hi,
> 
> On 20-Feb-26 7:51 PM, Hans de Goede wrote:
> > On 20-Feb-26 5:09 PM, Laurent Pinchart wrote:
> 
> <snip>
> 
> >> In this series you introduce n-1 new threads, with the existing software
> >> ISP thread acting as the n'th thread. I think it would be nicer to have
> >> n identical instances of a class to handle the threads, with
> >> SoftwareIsp::process() handling the dispatching (with invokeMethod()).
> >> Completion would be signalled through signals, the same way it is done
> >> now with inputBufferReady and outputBufferReady. The
> >> SoftwareIsp::inputReady() and SoftwareIsp::outputReady() functions would
> >> be called once per thread, and would handle the required accounting to
> >> emit the SoftwareIsp completion signals only after all threads complete
> >> their jobs.
> >>
> >> I haven't investigated in details to see if there would be dragons
> >> lurking in dark corners, but my gut feeling is that a design where no
> >> thread is special would be cleaner.
> > 
> > The signal approach is interesting, I'll look into that.
> 
> Ok, so this does not work that well, for the CPU ISP we need to
> create mappings for the in and out buffer atm we use local
> MappedFrameBuffer objects relying on the destructor unmapping
> the buffers when the desctructors run when the objects run out
> of context.
> 
> 
> An other problem is that the emitted signals when processing
> is done take the input and output Framebuffers passed to
> the process() function. So if we move the completion
> to a signal handler then we need to temporarily store those
> Framebuffer pointers as class variables.
> 
> So all in all just waiting in the process() function for all threads
> to complete with the threads signalling a conditional variable
> works better from design pov. So i'm going to go with that for v2.

I still feel a symmetrical design would be nicer, but we can get there
incrementally. Let's focus on progressing the patch series.

Patch
diff mbox series

diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
index 5e168554..c4b6c5b8 100644
--- a/src/libcamera/software_isp/debayer_cpu.cpp
+++ b/src/libcamera/software_isp/debayer_cpu.cpp
@@ -14,6 +14,7 @@ 
 #include <algorithm>
 #include <stdlib.h>
 #include <sys/ioctl.h>
+#include <thread>
 #include <time.h>
 #include <utility>
 
@@ -41,7 +42,7 @@  namespace libcamera {
  * \param[in] configuration The global configuration
  */
 DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration)
-	: Debayer(configuration), stats_(std::move(stats)), threadCount_(1)
+	: Debayer(configuration), stats_(std::move(stats))
 {
 	/*
 	 * Reading from uncached buffers may be very slow.
@@ -56,6 +57,9 @@  DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
 	 */
 	enableInputMemcpy_ =
 		configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
+	threadCount_ =
+		configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3);
+	threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads);
 }
 
 DebayerCpu::~DebayerCpu() = default;
@@ -692,7 +696,7 @@  void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
 	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers, threadData);
-		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
+		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -707,7 +711,8 @@  void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
 	if (threadData->processLastLinesSeperately) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers, threadData);
-		stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_);
+		stats_->processLine0(frame, threadData->yEnd, linePointers,
+				     threadData->statsBuffer);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -749,7 +754,7 @@  void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
 	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers, threadData);
-		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
+		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -762,7 +767,7 @@  void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
 
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers, threadData);
-		stats_->processLine2(frame, y, linePointers, &statsBuffer_);
+		stats_->processLine2(frame, y, linePointers, threadData->statsBuffer);
 		(this->*debayer2_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -869,6 +874,10 @@  void DebayerCpu::updateLookupTables(const DebayerParams &params)
 
 void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams &params)
 {
+	std::unique_ptr<std::thread> threads[threadCount_ - 1];
+	SwIspStats statsBuffer[threadCount_];
+	unsigned int i;
+
 	bench_.startFrame();
 
 	std::vector<DmaSyncer> dmaSyncers;
@@ -891,11 +900,31 @@  void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
 		return;
 	}
 
-	stats_->startFrame(frame, &statsBuffer_, 1);
+	stats_->startFrame(frame, statsBuffer, threadCount_);
 
-	threadData_[0].yStart = 0;
-	threadData_[0].yEnd = window_.height;
-	(this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
+	unsigned int yStart = 0;
+	unsigned int linesPerThread = (window_.height / threadCount_) &
+				      ~(inputConfig_.patternSize.width - 1);
+	for (i = 0; i < (threadCount_ - 1); i++) {
+		threadData_[i].yStart = yStart;
+		threadData_[i].yEnd = yStart + linesPerThread;
+		threadData_[i].statsBuffer = &statsBuffer[i];
+		threads[i] = std::make_unique<std::thread>(
+				processInner_, this, frame,
+				in.planes()[0].data(),
+				out.planes()[0].data() + yStart * outputConfig_.stride,
+				&threadData_[i]);
+		yStart += linesPerThread;
+	}
+	threadData_[i].yStart = yStart;
+	threadData_[i].yEnd = window_.height;
+	threadData_[i].statsBuffer = &statsBuffer[i];
+	(this->*processInner_)(frame, in.planes()[0].data(),
+			       out.planes()[0].data() + yStart * outputConfig_.stride,
+			       &threadData_[i]);
+
+	for (i = 0; i < (threadCount_ - 1); i++)
+		threads[i]->join();
 
 	metadata.planes()[0].bytesused = out.planes()[0].size();
 
@@ -909,7 +938,7 @@  void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
 	 *
 	 * \todo Pass real bufferId once stats buffer passing is changed.
 	 */
-	stats_->finishFrame(frame, 0, &statsBuffer_, 1);
+	stats_->finishFrame(frame, 0, statsBuffer, threadCount_);
 	outputBufferReady.emit(output);
 	inputBufferReady.emit(input);
 }
diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
index b85dd11c..63fa7710 100644
--- a/src/libcamera/software_isp/debayer_cpu.h
+++ b/src/libcamera/software_isp/debayer_cpu.h
@@ -85,6 +85,7 @@  private:
 		unsigned int lineBufferIndex;
 		/* Stored here to avoid causing register pressure in inner loop */
 		bool processLastLinesSeperately;
+		SwIspStats *statsBuffer;
 	};
 
 	using processFn = void (DebayerCpu::*)(uint32_t frame, const uint8_t *src, uint8_t *dst,
@@ -150,7 +151,6 @@  private:
 	Rectangle window_;
 
 	/* Variables used every line */
-	SwIspStats statsBuffer_;
 	debayerFn debayer0_;
 	debayerFn debayer1_;
 	debayerFn debayer2_;