[2/5] software_isp: debayer_cpu: Add per render thread data
diff mbox series

Message ID 20260216190204.106922-3-johannes.goede@oss.qualcomm.com
State Superseded
Headers show
Series
  • software_isp: debayer_cpu: Add multi-threading support
Related show

Commit Message

Hans de Goede Feb. 16, 2026, 7:02 p.m. UTC
Add a DebayerCpuThreadData data struct and use this in the inner render
loop. This contains data which needs to be separate per thread.

This is a preparation patch for making DebayerCpu support multi-threading.

Note this passed the DebayerCpuThreadData with a pointer rather then by
reference, because passing by reference is not supported for functions
passed as the thread function to std::thread().

Benchmarking on the Uno-Q with a weak CPU which is good for performance
testing, shows 146-147ms per 3272x2464 frame both before and after this
change, with things maybe being 0.5 ms slower after this change.

Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
---
 src/libcamera/software_isp/debayer_cpu.cpp | 90 ++++++++++++++--------
 src/libcamera/software_isp/debayer_cpu.h   | 30 +++++---
 2 files changed, 77 insertions(+), 43 deletions(-)

Comments

Milan Zamazal Feb. 17, 2026, 9:22 p.m. UTC | #1
Hi Hans,

thank you for the patch.

Looks basically correct to me, some comments below.

Hans de Goede <johannes.goede@oss.qualcomm.com> writes:

> Add a DebayerCpuThreadData data struct and use this in the inner render
> loop. This contains data which needs to be separate per thread.
>
> This is a preparation patch for making DebayerCpu support multi-threading.
>
> Note this passed the DebayerCpuThreadData with a pointer rather then by

s/then/than/

> reference, because passing by reference is not supported for functions
> passed as the thread function to std::thread().
>
> Benchmarking on the Uno-Q with a weak CPU which is good for performance
> testing, shows 146-147ms per 3272x2464 frame both before and after this
> change, with things maybe being 0.5 ms slower after this change.
>
> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
> ---
>  src/libcamera/software_isp/debayer_cpu.cpp | 90 ++++++++++++++--------
>  src/libcamera/software_isp/debayer_cpu.h   | 30 +++++---
>  2 files changed, 77 insertions(+), 43 deletions(-)
>
> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> index 97c1959a..e1d3c164 100644
> --- a/src/libcamera/software_isp/debayer_cpu.cpp
> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> @@ -41,7 +41,7 @@ namespace libcamera {
>   * \param[in] configuration The global configuration
>   */
>  DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration)
> -	: Debayer(configuration), stats_(std::move(stats))
> +	: Debayer(configuration), stats_(std::move(stats)), threadCount_(1)
>  {
>  	/*
>  	 * Reading from uncached buffers may be very slow.
> @@ -555,8 +555,9 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg,
>  			    2 * lineBufferPadding_;
>  
>  	if (enableInputMemcpy_) {
> -		for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++)
> -			lineBuffers_[i].resize(lineBufferLength_);
> +		for (unsigned int i = 0; i < threadCount_; i++)
> +			for (unsigned int j = 0; j <= inputConfig_.patternSize.height; j++)
> +				threadData_[i].lineBuffers[j].resize(lineBufferLength_);
>  	}
>  
>  	return 0;
> @@ -600,7 +601,8 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
>  	return std::make_tuple(stride, stride * size.height);
>  }
>  
> -void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
> +void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[],
> +				  DebayerCpuThreadData *threadData)
>  {
>  	const unsigned int patternHeight = inputConfig_.patternSize.height;
>  
> @@ -608,14 +610,14 @@ void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
>  		return;
>  
>  	for (unsigned int i = 0; i < patternHeight; i++) {
> -		memcpy(lineBuffers_[i].data(),
> +		memcpy(threadData->lineBuffers[i].data(),
>  		       linePointers[i + 1] - lineBufferPadding_,
>  		       lineBufferLength_);
> -		linePointers[i + 1] = lineBuffers_[i].data() + lineBufferPadding_;
> +		linePointers[i + 1] = threadData->lineBuffers[i].data() + lineBufferPadding_;
>  	}
>  
>  	/* Point lineBufferIndex_ to first unused lineBuffer */
> -	lineBufferIndex_ = patternHeight;
> +	threadData->lineBufferIndex = patternHeight;
>  }
>  
>  void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
> @@ -629,66 +631,78 @@ void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t
>  				      (patternHeight / 2) * (int)inputConfig_.stride;
>  }
>  
> -void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
> +void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[],
> +				DebayerCpuThreadData *threadData)
>  {
>  	const unsigned int patternHeight = inputConfig_.patternSize.height;
>  
>  	if (!enableInputMemcpy_)
>  		return;
>  
> -	memcpy(lineBuffers_[lineBufferIndex_].data(),
> +	memcpy(threadData->lineBuffers[threadData->lineBufferIndex].data(),
>  	       linePointers[patternHeight] - lineBufferPadding_,
>  	       lineBufferLength_);
> -	linePointers[patternHeight] = lineBuffers_[lineBufferIndex_].data() + lineBufferPadding_;
> +	linePointers[patternHeight] = threadData->lineBuffers[threadData->lineBufferIndex].data() + lineBufferPadding_;
>  
> -	lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1);
> +	threadData->lineBufferIndex = (threadData->lineBufferIndex + 1) % (patternHeight + 1);
>  }
>  
> -void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
> +void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
> +			  DebayerCpuThreadData *threadData)
>  {
> -	unsigned int yEnd = window_.height;
>  	/* Holds [0] previous- [1] current- [2] next-line */
>  	const uint8_t *linePointers[3];
>  
>  	/* Adjust src to top left corner of the window */
> -	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
> +	src += (window_.y + threadData->yStart) * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
>  
>  	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
> -	if (window_.y) {
> +	if (window_.y + threadData->yStart) {
>  		linePointers[1] = src - inputConfig_.stride; /* previous-line */
>  		linePointers[2] = src;
>  	} else {
> -		/* window_.y == 0, use the next line as prev line */
> +		/* Top line, use the next line as prev line */
>  		linePointers[1] = src + inputConfig_.stride;
>  		linePointers[2] = src;
> +	}
> +
> +	if (window_.y == 0 && threadData->yEnd == window_.height) {
>  		/*
>  		 * Last 2 lines also need special handling.
>  		 * (And configure() ensures that yEnd >= 2.)
>  		 */
> -		yEnd -= 2;
> +		threadData->yEnd -= 2;
> +		threadData->processLastLinesSeperately = true;

s/Seperately/Separately/

> +	} else {
> +		threadData->processLastLinesSeperately = false;
>  	}
>  
> -	setupInputMemcpy(linePointers);
> +	setupInputMemcpy(linePointers, threadData);
>  
> -	for (unsigned int y = 0; y < yEnd; y += 2) {
> +	/*
> +	 * Note y is the line-number *inside* the window, since stats_' window
> +	 * is the stats window inside/relative to the debayer window. IOW for
> +	 * single thread rendering y goes from 0 - window_.height.
> +	 */
> +	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) {
>  		shiftLinePointers(linePointers, src);
> -		memcpyNextLine(linePointers);
> +		memcpyNextLine(linePointers, threadData);
>  		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
>  		(this->*debayer0_)(dst, linePointers);
>  		src += inputConfig_.stride;
>  		dst += outputConfig_.stride;
>  
>  		shiftLinePointers(linePointers, src);
> -		memcpyNextLine(linePointers);
> +		memcpyNextLine(linePointers, threadData);
>  		(this->*debayer1_)(dst, linePointers);
>  		src += inputConfig_.stride;
>  		dst += outputConfig_.stride;
>  	}
>  
> -	if (window_.y == 0) {
> +	if (threadData->processLastLinesSeperately) {
>  		shiftLinePointers(linePointers, src);
> -		memcpyNextLine(linePointers);
> -		stats_->processLine0(frame, yEnd, linePointers, &statsBuffer_);
> +		memcpyNextLine(linePointers, threadData);
> +		stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_);
>  		(this->*debayer0_)(dst, linePointers);
>  		src += inputConfig_.stride;
>  		dst += outputConfig_.stride;
> @@ -702,7 +716,8 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
>  	}
>  }
>  
> -void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
> +void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
> +			  DebayerCpuThreadData *threadData)
>  {
>  	/*
>  	 * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line
> @@ -711,7 +726,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
>  	const uint8_t *linePointers[5];
>  
>  	/* Adjust src to top left corner of the window */
> -	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
> +	src += (window_.y + threadData->yStart) * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
>  
>  	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
>  	linePointers[1] = src - 2 * inputConfig_.stride;
> @@ -719,31 +734,36 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
>  	linePointers[3] = src;
>  	linePointers[4] = src + inputConfig_.stride;
>  
> -	setupInputMemcpy(linePointers);
> +	setupInputMemcpy(linePointers, threadData);
>  
> -	for (unsigned int y = 0; y < window_.height; y += 4) {
> +	/*
> +	 * Note y is the line-number *inside* the window, since stats_' window
> +	 * is the stats window inside/relative to the debayer window. IOW for
> +	 * single thread rendering y goes from 0 - window_.height.

s/-/to/

(The same in process4.)

> +	 */
> +	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) {
>  		shiftLinePointers(linePointers, src);
> -		memcpyNextLine(linePointers);
> +		memcpyNextLine(linePointers, threadData);
>  		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
>  		(this->*debayer0_)(dst, linePointers);
>  		src += inputConfig_.stride;
>  		dst += outputConfig_.stride;
>  
>  		shiftLinePointers(linePointers, src);
> -		memcpyNextLine(linePointers);
> +		memcpyNextLine(linePointers, threadData);
>  		(this->*debayer1_)(dst, linePointers);
>  		src += inputConfig_.stride;
>  		dst += outputConfig_.stride;
>  
>  		shiftLinePointers(linePointers, src);
> -		memcpyNextLine(linePointers);
> +		memcpyNextLine(linePointers, threadData);
>  		stats_->processLine2(frame, y, linePointers, &statsBuffer_);
>  		(this->*debayer2_)(dst, linePointers);
>  		src += inputConfig_.stride;
>  		dst += outputConfig_.stride;
>  
>  		shiftLinePointers(linePointers, src);
> -		memcpyNextLine(linePointers);
> +		memcpyNextLine(linePointers, threadData);
>  		(this->*debayer3_)(dst, linePointers);
>  		src += inputConfig_.stride;
>  		dst += outputConfig_.stride;
> @@ -868,10 +888,12 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
>  
>  	stats_->startFrame(frame, &statsBuffer_, 1);
>  
> +	threadData_[0].yStart = 0;
> +	threadData_[0].yEnd = window_.height;
>  	if (inputConfig_.patternSize.height == 2)
> -		process2(frame, in.planes()[0].data(), out.planes()[0].data());
> +		process2(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
>  	else
> -		process4(frame, in.planes()[0].data(), out.planes()[0].data());
> +		process4(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
>  
>  	metadata.planes()[0].bytesused = out.planes()[0].size();
>  
> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> index 8abf5168..800b018c 100644
> --- a/src/libcamera/software_isp/debayer_cpu.h
> +++ b/src/libcamera/software_isp/debayer_cpu.h
> @@ -74,6 +74,19 @@ private:
>  	 */
>  	using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
>  
> +	/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */

Not a matter of this patch but I wonder why 5?

> +	static constexpr unsigned int kMaxLineBuffers = 5;
> +
> +	/* Per render thread data */
> +	struct DebayerCpuThreadData {
> +		unsigned int yStart;
> +		unsigned int yEnd;
> +		std::vector<uint8_t> lineBuffers[kMaxLineBuffers];
> +		unsigned int lineBufferIndex;
> +		/* Stored here to avoid causing register pressure in inner loop */

What inner loop?

> +		bool processLastLinesSeperately;
> +	};
> +
>  	/* 8-bit raw bayer format */
>  	template<bool addAlphaByte, bool ccmEnabled>
>  	void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> @@ -105,17 +118,14 @@ private:
>  	int setDebayerFunctions(PixelFormat inputFormat,
>  				PixelFormat outputFormat,
>  				bool ccmEnabled);
> -	void setupInputMemcpy(const uint8_t *linePointers[]);
> +	void setupInputMemcpy(const uint8_t *linePointers[], DebayerCpuThreadData *threadData);
>  	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
> -	void memcpyNextLine(const uint8_t *linePointers[]);
> -	void process2(uint32_t frame, const uint8_t *src, uint8_t *dst);
> -	void process4(uint32_t frame, const uint8_t *src, uint8_t *dst);
> +	void memcpyNextLine(const uint8_t *linePointers[], DebayerCpuThreadData *threadData);
> +	void process2(uint32_t frame, const uint8_t *src, uint8_t *dst, DebayerCpuThreadData *threadData);
> +	void process4(uint32_t frame, const uint8_t *src, uint8_t *dst, DebayerCpuThreadData *threadData);
>  	void updateGammaTable(const DebayerParams &params);
>  	void updateLookupTables(const DebayerParams &params);
>  
> -	/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */
> -	static constexpr unsigned int kMaxLineBuffers = 5;
> -
>  	static constexpr unsigned int kRGBLookupSize = 256;
>  	static constexpr unsigned int kGammaLookupSize = 1024;
>  	struct CcmColumn {
> @@ -143,12 +153,14 @@ private:
>  	debayerFn debayer3_;
>  	Rectangle window_;
>  	std::unique_ptr<SwStatsCpu> stats_;
> -	std::vector<uint8_t> lineBuffers_[kMaxLineBuffers];
>  	unsigned int lineBufferLength_;
>  	unsigned int lineBufferPadding_;
> -	unsigned int lineBufferIndex_;
>  	unsigned int xShift_; /* Offset of 0/1 applied to window_.x */
>  	bool enableInputMemcpy_;
> +
> +	static constexpr unsigned int kMaxThreads = 4;
> +	struct DebayerCpuThreadData threadData_[kMaxThreads];

I think std::array is preferred.

> +	unsigned int threadCount_;
>  };
>  
>  } /* namespace libcamera */
Hans de Goede Feb. 23, 2026, 3:34 p.m. UTC | #2
Hi,

On 17-Feb-26 10:22 PM, Milan Zamazal wrote:
> Hi Hans,
> 
> thank you for the patch.
> 
> Looks basically correct to me, some comments below.
> 
> Hans de Goede <johannes.goede@oss.qualcomm.com> writes:
> 
>> Add a DebayerCpuThreadData data struct and use this in the inner render
>> loop. This contains data which needs to be separate per thread.
>>
>> This is a preparation patch for making DebayerCpu support multi-threading.
>>
>> Note this passed the DebayerCpuThreadData with a pointer rather then by
> 
> s/then/than/

No longer relevant for v2, but I did end up doing a couple other s/then/than/
fixes for v2.

...

>> +	if (window_.y == 0 && threadData->yEnd == window_.height) {
>>  		/*
>>  		 * Last 2 lines also need special handling.
>>  		 * (And configure() ensures that yEnd >= 2.)
>>  		 */
>> -		yEnd -= 2;
>> +		threadData->yEnd -= 2;
>> +		threadData->processLastLinesSeperately = true;
> 
> s/Seperately/Separately/

No longer relevant for v2 (which is significantly reworked).

...

>> -	for (unsigned int y = 0; y < window_.height; y += 4) {
>> +	/*
>> +	 * Note y is the line-number *inside* the window, since stats_' window
>> +	 * is the stats window inside/relative to the debayer window. IOW for
>> +	 * single thread rendering y goes from 0 - window_.height.
> 
> s/-/to/
> 
> (The same in process4.)

Ack both fixed for v2.

...

>> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
>> index 8abf5168..800b018c 100644
>> --- a/src/libcamera/software_isp/debayer_cpu.h
>> +++ b/src/libcamera/software_isp/debayer_cpu.h
>> @@ -74,6 +74,19 @@ private:
>>  	 */
>>  	using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
>>  
>> +	/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */
> 
> Not a matter of this patch but I wonder why 5?

this is for a mixed RGB-IR pattern which requires 2 lines above and below
the current line to get all colors. ATM the whole process4() support is
unused I've out of tree patches for this here:

https://github.com/jwrdegoede/libcamera/commits/ov01a1s/

but those are waiting for the new bayer-pattern control to allow
communicating the special IGIG_GBGR_IGIG_GRGB bayer pattern to
userspace.

...

>> +
>> +	static constexpr unsigned int kMaxThreads = 4;
>> +	struct DebayerCpuThreadData threadData_[kMaxThreads];
> 
> I think std::array is preferred.

Ack.

Regards,

Hans

Patch
diff mbox series

diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
index 97c1959a..e1d3c164 100644
--- a/src/libcamera/software_isp/debayer_cpu.cpp
+++ b/src/libcamera/software_isp/debayer_cpu.cpp
@@ -41,7 +41,7 @@  namespace libcamera {
  * \param[in] configuration The global configuration
  */
 DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration)
-	: Debayer(configuration), stats_(std::move(stats))
+	: Debayer(configuration), stats_(std::move(stats)), threadCount_(1)
 {
 	/*
 	 * Reading from uncached buffers may be very slow.
@@ -555,8 +555,9 @@  int DebayerCpu::configure(const StreamConfiguration &inputCfg,
 			    2 * lineBufferPadding_;
 
 	if (enableInputMemcpy_) {
-		for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++)
-			lineBuffers_[i].resize(lineBufferLength_);
+		for (unsigned int i = 0; i < threadCount_; i++)
+			for (unsigned int j = 0; j <= inputConfig_.patternSize.height; j++)
+				threadData_[i].lineBuffers[j].resize(lineBufferLength_);
 	}
 
 	return 0;
@@ -600,7 +601,8 @@  DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
 	return std::make_tuple(stride, stride * size.height);
 }
 
-void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
+void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[],
+				  DebayerCpuThreadData *threadData)
 {
 	const unsigned int patternHeight = inputConfig_.patternSize.height;
 
@@ -608,14 +610,14 @@  void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
 		return;
 
 	for (unsigned int i = 0; i < patternHeight; i++) {
-		memcpy(lineBuffers_[i].data(),
+		memcpy(threadData->lineBuffers[i].data(),
 		       linePointers[i + 1] - lineBufferPadding_,
 		       lineBufferLength_);
-		linePointers[i + 1] = lineBuffers_[i].data() + lineBufferPadding_;
+		linePointers[i + 1] = threadData->lineBuffers[i].data() + lineBufferPadding_;
 	}
 
 	/* Point lineBufferIndex_ to first unused lineBuffer */
-	lineBufferIndex_ = patternHeight;
+	threadData->lineBufferIndex = patternHeight;
 }
 
 void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
@@ -629,66 +631,78 @@  void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t
 				      (patternHeight / 2) * (int)inputConfig_.stride;
 }
 
-void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
+void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[],
+				DebayerCpuThreadData *threadData)
 {
 	const unsigned int patternHeight = inputConfig_.patternSize.height;
 
 	if (!enableInputMemcpy_)
 		return;
 
-	memcpy(lineBuffers_[lineBufferIndex_].data(),
+	memcpy(threadData->lineBuffers[threadData->lineBufferIndex].data(),
 	       linePointers[patternHeight] - lineBufferPadding_,
 	       lineBufferLength_);
-	linePointers[patternHeight] = lineBuffers_[lineBufferIndex_].data() + lineBufferPadding_;
+	linePointers[patternHeight] = threadData->lineBuffers[threadData->lineBufferIndex].data() + lineBufferPadding_;
 
-	lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1);
+	threadData->lineBufferIndex = (threadData->lineBufferIndex + 1) % (patternHeight + 1);
 }
 
-void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
+void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
+			  DebayerCpuThreadData *threadData)
 {
-	unsigned int yEnd = window_.height;
 	/* Holds [0] previous- [1] current- [2] next-line */
 	const uint8_t *linePointers[3];
 
 	/* Adjust src to top left corner of the window */
-	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
+	src += (window_.y + threadData->yStart) * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
 
 	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
-	if (window_.y) {
+	if (window_.y + threadData->yStart) {
 		linePointers[1] = src - inputConfig_.stride; /* previous-line */
 		linePointers[2] = src;
 	} else {
-		/* window_.y == 0, use the next line as prev line */
+		/* Top line, use the next line as prev line */
 		linePointers[1] = src + inputConfig_.stride;
 		linePointers[2] = src;
+	}
+
+	if (window_.y == 0 && threadData->yEnd == window_.height) {
 		/*
 		 * Last 2 lines also need special handling.
 		 * (And configure() ensures that yEnd >= 2.)
 		 */
-		yEnd -= 2;
+		threadData->yEnd -= 2;
+		threadData->processLastLinesSeperately = true;
+	} else {
+		threadData->processLastLinesSeperately = false;
 	}
 
-	setupInputMemcpy(linePointers);
+	setupInputMemcpy(linePointers, threadData);
 
-	for (unsigned int y = 0; y < yEnd; y += 2) {
+	/*
+	 * Note y is the line-number *inside* the window, since stats_' window
+	 * is the stats window inside/relative to the debayer window. IOW for
+	 * single thread rendering y goes from 0 - window_.height.
+	 */
+	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) {
 		shiftLinePointers(linePointers, src);
-		memcpyNextLine(linePointers);
+		memcpyNextLine(linePointers, threadData);
 		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
 
 		shiftLinePointers(linePointers, src);
-		memcpyNextLine(linePointers);
+		memcpyNextLine(linePointers, threadData);
 		(this->*debayer1_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
 	}
 
-	if (window_.y == 0) {
+	if (threadData->processLastLinesSeperately) {
 		shiftLinePointers(linePointers, src);
-		memcpyNextLine(linePointers);
-		stats_->processLine0(frame, yEnd, linePointers, &statsBuffer_);
+		memcpyNextLine(linePointers, threadData);
+		stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -702,7 +716,8 @@  void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
 	}
 }
 
-void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
+void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
+			  DebayerCpuThreadData *threadData)
 {
 	/*
 	 * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line
@@ -711,7 +726,7 @@  void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
 	const uint8_t *linePointers[5];
 
 	/* Adjust src to top left corner of the window */
-	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
+	src += (window_.y + threadData->yStart) * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
 
 	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
 	linePointers[1] = src - 2 * inputConfig_.stride;
@@ -719,31 +734,36 @@  void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
 	linePointers[3] = src;
 	linePointers[4] = src + inputConfig_.stride;
 
-	setupInputMemcpy(linePointers);
+	setupInputMemcpy(linePointers, threadData);
 
-	for (unsigned int y = 0; y < window_.height; y += 4) {
+	/*
+	 * Note y is the line-number *inside* the window, since stats_' window
+	 * is the stats window inside/relative to the debayer window. IOW for
+	 * single thread rendering y goes from 0 - window_.height.
+	 */
+	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) {
 		shiftLinePointers(linePointers, src);
-		memcpyNextLine(linePointers);
+		memcpyNextLine(linePointers, threadData);
 		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
 
 		shiftLinePointers(linePointers, src);
-		memcpyNextLine(linePointers);
+		memcpyNextLine(linePointers, threadData);
 		(this->*debayer1_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
 
 		shiftLinePointers(linePointers, src);
-		memcpyNextLine(linePointers);
+		memcpyNextLine(linePointers, threadData);
 		stats_->processLine2(frame, y, linePointers, &statsBuffer_);
 		(this->*debayer2_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
 
 		shiftLinePointers(linePointers, src);
-		memcpyNextLine(linePointers);
+		memcpyNextLine(linePointers, threadData);
 		(this->*debayer3_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -868,10 +888,12 @@  void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
 
 	stats_->startFrame(frame, &statsBuffer_, 1);
 
+	threadData_[0].yStart = 0;
+	threadData_[0].yEnd = window_.height;
 	if (inputConfig_.patternSize.height == 2)
-		process2(frame, in.planes()[0].data(), out.planes()[0].data());
+		process2(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
 	else
-		process4(frame, in.planes()[0].data(), out.planes()[0].data());
+		process4(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
 
 	metadata.planes()[0].bytesused = out.planes()[0].size();
 
diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
index 8abf5168..800b018c 100644
--- a/src/libcamera/software_isp/debayer_cpu.h
+++ b/src/libcamera/software_isp/debayer_cpu.h
@@ -74,6 +74,19 @@  private:
 	 */
 	using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
 
+	/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */
+	static constexpr unsigned int kMaxLineBuffers = 5;
+
+	/* Per render thread data */
+	struct DebayerCpuThreadData {
+		unsigned int yStart;
+		unsigned int yEnd;
+		std::vector<uint8_t> lineBuffers[kMaxLineBuffers];
+		unsigned int lineBufferIndex;
+		/* Stored here to avoid causing register pressure in inner loop */
+		bool processLastLinesSeperately;
+	};
+
 	/* 8-bit raw bayer format */
 	template<bool addAlphaByte, bool ccmEnabled>
 	void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
@@ -105,17 +118,14 @@  private:
 	int setDebayerFunctions(PixelFormat inputFormat,
 				PixelFormat outputFormat,
 				bool ccmEnabled);
-	void setupInputMemcpy(const uint8_t *linePointers[]);
+	void setupInputMemcpy(const uint8_t *linePointers[], DebayerCpuThreadData *threadData);
 	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
-	void memcpyNextLine(const uint8_t *linePointers[]);
-	void process2(uint32_t frame, const uint8_t *src, uint8_t *dst);
-	void process4(uint32_t frame, const uint8_t *src, uint8_t *dst);
+	void memcpyNextLine(const uint8_t *linePointers[], DebayerCpuThreadData *threadData);
+	void process2(uint32_t frame, const uint8_t *src, uint8_t *dst, DebayerCpuThreadData *threadData);
+	void process4(uint32_t frame, const uint8_t *src, uint8_t *dst, DebayerCpuThreadData *threadData);
 	void updateGammaTable(const DebayerParams &params);
 	void updateLookupTables(const DebayerParams &params);
 
-	/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */
-	static constexpr unsigned int kMaxLineBuffers = 5;
-
 	static constexpr unsigned int kRGBLookupSize = 256;
 	static constexpr unsigned int kGammaLookupSize = 1024;
 	struct CcmColumn {
@@ -143,12 +153,14 @@  private:
 	debayerFn debayer3_;
 	Rectangle window_;
 	std::unique_ptr<SwStatsCpu> stats_;
-	std::vector<uint8_t> lineBuffers_[kMaxLineBuffers];
 	unsigned int lineBufferLength_;
 	unsigned int lineBufferPadding_;
-	unsigned int lineBufferIndex_;
 	unsigned int xShift_; /* Offset of 0/1 applied to window_.x */
 	bool enableInputMemcpy_;
+
+	static constexpr unsigned int kMaxThreads = 4;
+	struct DebayerCpuThreadData threadData_[kMaxThreads];
+	unsigned int threadCount_;
 };
 
 } /* namespace libcamera */