[v2,2/4] software_isp: debayer_cpu: Add DebayerCpuThread class
diff mbox series

Message ID 20260223160930.27913-3-johannes.goede@oss.qualcomm.com
State New
Headers show
Series
  • software_isp: debayer_cpu: Add multi-threading support
Related show

Commit Message

Hans de Goede Feb. 23, 2026, 4:09 p.m. UTC
Add a DebayerCpuThread class and use this in the inner render loop.
This contains data which needs to be separate per thread.

This is a preparation patch for making DebayerCpu support multi-threading.

Benchmarking on the Arduino Uno-Q, whose weak CPU makes it a good target for
performance testing, shows 146-147 ms per 3272x2464 frame both before and
after this change, with things maybe being 0.5 ms slower after this change.

Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
---
Changes in v2:
- Replace the DebayerCpuThreadData struct from v1 with a DebayerCpuThread
  class, derived from Object to allow calling invokeMethod for thread re-use
  in followup patches
- As part of this also move a bunch of methods which primarily deal with
  per thread data: setupInputMemcpy(), shiftLinePointers(), memcpyNextLine(),
  process*() to the new DebayerCpuThread class
---
 src/libcamera/software_isp/debayer_cpu.cpp | 215 ++++++++++++++-------
 src/libcamera/software_isp/debayer_cpu.h   |  20 +-
 2 files changed, 159 insertions(+), 76 deletions(-)

Comments

Barnabás Pőcze Feb. 23, 2026, 4:33 p.m. UTC | #1
Hi

2026. 02. 23. 17:09 keltezéssel, Hans de Goede írta:
> Add a DebayerCpuThread class and use this in the inner render loop.
> This contains data which needs to be separate per thread.
> 
> This is a preparation patch for making DebayerCpu support multi-threading.
> 
> Benchmarking on the Arduino Uno-Q with a weak CPU which is good for
> performance testing, shows 146-147ms per 3272x2464 frame both before and
> after this change, with things maybe being 0.5 ms slower after this change.
> 
> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
> ---
> Changes in v2:
> - Replace the DebayerCpuThreadData struct from v1 with a DebayerCpuThread
>    class, derived from Object to allow calling invokeMethod for thread re-use
>    in followup patches
> - As part of this also move a bunch of methods which primarily deal with
>    per thread data: setupInputMemcpy(), shiftLinePointers(), memcpyNextLine(),
>    process*() to the new DebayerCpuThread class
> ---
>   src/libcamera/software_isp/debayer_cpu.cpp | 215 ++++++++++++++-------
>   src/libcamera/software_isp/debayer_cpu.h   |  20 +-
>   2 files changed, 159 insertions(+), 76 deletions(-)
> 
> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> index e7b012105..122bfbb05 100644
> --- a/src/libcamera/software_isp/debayer_cpu.cpp
> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> @@ -27,6 +27,38 @@
> [...]
> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> index 7a6517462..7196dcdd0 100644
> --- a/src/libcamera/software_isp/debayer_cpu.h
> +++ b/src/libcamera/software_isp/debayer_cpu.h
> @@ -26,6 +26,7 @@
>   
>   namespace libcamera {
>   
> +class DebayerCpuThread;
>   class DebayerCpu : public Debayer
>   {
>   public:
> @@ -44,6 +45,8 @@ public:
>   	const SharedFD &getStatsFD() { return stats_->getStatsFD(); }
>   
>   private:
> +	friend class DebayerCpuThread;
> +
>   	/**
>   	 * \brief Called to debayer 1 line of Bayer input data to output format
>   	 * \param[out] dst Pointer to the start of the output line to write
> @@ -74,6 +77,11 @@ private:
>   	 */
>   	using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
>   
> +	void debayer0(uint8_t *dst, const uint8_t *src[]) { (this->*debayer0_)(dst, src); }
> +	void debayer1(uint8_t *dst, const uint8_t *src[]) { (this->*debayer1_)(dst, src); }
> +	void debayer2(uint8_t *dst, const uint8_t *src[]) { (this->*debayer2_)(dst, src); }
> +	void debayer3(uint8_t *dst, const uint8_t *src[]) { (this->*debayer3_)(dst, src); }
> +
>   	/* 8-bit raw bayer format */
>   	template<bool addAlphaByte, bool ccmEnabled>
>   	void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> @@ -105,11 +113,6 @@ private:
>   	int setDebayerFunctions(PixelFormat inputFormat,
>   				PixelFormat outputFormat,
>   				bool ccmEnabled);
> -	void setupInputMemcpy(const uint8_t *linePointers[]);
> -	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
> -	void memcpyNextLine(const uint8_t *linePointers[]);
> -	void process2(uint32_t frame, const uint8_t *src, uint8_t *dst);
> -	void process4(uint32_t frame, const uint8_t *src, uint8_t *dst);
>   	void updateGammaTable(const DebayerParams &params);
>   	void updateLookupTables(const DebayerParams &params);
>   
> @@ -142,12 +145,9 @@ private:
>   	debayerFn debayer3_;
>   	Rectangle window_;
>   	std::unique_ptr<SwStatsCpu> stats_;
> -	std::vector<uint8_t> lineBuffers_[kMaxLineBuffers];
> -	unsigned int lineBufferLength_;
> -	unsigned int lineBufferPadding_;
> -	unsigned int lineBufferIndex_;
>   	unsigned int xShift_; /* Offset of 0/1 applied to window_.x */
> -	bool enableInputMemcpy_;
> +
> +	std::vector<DebayerCpuThread *>threads_;

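This should be a vector of `std::unique_ptr<>` or similar: the thread objects
are allocated with `new` in the DebayerCpu constructor, but the destructor is
defaulted, so as written they are never deleted.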


>   };
>   
>   } /* namespace libcamera */

Patch
diff mbox series

diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
index e7b012105..122bfbb05 100644
--- a/src/libcamera/software_isp/debayer_cpu.cpp
+++ b/src/libcamera/software_isp/debayer_cpu.cpp
@@ -27,6 +27,38 @@ 
 
 namespace libcamera {
 
+class DebayerCpuThread : public Object
+{
+public:
+	DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex,
+			 bool enableInputMemcpy);
+
+	void configure(unsigned int yStart, unsigned int yEnd);
+	void setupInputMemcpy(const uint8_t *linePointers[]);
+	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
+	void memcpyNextLine(const uint8_t *linePointers[]);
+	void process(uint32_t frame, const uint8_t *src, uint8_t *dst);
+	void process2(uint32_t frame, const uint8_t *src, uint8_t *dst);
+	void process4(uint32_t frame, const uint8_t *src, uint8_t *dst);
+
+	DebayerCpu *debayer_;
+	unsigned int threadIndex_;
+	unsigned int yStart_;
+	unsigned int yEnd_;
+	unsigned int lineBufferLength_;
+	unsigned int lineBufferPadding_;
+	unsigned int lineBufferIndex_;
+	std::vector<uint8_t> lineBuffers_[DebayerCpu::kMaxLineBuffers];
+	bool enableInputMemcpy_;
+};
+
+DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex,
+				   bool enableInputMemcpy)
+	: debayer_(debayer), threadIndex_(threadIndex),
+	  enableInputMemcpy_(enableInputMemcpy)
+{
+}
+
 /**
  * \class DebayerCpu
  * \brief Class for debayering on the CPU
@@ -53,8 +85,14 @@  DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
 	 * \todo Make memcpy automatic based on runtime detection of platform
 	 * capabilities.
 	 */
-	enableInputMemcpy_ =
+	bool enableInputMemcpy =
 		configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
+
+	/* Just one thread object for now, which will be called inline rather than async */
+	threads_.resize(1);
+
+	for (unsigned int i = 0; i < threads_.size(); i++)
+		threads_[i] = new DebayerCpuThread(this, i, enableInputMemcpy);
 }
 
 DebayerCpu::~DebayerCpu() = default;
@@ -484,7 +522,7 @@  int DebayerCpu::configure(const StreamConfiguration &inputCfg,
 	if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0)
 		return -EINVAL;
 
-	if (stats_->configure(inputCfg) != 0)
+	if (stats_->configure(inputCfg, threads_.size()) != 0)
 		return -EINVAL;
 
 	const Size &statsPatternSize = stats_->patternSize();
@@ -548,17 +586,36 @@  int DebayerCpu::configure(const StreamConfiguration &inputCfg,
 	 */
 	stats_->setWindow(Rectangle(window_.size()));
 
+	unsigned int yStart = 0;
+	unsigned int linesPerThread = (window_.height / threads_.size()) &
+				      ~(inputConfig_.patternSize.height - 1);
+	unsigned int i;
+
+	for (i = 0; i < (threads_.size() - 1); i++) {
+		threads_[i]->configure(yStart, yStart + linesPerThread);
+		yStart += linesPerThread;
+	}
+	threads_[i]->configure(yStart, window_.height);
+
+	return 0;
+}
+
+void DebayerCpuThread::configure(unsigned int yStart, unsigned int yEnd)
+{
+	Debayer::DebayerInputConfig &inputConfig = debayer_->inputConfig_;
+
+	yStart_ = yStart;
+	yEnd_ = yEnd;
+
 	/* pad with patternSize.Width on both left and right side */
-	lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8;
-	lineBufferLength_ = window_.width * inputConfig_.bpp / 8 +
+	lineBufferPadding_ = inputConfig.patternSize.width * inputConfig.bpp / 8;
+	lineBufferLength_ = debayer_->window_.width * inputConfig.bpp / 8 +
 			    2 * lineBufferPadding_;
 
 	if (enableInputMemcpy_) {
-		for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++)
+		for (unsigned int i = 0; i <= inputConfig.patternSize.height; i++)
 			lineBuffers_[i].resize(lineBufferLength_);
 	}
-
-	return 0;
 }
 
 /*
@@ -599,9 +656,9 @@  DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
 	return std::make_tuple(stride, stride * size.height);
 }
 
-void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
+void DebayerCpuThread::setupInputMemcpy(const uint8_t *linePointers[])
 {
-	const unsigned int patternHeight = inputConfig_.patternSize.height;
+	const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
 
 	if (!enableInputMemcpy_)
 		return;
@@ -617,20 +674,20 @@  void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
 	lineBufferIndex_ = patternHeight;
 }
 
-void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
+void DebayerCpuThread::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
 {
-	const unsigned int patternHeight = inputConfig_.patternSize.height;
+	const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
 
 	for (unsigned int i = 0; i < patternHeight; i++)
 		linePointers[i] = linePointers[i + 1];
 
-	linePointers[patternHeight] = src +
-				      (patternHeight / 2) * (int)inputConfig_.stride;
+	linePointers[patternHeight] =
+		src + (patternHeight / 2) * (int)debayer_->inputConfig_.stride;
 }
 
-void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
+void DebayerCpuThread::memcpyNextLine(const uint8_t *linePointers[])
 {
-	const unsigned int patternHeight = inputConfig_.patternSize.height;
+	const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
 
 	if (!enableInputMemcpy_)
 		return;
@@ -643,23 +700,42 @@  void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
 	lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1);
 }
 
-void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
+void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst)
 {
-	unsigned int yEnd = window_.height;
+	Rectangle &window = debayer_->window_;
+
+	/* Adjust src to top left corner of the window */
+	src += (window.y + yStart_) * debayer_->inputConfig_.stride +
+	       window.x * debayer_->inputConfig_.bpp / 8;
+	/* Adjust dst for yStart_ */
+	dst += yStart_ * debayer_->outputConfig_.stride;
+
+	if (debayer_->inputConfig_.patternSize.height == 2)
+		process2(frame, src, dst);
+	else
+		process4(frame, src, dst);
+}
+
+void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
+{
+	unsigned int outputStride = debayer_->outputConfig_.stride;
+	unsigned int inputStride = debayer_->inputConfig_.stride;
+	Rectangle &window = debayer_->window_;
+	unsigned int yEnd = yEnd_;
 	/* Holds [0] previous- [1] current- [2] next-line */
 	const uint8_t *linePointers[3];
 
-	/* Adjust src to top left corner of the window */
-	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
-
 	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
-	if (window_.y) {
-		linePointers[1] = src - inputConfig_.stride; /* previous-line */
+	if (window.y + yStart_) {
+		linePointers[1] = src - inputStride; /* previous-line */
 		linePointers[2] = src;
 	} else {
-		/* window_.y == 0, use the next line as prev line */
-		linePointers[1] = src + inputConfig_.stride;
+		/* Top line, use the next line as prev line */
+		linePointers[1] = src + inputStride;
 		linePointers[2] = src;
+	}
+
+	if (window.y == 0 && yEnd_ == window.height) {
 		/*
 		 * Last 2 lines also need special handling.
 		 * (And configure() ensures that yEnd >= 2.)
@@ -669,83 +745,93 @@  void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
 
 	setupInputMemcpy(linePointers);
 
-	for (unsigned int y = 0; y < yEnd; y += 2) {
+	/*
+	 * Note y is the line-number *inside* the window, since stats_' window
+	 * is the stats window inside/relative to the debayer window. IOW for
+	 * single thread rendering y goes from 0 to window.height.
+	 */
+	for (unsigned int y = yStart_; y < yEnd; y += 2) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers);
-		stats_->processLine0(frame, y, linePointers);
-		(this->*debayer0_)(dst, linePointers);
-		src += inputConfig_.stride;
-		dst += outputConfig_.stride;
+		debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_);
+		debayer_->debayer0(dst, linePointers);
+		src += inputStride;
+		dst += outputStride;
 
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers);
-		(this->*debayer1_)(dst, linePointers);
-		src += inputConfig_.stride;
-		dst += outputConfig_.stride;
+		debayer_->debayer1(dst, linePointers);
+		src += inputStride;
+		dst += outputStride;
 	}
 
-	if (window_.y == 0) {
+	if (window.y == 0 && yEnd_ == window.height) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers);
-		stats_->processLine0(frame, yEnd, linePointers);
-		(this->*debayer0_)(dst, linePointers);
-		src += inputConfig_.stride;
-		dst += outputConfig_.stride;
+		debayer_->stats_->processLine0(frame, yEnd, linePointers, threadIndex_);
+		debayer_->debayer0(dst, linePointers);
+		src += inputStride;
+		dst += outputStride;
 
 		shiftLinePointers(linePointers, src);
 		/* next line may point outside of src, use prev. */
 		linePointers[2] = linePointers[0];
-		(this->*debayer1_)(dst, linePointers);
-		src += inputConfig_.stride;
-		dst += outputConfig_.stride;
+		debayer_->debayer1(dst, linePointers);
+		src += inputStride;
+		dst += outputStride;
 	}
 }
 
-void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
+void DebayerCpuThread::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
 {
+	unsigned int outputStride = debayer_->outputConfig_.stride;
+	unsigned int inputStride = debayer_->inputConfig_.stride;
+
 	/*
 	 * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line
 	 * [3] 1-line-down [4] 2-lines-down.
 	 */
 	const uint8_t *linePointers[5];
 
-	/* Adjust src to top left corner of the window */
-	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
-
 	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
-	linePointers[1] = src - 2 * inputConfig_.stride;
-	linePointers[2] = src - inputConfig_.stride;
+	linePointers[1] = src - 2 * inputStride;
+	linePointers[2] = src - inputStride;
 	linePointers[3] = src;
-	linePointers[4] = src + inputConfig_.stride;
+	linePointers[4] = src + inputStride;
 
 	setupInputMemcpy(linePointers);
 
-	for (unsigned int y = 0; y < window_.height; y += 4) {
+	/*
+	 * Note y is the line-number *inside* the window, since stats_' window
+	 * is the stats window inside/relative to the debayer window. IOW for
+	 * single thread rendering y goes from 0 to window.height.
+	 */
+	for (unsigned int y = yStart_; y < yEnd_; y += 4) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers);
-		stats_->processLine0(frame, y, linePointers);
-		(this->*debayer0_)(dst, linePointers);
-		src += inputConfig_.stride;
-		dst += outputConfig_.stride;
+		debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_);
+		debayer_->debayer0(dst, linePointers);
+		src += inputStride;
+		dst += outputStride;
 
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers);
-		(this->*debayer1_)(dst, linePointers);
-		src += inputConfig_.stride;
-		dst += outputConfig_.stride;
+		debayer_->debayer1(dst, linePointers);
+		src += inputStride;
+		dst += outputStride;
 
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers);
-		stats_->processLine2(frame, y, linePointers);
-		(this->*debayer2_)(dst, linePointers);
-		src += inputConfig_.stride;
-		dst += outputConfig_.stride;
+		debayer_->stats_->processLine2(frame, y, linePointers, threadIndex_);
+		debayer_->debayer2(dst, linePointers);
+		src += inputStride;
+		dst += outputStride;
 
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers);
-		(this->*debayer3_)(dst, linePointers);
-		src += inputConfig_.stride;
-		dst += outputConfig_.stride;
+		debayer_->debayer3(dst, linePointers);
+		src += inputStride;
+		dst += outputStride;
 	}
 }
 
@@ -867,10 +953,7 @@  void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
 
 	stats_->startFrame(frame);
 
-	if (inputConfig_.patternSize.height == 2)
-		process2(frame, in.planes()[0].data(), out.planes()[0].data());
-	else
-		process4(frame, in.planes()[0].data(), out.planes()[0].data());
+	threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data());
 
 	metadata.planes()[0].bytesused = out.planes()[0].size();
 
diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
index 7a6517462..7196dcdd0 100644
--- a/src/libcamera/software_isp/debayer_cpu.h
+++ b/src/libcamera/software_isp/debayer_cpu.h
@@ -26,6 +26,7 @@ 
 
 namespace libcamera {
 
+class DebayerCpuThread;
 class DebayerCpu : public Debayer
 {
 public:
@@ -44,6 +45,8 @@  public:
 	const SharedFD &getStatsFD() { return stats_->getStatsFD(); }
 
 private:
+	friend class DebayerCpuThread;
+
 	/**
 	 * \brief Called to debayer 1 line of Bayer input data to output format
 	 * \param[out] dst Pointer to the start of the output line to write
@@ -74,6 +77,11 @@  private:
 	 */
 	using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
 
+	void debayer0(uint8_t *dst, const uint8_t *src[]) { (this->*debayer0_)(dst, src); }
+	void debayer1(uint8_t *dst, const uint8_t *src[]) { (this->*debayer1_)(dst, src); }
+	void debayer2(uint8_t *dst, const uint8_t *src[]) { (this->*debayer2_)(dst, src); }
+	void debayer3(uint8_t *dst, const uint8_t *src[]) { (this->*debayer3_)(dst, src); }
+
 	/* 8-bit raw bayer format */
 	template<bool addAlphaByte, bool ccmEnabled>
 	void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
@@ -105,11 +113,6 @@  private:
 	int setDebayerFunctions(PixelFormat inputFormat,
 				PixelFormat outputFormat,
 				bool ccmEnabled);
-	void setupInputMemcpy(const uint8_t *linePointers[]);
-	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
-	void memcpyNextLine(const uint8_t *linePointers[]);
-	void process2(uint32_t frame, const uint8_t *src, uint8_t *dst);
-	void process4(uint32_t frame, const uint8_t *src, uint8_t *dst);
 	void updateGammaTable(const DebayerParams &params);
 	void updateLookupTables(const DebayerParams &params);
 
@@ -142,12 +145,9 @@  private:
 	debayerFn debayer3_;
 	Rectangle window_;
 	std::unique_ptr<SwStatsCpu> stats_;
-	std::vector<uint8_t> lineBuffers_[kMaxLineBuffers];
-	unsigned int lineBufferLength_;
-	unsigned int lineBufferPadding_;
-	unsigned int lineBufferIndex_;
 	unsigned int xShift_; /* Offset of 0/1 applied to window_.x */
-	bool enableInputMemcpy_;
+
+	std::vector<DebayerCpuThread *>threads_;
 };
 
 } /* namespace libcamera */