[5/5] software_isp: debayer_cpu: Add multi-threading support
diff mbox series

Message ID 20260216190204.106922-6-johannes.goede@oss.qualcomm.com
State New
Headers show
Series
  • software_isp: debayer_cpu: Add multi-threading support
Related show

Commit Message

Hans de Goede Feb. 16, 2026, 7:02 p.m. UTC
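Add CPU soft ISP multi-threading support. The number of threads is read
from the "software_isp" / "threads" configuration option; it defaults
to 3 and is clamped to the range 1..kMaxThreads.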

Benchmark results on the Uno-Q, whose weak CPU makes it a good target
for performance testing. All numbers were measured with an IMX219
running at 3280x2464 -> 3272x2464:

1 thread : 147ms / frame, ~6.5 fps
2 threads:  81ms / frame, ~12 fps
3 threads:  66ms / frame, ~14.5 fps

Adding a 4th thread does not improve performance.

Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
---
 src/libcamera/software_isp/debayer_cpu.cpp | 49 +++++++++++++++++-----
 src/libcamera/software_isp/debayer_cpu.h   |  2 +-
 2 files changed, 40 insertions(+), 11 deletions(-)

Patch
diff mbox series

diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
index 5e168554..c4b6c5b8 100644
--- a/src/libcamera/software_isp/debayer_cpu.cpp
+++ b/src/libcamera/software_isp/debayer_cpu.cpp
@@ -14,6 +14,7 @@ 
 #include <algorithm>
 #include <stdlib.h>
 #include <sys/ioctl.h>
+#include <thread>
 #include <time.h>
 #include <utility>
 
@@ -41,7 +42,7 @@  namespace libcamera {
  * \param[in] configuration The global configuration
  */
 DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfiguration &configuration)
-	: Debayer(configuration), stats_(std::move(stats)), threadCount_(1)
+	: Debayer(configuration), stats_(std::move(stats))
 {
 	/*
 	 * Reading from uncached buffers may be very slow.
@@ -56,6 +57,9 @@  DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
 	 */
 	enableInputMemcpy_ =
 		configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
+	threadCount_ =
+		configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(3);
+	threadCount_ = std::clamp(threadCount_, 1u, kMaxThreads);
 }
 
 DebayerCpu::~DebayerCpu() = default;
@@ -692,7 +696,7 @@  void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
 	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 2) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers, threadData);
-		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
+		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -707,7 +711,8 @@  void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst,
 	if (threadData->processLastLinesSeperately) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers, threadData);
-		stats_->processLine0(frame, threadData->yEnd, linePointers, &statsBuffer_);
+		stats_->processLine0(frame, threadData->yEnd, linePointers,
+				     threadData->statsBuffer);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -749,7 +754,7 @@  void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
 	for (unsigned int y = threadData->yStart; y < threadData->yEnd; y += 4) {
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers, threadData);
-		stats_->processLine0(frame, y, linePointers, &statsBuffer_);
+		stats_->processLine0(frame, y, linePointers, threadData->statsBuffer);
 		(this->*debayer0_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -762,7 +767,7 @@  void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst,
 
 		shiftLinePointers(linePointers, src);
 		memcpyNextLine(linePointers, threadData);
-		stats_->processLine2(frame, y, linePointers, &statsBuffer_);
+		stats_->processLine2(frame, y, linePointers, threadData->statsBuffer);
 		(this->*debayer2_)(dst, linePointers);
 		src += inputConfig_.stride;
 		dst += outputConfig_.stride;
@@ -869,6 +874,10 @@  void DebayerCpu::updateLookupTables(const DebayerParams &params)
 
 void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams &params)
 {
+	std::unique_ptr<std::thread> threads[threadCount_ - 1];
+	SwIspStats statsBuffer[threadCount_];
+	unsigned int i;
+
 	bench_.startFrame();
 
 	std::vector<DmaSyncer> dmaSyncers;
@@ -891,11 +900,31 @@  void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
 		return;
 	}
 
-	stats_->startFrame(frame, &statsBuffer_, 1);
+	stats_->startFrame(frame, statsBuffer, threadCount_);
 
-	threadData_[0].yStart = 0;
-	threadData_[0].yEnd = window_.height;
-	(this->*processInner_)(frame, in.planes()[0].data(), out.planes()[0].data(), &threadData_[0]);
+	unsigned int yStart = 0;
+	unsigned int linesPerThread = (window_.height / threadCount_) &
+				      ~(inputConfig_.patternSize.width - 1);
+	for (i = 0; i < (threadCount_ - 1); i++) {
+		threadData_[i].yStart = yStart;
+		threadData_[i].yEnd = yStart + linesPerThread;
+		threadData_[i].statsBuffer = &statsBuffer[i];
+		threads[i] = std::make_unique<std::thread>(
+				processInner_, this, frame,
+				in.planes()[0].data(),
+				out.planes()[0].data() + yStart * outputConfig_.stride,
+				&threadData_[i]);
+		yStart += linesPerThread;
+	}
+	threadData_[i].yStart = yStart;
+	threadData_[i].yEnd = window_.height;
+	threadData_[i].statsBuffer = &statsBuffer[i];
+	(this->*processInner_)(frame, in.planes()[0].data(),
+			       out.planes()[0].data() + yStart * outputConfig_.stride,
+			       &threadData_[i]);
+
+	for (i = 0; i < (threadCount_ - 1); i++)
+		threads[i]->join();
 
 	metadata.planes()[0].bytesused = out.planes()[0].size();
 
@@ -909,7 +938,7 @@  void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
 	 *
 	 * \todo Pass real bufferId once stats buffer passing is changed.
 	 */
-	stats_->finishFrame(frame, 0, &statsBuffer_, 1);
+	stats_->finishFrame(frame, 0, statsBuffer, threadCount_);
 	outputBufferReady.emit(output);
 	inputBufferReady.emit(input);
 }
diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
index b85dd11c..63fa7710 100644
--- a/src/libcamera/software_isp/debayer_cpu.h
+++ b/src/libcamera/software_isp/debayer_cpu.h
@@ -85,6 +85,7 @@  private:
 		unsigned int lineBufferIndex;
 		/* Stored here to avoid causing register pressure in inner loop */
 		bool processLastLinesSeperately;
+		SwIspStats *statsBuffer;
 	};
 
 	using processFn = void (DebayerCpu::*)(uint32_t frame, const uint8_t *src, uint8_t *dst,
@@ -150,7 +151,6 @@  private:
 	Rectangle window_;
 
 	/* Variables used every line */
-	SwIspStats statsBuffer_;
 	debayerFn debayer0_;
 	debayerFn debayer1_;
 	debayerFn debayer2_;