[v2,1/4] software_isp: swstats_cpu: Prepare for multi-threading support
diff mbox series

Message ID 20260223160930.27913-2-johannes.goede@oss.qualcomm.com
State New
Headers show
Series
  • software_isp: debayer_cpu: Add multi-threading support
Related show

Commit Message

Hans de Goede Feb. 23, 2026, 4:09 p.m. UTC
Make the storage used to accumulate the RGB sums and the Y histogram
value a vector of SwIspStats objects instead of a single object so
that when using multi-threading every thread can use its own storage to
collect intermediate stats to avoid cache-line bouncing.

Benchmarking with the GPU-ISP which does separate swstats benchmarking,
on the Arduino Uno-Q which has a weak CPU which is good for performance
testing, shows 20ms to generate stats for a 3272x2464 frame both before
and after this change.

Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
---
Changes in v2:
- Move the allocation of the vector of SwIspStats objects to inside
  the SwStatsCpu class, controlled by a configure() arguments instead
  of making the caller allocate the objects
---
 .../internal/software_isp/swstats_cpu.h       | 25 ++++-----
 src/libcamera/software_isp/swstats_cpu.cpp    | 54 +++++++++++++------
 2 files changed, 50 insertions(+), 29 deletions(-)

Patch
diff mbox series

diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h
index 64b3e23f5..feee92f99 100644
--- a/include/libcamera/internal/software_isp/swstats_cpu.h
+++ b/include/libcamera/internal/software_isp/swstats_cpu.h
@@ -12,6 +12,7 @@ 
 #pragma once
 
 #include <stdint.h>
+#include <vector>
 
 #include <libcamera/base/signal.h>
 
@@ -51,13 +52,13 @@  public:
 
 	const Size &patternSize() { return patternSize_; }
 
-	int configure(const StreamConfiguration &inputCfg);
+	int configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount = 1);
 	void setWindow(const Rectangle &window);
 	void startFrame(uint32_t frame);
 	void finishFrame(uint32_t frame, uint32_t bufferId);
 	void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input);
 
-	void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
+	void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
 	{
 		if (frame % kStatPerNumFrames)
 			return;
@@ -66,10 +67,10 @@  public:
 		    y >= (window_.y + window_.height))
 			return;
 
-		(this->*stats0_)(src);
+		(this->*stats0_)(src, stats_[statsBufferIndex]);
 	}
 
-	void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
+	void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
 	{
 		if (frame % kStatPerNumFrames)
 			return;
@@ -78,25 +79,25 @@  public:
 		    y >= (window_.y + window_.height))
 			return;
 
-		(this->*stats2_)(src);
+		(this->*stats2_)(src, stats_[statsBufferIndex]);
 	}
 
 	Signal<uint32_t, uint32_t> statsReady;
 
 private:
-	using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]);
+	using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats &stats);
 	using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in);
 
 	int setupStandardBayerOrder(BayerFormat::Order order);
 	/* Bayer 8 bpp unpacked */
-	void statsBGGR8Line0(const uint8_t *src[]);
+	void statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats);
 	/* Bayer 10 bpp unpacked */
-	void statsBGGR10Line0(const uint8_t *src[]);
+	void statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats);
 	/* Bayer 12 bpp unpacked */
-	void statsBGGR12Line0(const uint8_t *src[]);
+	void statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats);
 	/* Bayer 10 bpp packed */
-	void statsBGGR10PLine0(const uint8_t *src[]);
-	void statsGBRG10PLine0(const uint8_t *src[]);
+	void statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats);
+	void statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats);
 
 	void processBayerFrame2(MappedFrameBuffer &in);
 
@@ -116,8 +117,8 @@  private:
 	unsigned int xShift_;
 	unsigned int stride_;
 
+	std::vector<SwIspStats> stats_;
 	SharedMemObject<SwIspStats> sharedStats_;
-	SwIspStats stats_;
 	Benchmark bench_;
 };
 
diff --git a/src/libcamera/software_isp/swstats_cpu.cpp b/src/libcamera/software_isp/swstats_cpu.cpp
index 1cedcfbc1..7c71aed96 100644
--- a/src/libcamera/software_isp/swstats_cpu.cpp
+++ b/src/libcamera/software_isp/swstats_cpu.cpp
@@ -74,11 +74,12 @@  namespace libcamera {
  */
 
 /**
- * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
+ * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
  * \brief Process line 0
  * \param[in] frame The frame number
  * \param[in] y The y coordinate.
  * \param[in] src The input data.
+ * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading.
  *
  * This function processes line 0 for input formats with
  * patternSize height == 1.
@@ -97,14 +98,18 @@  namespace libcamera {
  * to the line in plane 0, etc.
  *
  * For non Bayer single plane input data only a single src pointer is required.
+ *
+ * The statsBufferIndex value must be less than the statsBufferCount value passed
+ * to configure().
  */
 
 /**
- * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
+ * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
  * \brief Process line 2 and 3
  * \param[in] frame The frame number
  * \param[in] y The y coordinate.
  * \param[in] src The input data.
+ * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading.
  *
  * This function processes line 2 and 3 for input formats with
  * patternSize height == 4.
@@ -182,14 +187,14 @@  static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */
 	yVal = r * kRedYMul;               \
 	yVal += g * kGreenYMul;            \
 	yVal += b * kBlueYMul;             \
-	stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
+	stats.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
 
 #define SWSTATS_FINISH_LINE_STATS() \
-	stats_.sum_.r() += sumR;    \
-	stats_.sum_.g() += sumG;    \
-	stats_.sum_.b() += sumB;
+	stats.sum_.r() += sumR;    \
+	stats.sum_.g() += sumG;    \
+	stats.sum_.b() += sumB;
 
-void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint8_t *src0 = src[1] + window_.x;
 	const uint8_t *src1 = src[2] + window_.x;
@@ -214,7 +219,7 @@  void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
 	SWSTATS_FINISH_LINE_STATS()
 }
 
-void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
 	const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
@@ -240,7 +245,7 @@  void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
 	SWSTATS_FINISH_LINE_STATS()
 }
 
-void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
 	const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
@@ -266,7 +271,7 @@  void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
 	SWSTATS_FINISH_LINE_STATS()
 }
 
-void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint8_t *src0 = src[1] + window_.x * 5 / 4;
 	const uint8_t *src1 = src[2] + window_.x * 5 / 4;
@@ -292,7 +297,7 @@  void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
 	SWSTATS_FINISH_LINE_STATS()
 }
 
-void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[])
+void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint8_t *src0 = src[1] + window_.x * 5 / 4;
 	const uint8_t *src1 = src[2] + window_.x * 5 / 4;
@@ -332,8 +337,10 @@  void SwStatsCpu::startFrame(uint32_t frame)
 	if (window_.width == 0)
 		LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()";
 
-	stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 });
-	stats_.yHistogram.fill(0);
+	for (unsigned int i = 0; i < stats_.size(); i++) {
+		stats_[i].sum_ = RGB<uint64_t>({ 0, 0, 0 });
+		stats_[i].yHistogram.fill(0);
+	}
 }
 
 /**
@@ -345,8 +352,19 @@  void SwStatsCpu::startFrame(uint32_t frame)
  */
 void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId)
 {
-	stats_.valid = frame % kStatPerNumFrames == 0;
-	*sharedStats_ = stats_;
+	bool valid = frame % kStatPerNumFrames == 0;
+
+	if (valid) {
+		sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 });
+		sharedStats_->yHistogram.fill(0);
+		for (unsigned int i = 0; i < stats_.size(); i++) {
+			sharedStats_->sum_ += stats_[i].sum_;
+			for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++)
+				sharedStats_->yHistogram[j] += stats_[i].yHistogram[j];
+		}
+	}
+
+	sharedStats_->valid = valid;
 	statsReady.emit(frame, bufferId);
 }
 
@@ -389,12 +407,14 @@  int SwStatsCpu::setupStandardBayerOrder(BayerFormat::Order order)
 /**
  * \brief Configure the statistics object for the passed in input format
  * \param[in] inputCfg The input format
+ * \param[in] statsBufferCount number of internal stats buffers to use for multi-threading
  *
  * \return 0 on success, a negative errno value on failure
  */
-int SwStatsCpu::configure(const StreamConfiguration &inputCfg)
+int SwStatsCpu::configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount)
 {
 	stride_ = inputCfg.stride;
+	stats_.resize(statsBufferCount);
 
 	BayerFormat bayerFormat =
 		BayerFormat::fromPixelFormat(inputCfg.pixelFormat);
@@ -504,7 +524,7 @@  void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in)
 		/* linePointers[0] is not used by any stats0_ functions */
 		linePointers[1] = src;
 		linePointers[2] = src + stride_;
-		(this->*stats0_)(linePointers);
+		(this->*stats0_)(linePointers, stats_[0]);
 		src += stride_ * 2;
 	}
 }