[v2,1/4] software_isp: swstats_cpu: Prepare for multi-threading support
diff mbox series

Message ID 20260223160930.27913-2-johannes.goede@oss.qualcomm.com
State Superseded
Headers show
Series
  • software_isp: debayer_cpu: Add multi-threading support
Related show

Commit Message

Hans de Goede Feb. 23, 2026, 4:09 p.m. UTC
Make the storage used to accumulate the RGB sums and the Y histogram
value a vector of SwIspStats objects instead of a single object so
that when using multi-threading every thread can use its own storage to
collect intermediate stats to avoid cache-line bouncing.

Benchmarking with the GPU-ISP which does separate swstats benchmarking,
on the Arduino Uno-Q which has a weak CPU which is good for performance
testing, shows 20ms to generate stats for a 3272x2464 frame both before
and after this change.

Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
---
Changes in v2:
- Move the allocation of the vector of SwIspStats objects to inside
  the SwStatsCpu class, controlled by a configure() arguments instead
  of making the caller allocate the objects
---
 .../internal/software_isp/swstats_cpu.h       | 25 ++++-----
 src/libcamera/software_isp/swstats_cpu.cpp    | 54 +++++++++++++------
 2 files changed, 50 insertions(+), 29 deletions(-)

Comments

Milan Zamazal Feb. 24, 2026, 10:23 a.m. UTC | #1
Hi Hans,

thank you for the update.

Hans de Goede <johannes.goede@oss.qualcomm.com> writes:

> Make the storage used to accumulate the RGB sums and the Y histogram
> value a vector of SwIspStats objects instead of a single object so
> that when using multi-threading every thread can use its own storage to
> collect intermediate stats to avoid cache-line bouncing.
>
> Benchmarking with the GPU-ISP which does separate swstats benchmarking,
> on the Arduino Uno-Q which has a weak CPU which is good for performance
> testing, shows 20ms to generate stats for a 3272x2464 frame both before
> and after this change.
>
> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
> ---
> Changes in v2:
> - Move the allocation of the vector of SwIspStats objects to inside
>   the SwStatsCpu class, controlled by a configure() arguments instead
>   of making the caller allocate the objects
> ---
>  .../internal/software_isp/swstats_cpu.h       | 25 ++++-----
>  src/libcamera/software_isp/swstats_cpu.cpp    | 54 +++++++++++++------
>  2 files changed, 50 insertions(+), 29 deletions(-)
>
> diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h
> index 64b3e23f5..feee92f99 100644
> --- a/include/libcamera/internal/software_isp/swstats_cpu.h
> +++ b/include/libcamera/internal/software_isp/swstats_cpu.h
> @@ -12,6 +12,7 @@
>  #pragma once
>  
>  #include <stdint.h>
> +#include <vector>
>  
>  #include <libcamera/base/signal.h>
>  
> @@ -51,13 +52,13 @@ public:
>  
>  	const Size &patternSize() { return patternSize_; }
>  
> -	int configure(const StreamConfiguration &inputCfg);
> +	int configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount = 1);
>  	void setWindow(const Rectangle &window);
>  	void startFrame(uint32_t frame);
>  	void finishFrame(uint32_t frame, uint32_t bufferId);
>  	void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input);
>  
> -	void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
> +	void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
>  	{
>  		if (frame % kStatPerNumFrames)
>  			return;
> @@ -66,10 +67,10 @@ public:
>  		    y >= (window_.y + window_.height))
>  			return;
>  
> -		(this->*stats0_)(src);
> +		(this->*stats0_)(src, stats_[statsBufferIndex]);
>  	}
>  
> -	void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
> +	void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
>  	{
>  		if (frame % kStatPerNumFrames)
>  			return;
> @@ -78,25 +79,25 @@ public:
>  		    y >= (window_.y + window_.height))
>  			return;
>  
> -		(this->*stats2_)(src);
> +		(this->*stats2_)(src, stats_[statsBufferIndex]);
>  	}
>  
>  	Signal<uint32_t, uint32_t> statsReady;
>  
>  private:
> -	using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]);
> +	using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats &stats);
>  	using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in);
>  
>  	int setupStandardBayerOrder(BayerFormat::Order order);
>  	/* Bayer 8 bpp unpacked */
> -	void statsBGGR8Line0(const uint8_t *src[]);
> +	void statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats);
>  	/* Bayer 10 bpp unpacked */
> -	void statsBGGR10Line0(const uint8_t *src[]);
> +	void statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats);
>  	/* Bayer 12 bpp unpacked */
> -	void statsBGGR12Line0(const uint8_t *src[]);
> +	void statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats);
>  	/* Bayer 10 bpp packed */
> -	void statsBGGR10PLine0(const uint8_t *src[]);
> -	void statsGBRG10PLine0(const uint8_t *src[]);
> +	void statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats);
> +	void statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats);
>  
>  	void processBayerFrame2(MappedFrameBuffer &in);
>  
> @@ -116,8 +117,8 @@ private:
>  	unsigned int xShift_;
>  	unsigned int stride_;
>  
> +	std::vector<SwIspStats> stats_;
>  	SharedMemObject<SwIspStats> sharedStats_;
> -	SwIspStats stats_;
>  	Benchmark bench_;
>  };
>  
> diff --git a/src/libcamera/software_isp/swstats_cpu.cpp b/src/libcamera/software_isp/swstats_cpu.cpp
> index 1cedcfbc1..7c71aed96 100644
> --- a/src/libcamera/software_isp/swstats_cpu.cpp
> +++ b/src/libcamera/software_isp/swstats_cpu.cpp
> @@ -74,11 +74,12 @@ namespace libcamera {
>   */
>  
>  /**
> - * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
> + * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
>   * \brief Process line 0
>   * \param[in] frame The frame number
>   * \param[in] y The y coordinate.
>   * \param[in] src The input data.
> + * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading.
>   *
>   * This function processes line 0 for input formats with
>   * patternSize height == 1.
> @@ -97,14 +98,18 @@ namespace libcamera {
>   * to the line in plane 0, etc.
>   *
>   * For non Bayer single plane input data only a single src pointer is required.
> + *
> + * The statsBufferIndex value must be less than the statsBufferCount value passed
> + * to configure().
>   */
>  
>  /**
> - * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
> + * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
>   * \brief Process line 2 and 3
>   * \param[in] frame The frame number
>   * \param[in] y The y coordinate.
>   * \param[in] src The input data.
> + * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading.
>   *
>   * This function processes line 2 and 3 for input formats with
>   * patternSize height == 4.
> @@ -182,14 +187,14 @@ static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */
>  	yVal = r * kRedYMul;               \
>  	yVal += g * kGreenYMul;            \
>  	yVal += b * kBlueYMul;             \
> -	stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
> +	stats.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
>  
>  #define SWSTATS_FINISH_LINE_STATS() \
> -	stats_.sum_.r() += sumR;    \
> -	stats_.sum_.g() += sumG;    \
> -	stats_.sum_.b() += sumB;
> +	stats.sum_.r() += sumR;    \
> +	stats.sum_.g() += sumG;    \
> +	stats.sum_.b() += sumB;
>  
> -void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
> +void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats)
>  {
>  	const uint8_t *src0 = src[1] + window_.x;
>  	const uint8_t *src1 = src[2] + window_.x;
> @@ -214,7 +219,7 @@ void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
>  	SWSTATS_FINISH_LINE_STATS()
>  }
>  
> -void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
> +void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats)
>  {
>  	const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
>  	const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
> @@ -240,7 +245,7 @@ void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
>  	SWSTATS_FINISH_LINE_STATS()
>  }
>  
> -void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
> +void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats)
>  {
>  	const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
>  	const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
> @@ -266,7 +271,7 @@ void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
>  	SWSTATS_FINISH_LINE_STATS()
>  }
>  
> -void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
> +void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats)
>  {
>  	const uint8_t *src0 = src[1] + window_.x * 5 / 4;
>  	const uint8_t *src1 = src[2] + window_.x * 5 / 4;
> @@ -292,7 +297,7 @@ void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
>  	SWSTATS_FINISH_LINE_STATS()
>  }
>  
> -void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[])
> +void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats)
>  {
>  	const uint8_t *src0 = src[1] + window_.x * 5 / 4;
>  	const uint8_t *src1 = src[2] + window_.x * 5 / 4;
> @@ -332,8 +337,10 @@ void SwStatsCpu::startFrame(uint32_t frame)
>  	if (window_.width == 0)
>  		LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()";
>  
> -	stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 });
> -	stats_.yHistogram.fill(0);
> +	for (unsigned int i = 0; i < stats_.size(); i++) {

How about:

  for (auto &s : stats_)

> +		stats_[i].sum_ = RGB<uint64_t>({ 0, 0, 0 });
> +		stats_[i].yHistogram.fill(0);
> +	}
>  }
>  
>  /**
> @@ -345,8 +352,19 @@ void SwStatsCpu::startFrame(uint32_t frame)
>   */
>  void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId)
>  {
> -	stats_.valid = frame % kStatPerNumFrames == 0;
> -	*sharedStats_ = stats_;
> +	bool valid = frame % kStatPerNumFrames == 0;
> +
> +	if (valid) {
> +		sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 });
> +		sharedStats_->yHistogram.fill(0);
> +		for (unsigned int i = 0; i < stats_.size(); i++) {

The same here.

Other than these, the change looks good to me.

> +			sharedStats_->sum_ += stats_[i].sum_;
> +			for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++)
> +				sharedStats_->yHistogram[j] += stats_[i].yHistogram[j];
> +		}
> +	}
> +
> +	sharedStats_->valid = valid;
>  	statsReady.emit(frame, bufferId);
>  }
>  
> @@ -389,12 +407,14 @@ int SwStatsCpu::setupStandardBayerOrder(BayerFormat::Order order)
>  /**
>   * \brief Configure the statistics object for the passed in input format
>   * \param[in] inputCfg The input format
> + * \param[in] statsBufferCount number of internal stats buffers to use for multi-threading
>   *
>   * \return 0 on success, a negative errno value on failure
>   */
> -int SwStatsCpu::configure(const StreamConfiguration &inputCfg)
> +int SwStatsCpu::configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount)
>  {
>  	stride_ = inputCfg.stride;
> +	stats_.resize(statsBufferCount);
>  
>  	BayerFormat bayerFormat =
>  		BayerFormat::fromPixelFormat(inputCfg.pixelFormat);
> @@ -504,7 +524,7 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in)
>  		/* linePointers[0] is not used by any stats0_ functions */
>  		linePointers[1] = src;
>  		linePointers[2] = src + stride_;
> -		(this->*stats0_)(linePointers);
> +		(this->*stats0_)(linePointers, stats_[0]);
>  		src += stride_ * 2;
>  	}
>  }

Patch
diff mbox series

diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h
index 64b3e23f5..feee92f99 100644
--- a/include/libcamera/internal/software_isp/swstats_cpu.h
+++ b/include/libcamera/internal/software_isp/swstats_cpu.h
@@ -12,6 +12,7 @@ 
 #pragma once
 
 #include <stdint.h>
+#include <vector>
 
 #include <libcamera/base/signal.h>
 
@@ -51,13 +52,13 @@  public:
 
 	const Size &patternSize() { return patternSize_; }
 
-	int configure(const StreamConfiguration &inputCfg);
+	int configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount = 1);
 	void setWindow(const Rectangle &window);
 	void startFrame(uint32_t frame);
 	void finishFrame(uint32_t frame, uint32_t bufferId);
 	void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input);
 
-	void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
+	void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
 	{
 		if (frame % kStatPerNumFrames)
 			return;
@@ -66,10 +67,10 @@  public:
 		    y >= (window_.y + window_.height))
 			return;
 
-		(this->*stats0_)(src);
+		(this->*stats0_)(src, stats_[statsBufferIndex]);
 	}
 
-	void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
+	void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
 	{
 		if (frame % kStatPerNumFrames)
 			return;
@@ -78,25 +79,25 @@  public:
 		    y >= (window_.y + window_.height))
 			return;
 
-		(this->*stats2_)(src);
+		(this->*stats2_)(src, stats_[statsBufferIndex]);
 	}
 
 	Signal<uint32_t, uint32_t> statsReady;
 
 private:
-	using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]);
+	using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats &stats);
 	using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in);
 
 	int setupStandardBayerOrder(BayerFormat::Order order);
 	/* Bayer 8 bpp unpacked */
-	void statsBGGR8Line0(const uint8_t *src[]);
+	void statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats);
 	/* Bayer 10 bpp unpacked */
-	void statsBGGR10Line0(const uint8_t *src[]);
+	void statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats);
 	/* Bayer 12 bpp unpacked */
-	void statsBGGR12Line0(const uint8_t *src[]);
+	void statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats);
 	/* Bayer 10 bpp packed */
-	void statsBGGR10PLine0(const uint8_t *src[]);
-	void statsGBRG10PLine0(const uint8_t *src[]);
+	void statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats);
+	void statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats);
 
 	void processBayerFrame2(MappedFrameBuffer &in);
 
@@ -116,8 +117,8 @@  private:
 	unsigned int xShift_;
 	unsigned int stride_;
 
+	std::vector<SwIspStats> stats_;
 	SharedMemObject<SwIspStats> sharedStats_;
-	SwIspStats stats_;
 	Benchmark bench_;
 };
 
diff --git a/src/libcamera/software_isp/swstats_cpu.cpp b/src/libcamera/software_isp/swstats_cpu.cpp
index 1cedcfbc1..7c71aed96 100644
--- a/src/libcamera/software_isp/swstats_cpu.cpp
+++ b/src/libcamera/software_isp/swstats_cpu.cpp
@@ -74,11 +74,12 @@  namespace libcamera {
  */
 
 /**
- * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
+ * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
  * \brief Process line 0
  * \param[in] frame The frame number
  * \param[in] y The y coordinate.
  * \param[in] src The input data.
+ * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading.
  *
  * This function processes line 0 for input formats with
  * patternSize height == 1.
@@ -97,14 +98,18 @@  namespace libcamera {
  * to the line in plane 0, etc.
  *
  * For non Bayer single plane input data only a single src pointer is required.
+ *
+ * The statsBufferIndex value must be less than the statsBufferCount value passed
+ * to configure().
  */
 
 /**
- * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
+ * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
  * \brief Process line 2 and 3
  * \param[in] frame The frame number
  * \param[in] y The y coordinate.
  * \param[in] src The input data.
+ * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading.
  *
  * This function processes line 2 and 3 for input formats with
  * patternSize height == 4.
@@ -182,14 +187,14 @@  static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */
 	yVal = r * kRedYMul;               \
 	yVal += g * kGreenYMul;            \
 	yVal += b * kBlueYMul;             \
-	stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
+	stats.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
 
 #define SWSTATS_FINISH_LINE_STATS() \
-	stats_.sum_.r() += sumR;    \
-	stats_.sum_.g() += sumG;    \
-	stats_.sum_.b() += sumB;
+	stats.sum_.r() += sumR;    \
+	stats.sum_.g() += sumG;    \
+	stats.sum_.b() += sumB;
 
-void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint8_t *src0 = src[1] + window_.x;
 	const uint8_t *src1 = src[2] + window_.x;
@@ -214,7 +219,7 @@  void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
 	SWSTATS_FINISH_LINE_STATS()
 }
 
-void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
 	const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
@@ -240,7 +245,7 @@  void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
 	SWSTATS_FINISH_LINE_STATS()
 }
 
-void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
 	const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
@@ -266,7 +271,7 @@  void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
 	SWSTATS_FINISH_LINE_STATS()
 }
 
-void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint8_t *src0 = src[1] + window_.x * 5 / 4;
 	const uint8_t *src1 = src[2] + window_.x * 5 / 4;
@@ -292,7 +297,7 @@  void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
 	SWSTATS_FINISH_LINE_STATS()
 }
 
-void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[])
+void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats)
 {
 	const uint8_t *src0 = src[1] + window_.x * 5 / 4;
 	const uint8_t *src1 = src[2] + window_.x * 5 / 4;
@@ -332,8 +337,10 @@  void SwStatsCpu::startFrame(uint32_t frame)
 	if (window_.width == 0)
 		LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()";
 
-	stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 });
-	stats_.yHistogram.fill(0);
+	for (unsigned int i = 0; i < stats_.size(); i++) {
+		stats_[i].sum_ = RGB<uint64_t>({ 0, 0, 0 });
+		stats_[i].yHistogram.fill(0);
+	}
 }
 
 /**
@@ -345,8 +352,19 @@  void SwStatsCpu::startFrame(uint32_t frame)
  */
 void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId)
 {
-	stats_.valid = frame % kStatPerNumFrames == 0;
-	*sharedStats_ = stats_;
+	bool valid = frame % kStatPerNumFrames == 0;
+
+	if (valid) {
+		sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 });
+		sharedStats_->yHistogram.fill(0);
+		for (unsigned int i = 0; i < stats_.size(); i++) {
+			sharedStats_->sum_ += stats_[i].sum_;
+			for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++)
+				sharedStats_->yHistogram[j] += stats_[i].yHistogram[j];
+		}
+	}
+
+	sharedStats_->valid = valid;
 	statsReady.emit(frame, bufferId);
 }
 
@@ -389,12 +407,14 @@  int SwStatsCpu::setupStandardBayerOrder(BayerFormat::Order order)
 /**
  * \brief Configure the statistics object for the passed in input format
  * \param[in] inputCfg The input format
+ * \param[in] statsBufferCount number of internal stats buffers to use for multi-threading
  *
  * \return 0 on success, a negative errno value on failure
  */
-int SwStatsCpu::configure(const StreamConfiguration &inputCfg)
+int SwStatsCpu::configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount)
 {
 	stride_ = inputCfg.stride;
+	stats_.resize(statsBufferCount);
 
 	BayerFormat bayerFormat =
 		BayerFormat::fromPixelFormat(inputCfg.pixelFormat);
@@ -504,7 +524,7 @@  void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in)
 		/* linePointers[0] is not used by any stats0_ functions */
 		linePointers[1] = src;
 		linePointers[2] = src + stride_;
-		(this->*stats0_)(linePointers);
+		(this->*stats0_)(linePointers, stats_[0]);
 		src += stride_ * 2;
 	}
 }