@@ -12,6 +12,7 @@
#pragma once
#include <stdint.h>
+#include <vector>
#include <libcamera/base/signal.h>
@@ -51,13 +52,13 @@ public:
const Size &patternSize() { return patternSize_; }
- int configure(const StreamConfiguration &inputCfg);
+ int configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount = 1);
void setWindow(const Rectangle &window);
void startFrame(uint32_t frame);
void finishFrame(uint32_t frame, uint32_t bufferId);
void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input);
- void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
+ void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
{
if (frame % kStatPerNumFrames)
return;
@@ -66,10 +67,10 @@ public:
y >= (window_.y + window_.height))
return;
- (this->*stats0_)(src);
+ (this->*stats0_)(src, stats_[statsBufferIndex]);
}
- void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
+ void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
{
if (frame % kStatPerNumFrames)
return;
@@ -78,25 +79,25 @@ public:
y >= (window_.y + window_.height))
return;
- (this->*stats2_)(src);
+ (this->*stats2_)(src, stats_[statsBufferIndex]);
}
Signal<uint32_t, uint32_t> statsReady;
private:
- using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]);
+ using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats &stats);
using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in);
int setupStandardBayerOrder(BayerFormat::Order order);
/* Bayer 8 bpp unpacked */
- void statsBGGR8Line0(const uint8_t *src[]);
+ void statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats);
/* Bayer 10 bpp unpacked */
- void statsBGGR10Line0(const uint8_t *src[]);
+ void statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats);
/* Bayer 12 bpp unpacked */
- void statsBGGR12Line0(const uint8_t *src[]);
+ void statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats);
/* Bayer 10 bpp packed */
- void statsBGGR10PLine0(const uint8_t *src[]);
- void statsGBRG10PLine0(const uint8_t *src[]);
+ void statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats);
+ void statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats);
void processBayerFrame2(MappedFrameBuffer &in);
@@ -116,8 +117,8 @@ private:
unsigned int xShift_;
unsigned int stride_;
+ std::vector<SwIspStats> stats_;
SharedMemObject<SwIspStats> sharedStats_;
- SwIspStats stats_;
Benchmark bench_;
};
@@ -74,11 +74,12 @@ namespace libcamera {
*/
/**
- * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
+ * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
* \brief Process line 0
* \param[in] frame The frame number
* \param[in] y The y coordinate.
* \param[in] src The input data.
+ * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading.
*
* This function processes line 0 for input formats with
* patternSize height == 1.
@@ -97,14 +98,18 @@ namespace libcamera {
* to the line in plane 0, etc.
*
* For non Bayer single plane input data only a single src pointer is required.
+ *
+ * The statsBufferIndex value must be less than the statsBufferCount value passed
+ * to configure().
*/
/**
- * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
+ * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
* \brief Process line 2 and 3
* \param[in] frame The frame number
* \param[in] y The y coordinate.
* \param[in] src The input data.
+ * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading.
*
* This function processes line 2 and 3 for input formats with
* patternSize height == 4.
@@ -182,14 +187,14 @@ static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */
yVal = r * kRedYMul; \
yVal += g * kGreenYMul; \
yVal += b * kBlueYMul; \
- stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
+ stats.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
#define SWSTATS_FINISH_LINE_STATS() \
- stats_.sum_.r() += sumR; \
- stats_.sum_.g() += sumG; \
- stats_.sum_.b() += sumB;
+ stats.sum_.r() += sumR; \
+ stats.sum_.g() += sumG; \
+ stats.sum_.b() += sumB;
-void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats)
{
const uint8_t *src0 = src[1] + window_.x;
const uint8_t *src1 = src[2] + window_.x;
@@ -214,7 +219,7 @@ void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
SWSTATS_FINISH_LINE_STATS()
}
-void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats)
{
const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
@@ -240,7 +245,7 @@ void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
SWSTATS_FINISH_LINE_STATS()
}
-void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats)
{
const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
@@ -266,7 +271,7 @@ void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
SWSTATS_FINISH_LINE_STATS()
}
-void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
+void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats)
{
const uint8_t *src0 = src[1] + window_.x * 5 / 4;
const uint8_t *src1 = src[2] + window_.x * 5 / 4;
@@ -292,7 +297,7 @@ void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
SWSTATS_FINISH_LINE_STATS()
}
-void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[])
+void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats)
{
const uint8_t *src0 = src[1] + window_.x * 5 / 4;
const uint8_t *src1 = src[2] + window_.x * 5 / 4;
@@ -332,8 +337,10 @@ void SwStatsCpu::startFrame(uint32_t frame)
if (window_.width == 0)
LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()";
- stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 });
- stats_.yHistogram.fill(0);
+ for (unsigned int i = 0; i < stats_.size(); i++) {
+ stats_[i].sum_ = RGB<uint64_t>({ 0, 0, 0 });
+ stats_[i].yHistogram.fill(0);
+ }
}
/**
@@ -345,8 +352,19 @@ void SwStatsCpu::startFrame(uint32_t frame)
*/
void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId)
{
- stats_.valid = frame % kStatPerNumFrames == 0;
- *sharedStats_ = stats_;
+ bool valid = frame % kStatPerNumFrames == 0;
+
+ if (valid) {
+ sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 });
+ sharedStats_->yHistogram.fill(0);
+ for (unsigned int i = 0; i < stats_.size(); i++) {
+ sharedStats_->sum_ += stats_[i].sum_;
+ for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++)
+ sharedStats_->yHistogram[j] += stats_[i].yHistogram[j];
+ }
+ }
+
+ sharedStats_->valid = valid;
statsReady.emit(frame, bufferId);
}
@@ -389,12 +407,14 @@ int SwStatsCpu::setupStandardBayerOrder(BayerFormat::Order order)
/**
* \brief Configure the statistics object for the passed in input format
* \param[in] inputCfg The input format
+ * \param[in] statsBufferCount number of internal stats buffers to use for multi-threading
*
* \return 0 on success, a negative errno value on failure
*/
-int SwStatsCpu::configure(const StreamConfiguration &inputCfg)
+int SwStatsCpu::configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount)
{
stride_ = inputCfg.stride;
+ stats_.resize(statsBufferCount);
BayerFormat bayerFormat =
BayerFormat::fromPixelFormat(inputCfg.pixelFormat);
@@ -504,7 +524,7 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in)
/* linePointers[0] is not used by any stats0_ functions */
linePointers[1] = src;
linePointers[2] = src + stride_;
- (this->*stats0_)(linePointers);
+ (this->*stats0_)(linePointers, stats_[0]);
src += stride_ * 2;
}
}
Make the storage used to accumulate the RGB sums and the Y histogram value a vector of SwIspStats objects instead of a single object so that when using multi-threading every thread can use its own storage to collect intermediate stats to avoid cache-line bouncing. Benchmarking with the GPU-ISP which does separate swstats benchmarking, on the Arduino Uno-Q which has a weak CPU which is good for performance testing, shows 20ms to generate stats for a 3272x2464 frame both before and after this change. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- Changes in v2: - Move the allocation of the vector of SwIspStats objects to inside the SwStatsCpu class, controlled by a configure() arguments instead of making the caller allocate the objects --- .../internal/software_isp/swstats_cpu.h | 25 ++++----- src/libcamera/software_isp/swstats_cpu.cpp | 54 +++++++++++++------ 2 files changed, 50 insertions(+), 29 deletions(-)