| Message ID | 20260216190204.106922-2-johannes.goede@oss.qualcomm.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series |
|
| Related | show |
Hi Hans, thank you for the patches. Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > Move the storage used to accumulate the RGB sums and the Y histogram > out of the SwStatsCpu class and into the callers. > > The idea is to allow a single SwStatsCpu object to be shared between > multiple threads each processing part of the image, with finishFrame() > accumulating the per thread data into the final stats for the entire > frame. > > This is a preparation patch for making DebayerCpu support multi-threading > and this could also be used to make processFrame() multi-threaded. > > Benchmarking with the GPU-ISP which does separate swstats benchmarking, > on the Uno-Q which has a weak CPU which is good for performance testing, > shows 20-21ms to generate stats for a 3272x2464 frame both before and > after this change. > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > .../internal/software_isp/swstats_cpu.h | 29 ++++----- > src/libcamera/software_isp/debayer_cpu.cpp | 12 ++-- > src/libcamera/software_isp/debayer_cpu.h | 1 + > src/libcamera/software_isp/swstats_cpu.cpp | 65 +++++++++++++------ > 4 files changed, 65 insertions(+), 42 deletions(-) > > diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h > index 64b3e23f..a157afe8 100644 > --- a/include/libcamera/internal/software_isp/swstats_cpu.h > +++ b/include/libcamera/internal/software_isp/swstats_cpu.h > @@ -53,11 +53,11 @@ public: > > int configure(const StreamConfiguration &inputCfg); > void setWindow(const Rectangle &window); > - void startFrame(uint32_t frame); > - void finishFrame(uint32_t frame, uint32_t bufferId); > + void startFrame(uint32_t frame, struct SwIspStats statsBuffer[], unsigned int statsBufferCount); `struct' keyword not needed (here and elsewhere). Is there any reason not to use std::vector (+ to omit statsBufferCount argument then)? > + void finishFrame(uint32_t frame, uint32_t bufferId, struct SwIspStats statsBuffer[], unsigned int statsBufferCount); > void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input); > > - void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[]) > + void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], SwIspStats *stats) Would it be perhaps better to use references rather than pointers for `stats'? > { > if (frame % kStatPerNumFrames) > return; > @@ -66,10 +66,10 @@ public: > y >= (window_.y + window_.height)) > return; > > - (this->*stats0_)(src); > + (this->*stats0_)(src, stats); > } > > - void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[]) > + void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], SwIspStats *stats) > { > if (frame % kStatPerNumFrames) > return; > @@ -78,27 +78,27 @@ public: > y >= (window_.y + window_.height)) > return; > > - (this->*stats2_)(src); > + (this->*stats2_)(src, stats); > } > > Signal<uint32_t, uint32_t> statsReady; > > private: > - using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]); > - using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in); > + using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats *stats); > + using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in, SwIspStats *stats); > > int setupStandardBayerOrder(BayerFormat::Order order); > /* Bayer 8 bpp unpacked */ > - void statsBGGR8Line0(const uint8_t *src[]); > + void statsBGGR8Line0(const uint8_t *src[], SwIspStats *stats); > /* Bayer 10 bpp unpacked */ > - void statsBGGR10Line0(const uint8_t *src[]); > + void statsBGGR10Line0(const uint8_t *src[], SwIspStats *stats); > /* Bayer 12 bpp unpacked */ > - void statsBGGR12Line0(const uint8_t *src[]); > + void statsBGGR12Line0(const uint8_t *src[], SwIspStats *stats); > /* Bayer 10 bpp packed */ > - void statsBGGR10PLine0(const uint8_t *src[]); > - void statsGBRG10PLine0(const uint8_t *src[]); > + void statsBGGR10PLine0(const uint8_t *src[], SwIspStats *stats); > + void statsGBRG10PLine0(const uint8_t *src[], SwIspStats *stats); > > - void processBayerFrame2(MappedFrameBuffer &in); > + void processBayerFrame2(MappedFrameBuffer &in, SwIspStats *stats); > > processFrameFn processFrame_; > > @@ -117,7 +117,6 @@ private: > unsigned int stride_; > > SharedMemObject<SwIspStats> sharedStats_; > - SwIspStats stats_; > Benchmark bench_; > }; > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > index d0988357..97c1959a 100644 > --- a/src/libcamera/software_isp/debayer_cpu.cpp > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > @@ -673,7 +673,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > for (unsigned int y = 0; y < yEnd; y += 2) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - stats_->processLine0(frame, y, linePointers); > + stats_->processLine0(frame, y, linePointers, &statsBuffer_); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -688,7 +688,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > if (window_.y == 0) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - stats_->processLine0(frame, yEnd, linePointers); > + stats_->processLine0(frame, yEnd, linePointers, &statsBuffer_); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -724,7 +724,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > for (unsigned int y = 0; y < window_.height; y += 4) { > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - stats_->processLine0(frame, y, linePointers); > + stats_->processLine0(frame, y, linePointers, &statsBuffer_); > (this->*debayer0_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -737,7 +737,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > > shiftLinePointers(linePointers, src); > memcpyNextLine(linePointers); > - stats_->processLine2(frame, y, linePointers); > + stats_->processLine2(frame, y, linePointers, &statsBuffer_); > (this->*debayer2_)(dst, linePointers); > src += inputConfig_.stride; > dst += outputConfig_.stride; > @@ -866,7 +866,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > return; > } > > - stats_->startFrame(frame); > + stats_->startFrame(frame, &statsBuffer_, 1); > > if (inputConfig_.patternSize.height == 2) > process2(frame, in.planes()[0].data(), out.planes()[0].data()); > @@ -885,7 +885,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > * > * \todo Pass real bufferId once stats buffer passing is changed. > */ > - stats_->finishFrame(frame, 0); > + stats_->finishFrame(frame, 0, &statsBuffer_, 1); > outputBufferReady.emit(output); > inputBufferReady.emit(input); > } > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > index 7a651746..8abf5168 100644 > --- a/src/libcamera/software_isp/debayer_cpu.h > +++ b/src/libcamera/software_isp/debayer_cpu.h > @@ -135,6 +135,7 @@ private: > LookupTable gammaLut_; > bool ccmEnabled_; > DebayerParams params_; > + SwIspStats statsBuffer_; > > debayerFn debayer0_; > debayerFn debayer1_; > diff --git a/src/libcamera/software_isp/swstats_cpu.cpp b/src/libcamera/software_isp/swstats_cpu.cpp > index 5c3011a7..23842f6c 100644 > --- a/src/libcamera/software_isp/swstats_cpu.cpp > +++ b/src/libcamera/software_isp/swstats_cpu.cpp > @@ -182,14 +182,14 @@ static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */ > yVal = r * kRedYMul; \ > yVal += g * kGreenYMul; \ > yVal += b * kBlueYMul; \ > - stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; > + stats->yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; > > #define SWSTATS_FINISH_LINE_STATS() \ > - stats_.sum_.r() += sumR; \ > - stats_.sum_.g() += sumG; \ > - stats_.sum_.b() += sumB; > + stats->sum_.r() += sumR; \ > + stats->sum_.g() += sumG; \ > + stats->sum_.b() += sumB; > > -void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) > +void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats *stats) > { > const uint8_t *src0 = src[1] + window_.x; > const uint8_t *src1 = src[2] + window_.x; > @@ -214,7 +214,7 @@ void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) > SWSTATS_FINISH_LINE_STATS() > } > > -void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) > +void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats *stats) > { > const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; > const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; > @@ -240,7 +240,7 @@ void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) > SWSTATS_FINISH_LINE_STATS() > } > > -void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) > +void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats *stats) > { > const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; > const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; > @@ -266,7 +266,7 @@ void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) > SWSTATS_FINISH_LINE_STATS() > } > > -void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) > +void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats *stats) > { > const uint8_t *src0 = src[1] + window_.x * 5 / 4; > const uint8_t *src1 = src[2] + window_.x * 5 / 4; > @@ -292,7 +292,7 @@ void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) > SWSTATS_FINISH_LINE_STATS() > } > > -void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[]) > +void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats *stats) > { > const uint8_t *src0 = src[1] + window_.x * 5 / 4; > const uint8_t *src1 = src[2] + window_.x * 5 / 4; > @@ -321,10 +321,13 @@ void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[]) > /** > * \brief Reset state to start statistics gathering for a new frame > * \param[in] frame The frame number > + * \param[in] statsBuffer Array of buffers storing stats > + * \param[in] statsBufferCount number of buffers in the statsBuffer array > * > * This may only be called after a successful setWindow() call. > */ > -void SwStatsCpu::startFrame(uint32_t frame) > +void SwStatsCpu::startFrame(uint32_t frame, > + struct SwIspStats statsBuffer[], unsigned int statsBufferCount) > { > if (frame % kStatPerNumFrames) > return; > @@ -332,21 +335,39 @@ void SwStatsCpu::startFrame(uint32_t frame) > if (window_.width == 0) > LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()"; > > - stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 }); > - stats_.yHistogram.fill(0); > + for (unsigned int i = 0; i < statsBufferCount; i++) { > + statsBuffer[i].sum_ = RGB<uint64_t>({ 0, 0, 0 }); > + statsBuffer[i].yHistogram.fill(0); > + } > } > > /** > * \brief Finish statistics calculation for the current frame > * \param[in] frame The frame number > * \param[in] bufferId ID of the statistics buffer > + * \param[in] statsBuffer Array of buffers storing stats > + * \param[in] statsBufferCount number of buffers in the statsBuffer array > * > * This may only be called after a successful setWindow() call. > */ > -void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId) > +void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId, > + struct SwIspStats statsBuffer[], unsigned int statsBufferCount) > { > - stats_.valid = frame % kStatPerNumFrames == 0; > - *sharedStats_ = stats_; > + if (frame % kStatPerNumFrames) { > + sharedStats_->valid = false; > + statsReady.emit(frame, bufferId); > + return; Using such a shortcut rather than if (stats_.valid) { ... } feels error-prone but up to the maintainers whether they like it or not. > + } > + > + sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 }); > + sharedStats_->yHistogram.fill(0); > + for (unsigned int i = 0; i < statsBufferCount; i++) { > + sharedStats_->sum_ += statsBuffer[i].sum_; > + for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++) > + sharedStats_->yHistogram[j] += statsBuffer[i].yHistogram[j]; > + } > + > + sharedStats_->valid = true; > statsReady.emit(frame, bufferId); > } > > @@ -487,7 +508,7 @@ void SwStatsCpu::setWindow(const Rectangle &window) > window_.height &= ~(patternSize_.height - 1); > } > > -void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) > +void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in, SwIspStats *stats) > { > const uint8_t *src = in.planes()[0].data(); > const uint8_t *linePointers[3]; > @@ -504,7 +525,7 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) > /* linePointers[0] is not used by any stats0_ functions */ > linePointers[1] = src; > linePointers[2] = src + stride_; > - (this->*stats0_)(linePointers); > + (this->*stats0_)(linePointers, stats); > src += stride_ * 2; > } > } > @@ -520,12 +541,14 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) > void SwStatsCpu::processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input) > { > if (frame % kStatPerNumFrames) { > - finishFrame(frame, bufferId); > + finishFrame(frame, bufferId, NULL, 0); nullptr (or an empty std::vector) > return; > } > > + SwIspStats stats; > + > bench_.startFrame(); > - startFrame(frame); > + startFrame(frame, &stats, 1); Would it be useful to add a comment somewhere that this method is not interested in multihreading? Maybe obvious but still... These two ways of computing stats (CPU, multi-threaded, startFrame+finishFrame; GPU, single-go, processFrame) are somewhat confusing (not a fault of this patch). > MappedFrameBuffer in(input, MappedFrameBuffer::MapFlag::Read); > if (!in.isValid()) { > @@ -533,8 +556,8 @@ void SwStatsCpu::processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *in > return; > } > > - (this->*processFrame_)(in); > - finishFrame(frame, bufferId); > + (this->*processFrame_)(in, &stats); > + finishFrame(frame, bufferId, &stats, 1); > bench_.finishFrame(); > }
On Tue, Feb 17, 2026 at 05:21:20PM +0100, Milan Zamazal wrote: > Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > > > Move the storage used to accumulate the RGB sums and the Y histogram > > out of the SwStatsCpu class and into the callers. > > > > The idea is to allow a single SwStatsCpu object to be shared between > > multiple threads each processing part of the image, with finishFrame() > > accumulating the per thread data into the final stats for the entire > > frame. I'm not sure why it has to be done this way. In patch 5/5, where you introduce multi-threading, you still create a single instance of the SwStatsCpu class. You could allocate the array of stats data internally in the class. Why did you pick this design ? > > This is a preparation patch for making DebayerCpu support multi-threading > > and this could also be used to make processFrame() multi-threaded. > > > > Benchmarking with the GPU-ISP which does separate swstats benchmarking, > > on the Uno-Q which has a weak CPU which is good for performance testing, > > shows 20-21ms to generate stats for a 3272x2464 frame both before and > > after this change. > > > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > > --- > > .../internal/software_isp/swstats_cpu.h | 29 ++++----- > > src/libcamera/software_isp/debayer_cpu.cpp | 12 ++-- > > src/libcamera/software_isp/debayer_cpu.h | 1 + > > src/libcamera/software_isp/swstats_cpu.cpp | 65 +++++++++++++------ > > 4 files changed, 65 insertions(+), 42 deletions(-) > > > > diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h > > index 64b3e23f..a157afe8 100644 > > --- a/include/libcamera/internal/software_isp/swstats_cpu.h > > +++ b/include/libcamera/internal/software_isp/swstats_cpu.h > > @@ -53,11 +53,11 @@ public: > > > > int configure(const StreamConfiguration &inputCfg); > > void setWindow(const Rectangle &window); > > - void startFrame(uint32_t frame); > > - void finishFrame(uint32_t frame, uint32_t bufferId); > > + void startFrame(uint32_t frame, struct SwIspStats statsBuffer[], unsigned int statsBufferCount); > > `struct' keyword not needed (here and elsewhere). > > Is there any reason not to use std::vector (+ to omit statsBufferCount > argument then)? Or the Span class. Pointer + size is a no-go anywhere in libcamera. > > + void finishFrame(uint32_t frame, uint32_t bufferId, struct SwIspStats statsBuffer[], unsigned int statsBufferCount); > > void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input); > > > > - void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[]) > > + void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], SwIspStats *stats) > > Would it be perhaps better to use references rather than pointers for `stats'? We typically use const references for input parameters that can't be null, and pointers for input parameters that can be null and for output parameters (I think that's even documented in coding-style.rst). > > { > > if (frame % kStatPerNumFrames) > > return; > > @@ -66,10 +66,10 @@ public: > > y >= (window_.y + window_.height)) > > return; > > > > - (this->*stats0_)(src); > > + (this->*stats0_)(src, stats); > > } > > > > - void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[]) > > + void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], SwIspStats *stats) > > { > > if (frame % kStatPerNumFrames) > > return; > > @@ -78,27 +78,27 @@ public: > > y >= (window_.y + window_.height)) > > return; > > > > - (this->*stats2_)(src); > > + (this->*stats2_)(src, stats); > > } > > > > Signal<uint32_t, uint32_t> statsReady; > > > > private: > > - using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]); > > - using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in); > > + using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats *stats); > > + using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in, SwIspStats *stats); > > > > int setupStandardBayerOrder(BayerFormat::Order order); > > /* Bayer 8 bpp unpacked */ > > - void statsBGGR8Line0(const uint8_t *src[]); > > + void statsBGGR8Line0(const uint8_t *src[], SwIspStats *stats); > > /* Bayer 10 bpp unpacked */ > > - void statsBGGR10Line0(const uint8_t *src[]); > > + void statsBGGR10Line0(const uint8_t *src[], SwIspStats *stats); > > /* Bayer 12 bpp unpacked */ > > - void statsBGGR12Line0(const uint8_t *src[]); > > + void statsBGGR12Line0(const uint8_t *src[], SwIspStats *stats); > > /* Bayer 10 bpp packed */ > > - void statsBGGR10PLine0(const uint8_t *src[]); > > - void statsGBRG10PLine0(const uint8_t *src[]); > > + void statsBGGR10PLine0(const uint8_t *src[], SwIspStats *stats); > > + void statsGBRG10PLine0(const uint8_t *src[], SwIspStats *stats); > > > > - void processBayerFrame2(MappedFrameBuffer &in); > > + void processBayerFrame2(MappedFrameBuffer &in, SwIspStats *stats); > > > > processFrameFn processFrame_; > > > > @@ -117,7 +117,6 @@ private: > > unsigned int stride_; > > > > SharedMemObject<SwIspStats> sharedStats_; > > - SwIspStats stats_; > > Benchmark bench_; > > }; > > > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > > index d0988357..97c1959a 100644 > > --- a/src/libcamera/software_isp/debayer_cpu.cpp > > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > > @@ -673,7 +673,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > > for (unsigned int y = 0; y < yEnd; y += 2) { > > shiftLinePointers(linePointers, src); > > memcpyNextLine(linePointers); > > - stats_->processLine0(frame, y, linePointers); > > + stats_->processLine0(frame, y, linePointers, &statsBuffer_); > > (this->*debayer0_)(dst, linePointers); > > src += inputConfig_.stride; > > dst += outputConfig_.stride; > > @@ -688,7 +688,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > > if (window_.y == 0) { > > shiftLinePointers(linePointers, src); > > memcpyNextLine(linePointers); > > - stats_->processLine0(frame, yEnd, linePointers); > > + stats_->processLine0(frame, yEnd, linePointers, &statsBuffer_); > > (this->*debayer0_)(dst, linePointers); > > src += inputConfig_.stride; > > dst += outputConfig_.stride; > > @@ -724,7 +724,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > > for (unsigned int y = 0; y < window_.height; y += 4) { > > shiftLinePointers(linePointers, src); > > memcpyNextLine(linePointers); > > - stats_->processLine0(frame, y, linePointers); > > + stats_->processLine0(frame, y, linePointers, &statsBuffer_); > > (this->*debayer0_)(dst, linePointers); > > src += inputConfig_.stride; > > dst += outputConfig_.stride; > > @@ -737,7 +737,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) > > > > shiftLinePointers(linePointers, src); > > memcpyNextLine(linePointers); > > - stats_->processLine2(frame, y, linePointers); > > + stats_->processLine2(frame, y, linePointers, &statsBuffer_); > > (this->*debayer2_)(dst, linePointers); > > src += inputConfig_.stride; > > dst += outputConfig_.stride; > > @@ -866,7 +866,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > > return; > > } > > > > - stats_->startFrame(frame); > > + stats_->startFrame(frame, &statsBuffer_, 1); > > > > if (inputConfig_.patternSize.height == 2) > > process2(frame, in.planes()[0].data(), out.planes()[0].data()); > > @@ -885,7 +885,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > > * > > * \todo Pass real bufferId once stats buffer passing is changed. > > */ > > - stats_->finishFrame(frame, 0); > > + stats_->finishFrame(frame, 0, &statsBuffer_, 1); > > outputBufferReady.emit(output); > > inputBufferReady.emit(input); > > } > > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > > index 7a651746..8abf5168 100644 > > --- a/src/libcamera/software_isp/debayer_cpu.h > > +++ b/src/libcamera/software_isp/debayer_cpu.h > > @@ -135,6 +135,7 @@ private: > > LookupTable gammaLut_; > > bool ccmEnabled_; > > DebayerParams params_; > > + SwIspStats statsBuffer_; > > > > debayerFn debayer0_; > > debayerFn debayer1_; > > diff --git a/src/libcamera/software_isp/swstats_cpu.cpp b/src/libcamera/software_isp/swstats_cpu.cpp > > index 5c3011a7..23842f6c 100644 > > --- a/src/libcamera/software_isp/swstats_cpu.cpp > > +++ b/src/libcamera/software_isp/swstats_cpu.cpp > > @@ -182,14 +182,14 @@ static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */ > > yVal = r * kRedYMul; \ > > yVal += g * kGreenYMul; \ > > yVal += b * kBlueYMul; \ > > - stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; > > + stats->yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; > > > > #define SWSTATS_FINISH_LINE_STATS() \ > > - stats_.sum_.r() += sumR; \ > > - stats_.sum_.g() += sumG; \ > > - stats_.sum_.b() += sumB; > > + stats->sum_.r() += sumR; \ > > + stats->sum_.g() += sumG; \ > > + stats->sum_.b() += sumB; > > > > -void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) > > +void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats *stats) > > { > > const uint8_t *src0 = src[1] + window_.x; > > const uint8_t *src1 = src[2] + window_.x; > > @@ -214,7 +214,7 @@ void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) > > SWSTATS_FINISH_LINE_STATS() > > } > > > > -void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) > > +void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats *stats) > > { > > const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; > > const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; > > @@ -240,7 +240,7 @@ void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) > > SWSTATS_FINISH_LINE_STATS() > > } > > > > -void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) > > +void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats *stats) > > { > > const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; > > const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; > > @@ -266,7 +266,7 @@ void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) > > SWSTATS_FINISH_LINE_STATS() > > } > > > > -void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) > > +void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats *stats) > > { > > const uint8_t *src0 = src[1] + window_.x * 5 / 4; > > const uint8_t *src1 = src[2] + window_.x * 5 / 4; > > @@ -292,7 +292,7 @@ void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) > > SWSTATS_FINISH_LINE_STATS() > > } > > > > -void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[]) > > +void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats *stats) > > { > > const uint8_t *src0 = src[1] + window_.x * 5 / 4; > > const uint8_t *src1 = src[2] + window_.x * 5 / 4; > > @@ -321,10 +321,13 @@ void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[]) > > /** > > * \brief Reset state to start statistics gathering for a new frame > > * \param[in] frame The frame number > > + * \param[in] statsBuffer Array of buffers storing stats > > + * \param[in] statsBufferCount number of buffers in the statsBuffer array > > * > > * This may only be called after a successful setWindow() call. > > */ > > -void SwStatsCpu::startFrame(uint32_t frame) > > +void SwStatsCpu::startFrame(uint32_t frame, > > + struct SwIspStats statsBuffer[], unsigned int statsBufferCount) > > { > > if (frame % kStatPerNumFrames) > > return; > > @@ -332,21 +335,39 @@ void SwStatsCpu::startFrame(uint32_t frame) > > if (window_.width == 0) > > LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()"; > > > > - stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 }); > > - stats_.yHistogram.fill(0); > > + for (unsigned int i = 0; i < statsBufferCount; i++) { > > + statsBuffer[i].sum_ = RGB<uint64_t>({ 0, 0, 0 }); > > + statsBuffer[i].yHistogram.fill(0); > > + } > > } > > > > /** > > * \brief Finish statistics calculation for the current frame > > * \param[in] frame The frame number > > * \param[in] bufferId ID of the statistics buffer > > + * \param[in] statsBuffer Array of buffers storing stats > > + * \param[in] statsBufferCount number of buffers in the statsBuffer array > > * > > * This may only be called after a successful setWindow() call. > > */ > > -void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId) > > +void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId, > > + struct SwIspStats statsBuffer[], unsigned int statsBufferCount) > > { > > - stats_.valid = frame % kStatPerNumFrames == 0; > > - *sharedStats_ = stats_; > > + if (frame % kStatPerNumFrames) { > > + sharedStats_->valid = false; > > + statsReady.emit(frame, bufferId); > > + return; > > Using such a shortcut rather than > > if (stats_.valid) { > ... > } > > feels error-prone but up to the maintainers whether they like it or not. > > > + } > > + > > + sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 }); > > + sharedStats_->yHistogram.fill(0); > > + for (unsigned int i = 0; i < statsBufferCount; i++) { > > + sharedStats_->sum_ += statsBuffer[i].sum_; > > + for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++) > > + sharedStats_->yHistogram[j] += statsBuffer[i].yHistogram[j]; > > + } > > + > > + sharedStats_->valid = true; > > statsReady.emit(frame, bufferId); > > } > > > > @@ -487,7 +508,7 @@ void SwStatsCpu::setWindow(const Rectangle &window) > > window_.height &= ~(patternSize_.height - 1); > > } > > > > -void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) > > +void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in, SwIspStats *stats) > > { > > const uint8_t *src = in.planes()[0].data(); > > const uint8_t *linePointers[3]; > > @@ -504,7 +525,7 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) > > /* linePointers[0] is not used by any stats0_ functions */ > > linePointers[1] = src; > > linePointers[2] = src + stride_; > > - (this->*stats0_)(linePointers); > > + (this->*stats0_)(linePointers, stats); > > src += stride_ * 2; > > } > > } > > @@ -520,12 +541,14 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) > > void SwStatsCpu::processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input) > > { > > if (frame % kStatPerNumFrames) { > > - finishFrame(frame, bufferId); > > + finishFrame(frame, bufferId, NULL, 0); > > nullptr (or an empty std::vector) > > > return; > > } > > > > + SwIspStats stats; > > + > > bench_.startFrame(); > > - startFrame(frame); > > + startFrame(frame, &stats, 1); > > Would it be useful to add a comment somewhere that this method is not > interested in multihreading? Maybe obvious but still... These two ways > of computing stats (CPU, multi-threaded, startFrame+finishFrame; GPU, > single-go, processFrame) are somewhat confusing (not a fault of this > patch). > > > MappedFrameBuffer in(input, MappedFrameBuffer::MapFlag::Read); > > if (!in.isValid()) { > > @@ -533,8 +556,8 @@ void SwStatsCpu::processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *in > > return; > > } > > > > - (this->*processFrame_)(in); > > - finishFrame(frame, bufferId); > > + (this->*processFrame_)(in, &stats); > > + finishFrame(frame, bufferId, &stats, 1); > > bench_.finishFrame(); > > }
Hi, On 19-Feb-26 15:11, Laurent Pinchart wrote: > On Tue, Feb 17, 2026 at 05:21:20PM +0100, Milan Zamazal wrote: >> Hans de Goede <johannes.goede@oss.qualcomm.com> writes: >> >>> Move the storage used to accumulate the RGB sums and the Y histogram >>> out of the SwStatsCpu class and into the callers. >>> >>> The idea is to allow a single SwStatsCpu object to be shared between >>> multiple threads each processing part of the image, with finishFrame() >>> accumulating the per thread data into the final stats for the entire >>> frame. > > I'm not sure why it has to be done this way. In patch 5/5, where you > introduce multi-threading, you still create a single instance of the > SwStatsCpu class. You could allocate the array of stats data internally > in the class. Why did you pick this design ? Yes a single SwStatsCpu object, but an array of SwIspStats structs, one for each thread to store the statistics for its part of the entire input buffer before adding them. The DebayerCpu code needs access to the array of SwIspStats structs since each thread passes a pointer to its thread specific SwIspStats struct. This is done to avoid cache-line bouncing which would happen if multiple-threads were incrementing the values in a shared struct. I can move the allocation of the SwIspStats structs to the SwStatsCpu class, but then either the processLine0() method needs to take an index to know which of the N SwIspStats structs to use, or it needs to expose the array of SwIspStats structs which it allocated. I think that if we move the SwIspStats struct array allocation into SwStatsCpu, then passing an index of which buffer to use to processLine0() makes the most sense. Or we can just leave this as is. I don't have any preference either way, please let me know how you want to proceed. Regards, Hans
On Thu, Feb 19, 2026 at 05:37:02PM +0100, Hans de Goede wrote: > Hi, > > On 19-Feb-26 15:11, Laurent Pinchart wrote: > > On Tue, Feb 17, 2026 at 05:21:20PM +0100, Milan Zamazal wrote: > >> Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > >> > >>> Move the storage used to accumulate the RGB sums and the Y histogram > >>> out of the SwStatsCpu class and into the callers. > >>> > >>> The idea is to allow a single SwStatsCpu object to be shared between > >>> multiple threads each processing part of the image, with finishFrame() > >>> accumulating the per thread data into the final stats for the entire > >>> frame. > > > > I'm not sure why it has to be done this way. In patch 5/5, where you > > introduce multi-threading, you still create a single instance of the > > SwStatsCpu class. You could allocate the array of stats data internally > > in the class. Why did you pick this design ? > > Yes a single SwStatsCpu object, but an array of SwIspStats structs, > one for each thread to store the statistics for its part of > the entire input buffer before adding them. > > The DebayerCpu code needs access to the array of SwIspStats structs > since each thread passes a pointer to its thread specific SwIspStats > struct. This is done to avoid cache-line bouncing which would happen > if multiple-threads were incrementing the values in a shared struct. > > I can move the allocation of the SwIspStats structs to the SwStatsCpu > class, but then either the processLine0() method needs to take > an index to know which of the N SwIspStats structs to use, or it > needs to expose the array of SwIspStats structs which it allocated. > > I think that if we move the SwIspStats struct array allocation > into SwStatsCpu, then passing an index of which buffer to use > to processLine0() makes the most sense. > > Or we can just leave this as is. > > I don't have any preference either way, please let me know how you > want to proceed. Let's focus on the discussion in 5/5 first, as I think it will influence how you will end up storing and passing data around.
Hi, On 17-Feb-26 5:21 PM, Milan Zamazal wrote: > Hi Hans, > > thank you for the patches. > > Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > >> Move the storage used to accumulate the RGB sums and the Y histogram >> out of the SwStatsCpu class and into the callers. >> >> The idea is to allow a single SwStatsCpu object to be shared between >> multiple threads each processing part of the image, with finishFrame() >> accumulating the per thread data into the final stats for the entire >> frame. >> >> This is a preparation patch for making DebayerCpu support multi-threading >> and this could also be used to make processFrame() multi-threaded. >> >> Benchmarking with the GPU-ISP which does separate swstats benchmarking, >> on the Uno-Q which has a weak CPU which is good for performance testing, >> shows 20-21ms to generate stats for a 3272x2464 frame both before and >> after this change. >> >> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> >> --- >> .../internal/software_isp/swstats_cpu.h | 29 ++++----- >> src/libcamera/software_isp/debayer_cpu.cpp | 12 ++-- >> src/libcamera/software_isp/debayer_cpu.h | 1 + >> src/libcamera/software_isp/swstats_cpu.cpp | 65 +++++++++++++------ >> 4 files changed, 65 insertions(+), 42 deletions(-) >> >> diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h >> index 64b3e23f..a157afe8 100644 >> --- a/include/libcamera/internal/software_isp/swstats_cpu.h >> +++ b/include/libcamera/internal/software_isp/swstats_cpu.h >> @@ -53,11 +53,11 @@ public: >> >> int configure(const StreamConfiguration &inputCfg); >> void setWindow(const Rectangle &window); >> - void startFrame(uint32_t frame); >> - void finishFrame(uint32_t frame, uint32_t bufferId); >> + void startFrame(uint32_t frame, struct SwIspStats statsBuffer[], unsigned int statsBufferCount); > > `struct' keyword not needed (here and elsewhere). Ack. > Is there any reason not to use std::vector (+ to omit statsBufferCount > argument then)? Ack, v2 will use std::vector. >> + void finishFrame(uint32_t frame, uint32_t bufferId, struct SwIspStats statsBuffer[], unsigned int statsBufferCount); >> void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input); >> >> - void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[]) >> + void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], SwIspStats *stats) > > Would it be perhaps better to use references rather than pointers for `stats'? Ack, v2 will use references. ... >> +void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId, >> + struct SwIspStats statsBuffer[], unsigned int statsBufferCount) >> { >> - stats_.valid = frame % kStatPerNumFrames == 0; >> - *sharedStats_ = stats_; >> + if (frame % kStatPerNumFrames) { >> + sharedStats_->valid = false; >> + statsReady.emit(frame, bufferId); >> + return; > > Using such a shortcut rather than > > if (stats_.valid) { > ... > } > > feels error-prone but up to the maintainers whether they like it or not. Ack, fixed for v2. Regards, Hans
diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h index 64b3e23f..a157afe8 100644 --- a/include/libcamera/internal/software_isp/swstats_cpu.h +++ b/include/libcamera/internal/software_isp/swstats_cpu.h @@ -53,11 +53,11 @@ public: int configure(const StreamConfiguration &inputCfg); void setWindow(const Rectangle &window); - void startFrame(uint32_t frame); - void finishFrame(uint32_t frame, uint32_t bufferId); + void startFrame(uint32_t frame, struct SwIspStats statsBuffer[], unsigned int statsBufferCount); + void finishFrame(uint32_t frame, uint32_t bufferId, struct SwIspStats statsBuffer[], unsigned int statsBufferCount); void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input); - void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[]) + void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], SwIspStats *stats) { if (frame % kStatPerNumFrames) return; @@ -66,10 +66,10 @@ public: y >= (window_.y + window_.height)) return; - (this->*stats0_)(src); + (this->*stats0_)(src, stats); } - void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[]) + void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], SwIspStats *stats) { if (frame % kStatPerNumFrames) return; @@ -78,27 +78,27 @@ public: y >= (window_.y + window_.height)) return; - (this->*stats2_)(src); + (this->*stats2_)(src, stats); } Signal<uint32_t, uint32_t> statsReady; private: - using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]); - using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in); + using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats *stats); + using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in, SwIspStats *stats); int setupStandardBayerOrder(BayerFormat::Order order); /* Bayer 8 bpp unpacked */ - void statsBGGR8Line0(const uint8_t *src[]); + void statsBGGR8Line0(const uint8_t *src[], SwIspStats *stats); /* Bayer 10 bpp unpacked */ - void statsBGGR10Line0(const uint8_t *src[]); + void statsBGGR10Line0(const uint8_t *src[], SwIspStats *stats); /* Bayer 12 bpp unpacked */ - void statsBGGR12Line0(const uint8_t *src[]); + void statsBGGR12Line0(const uint8_t *src[], SwIspStats *stats); /* Bayer 10 bpp packed */ - void statsBGGR10PLine0(const uint8_t *src[]); - void statsGBRG10PLine0(const uint8_t *src[]); + void statsBGGR10PLine0(const uint8_t *src[], SwIspStats *stats); + void statsGBRG10PLine0(const uint8_t *src[], SwIspStats *stats); - void processBayerFrame2(MappedFrameBuffer &in); + void processBayerFrame2(MappedFrameBuffer &in, SwIspStats *stats); processFrameFn processFrame_; @@ -117,7 +117,6 @@ private: unsigned int stride_; SharedMemObject<SwIspStats> sharedStats_; - SwIspStats stats_; Benchmark bench_; }; diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp index d0988357..97c1959a 100644 --- a/src/libcamera/software_isp/debayer_cpu.cpp +++ b/src/libcamera/software_isp/debayer_cpu.cpp @@ -673,7 +673,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) for (unsigned int y = 0; y < yEnd; y += 2) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, y, linePointers); + stats_->processLine0(frame, y, linePointers, &statsBuffer_); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -688,7 +688,7 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) if (window_.y == 0) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, yEnd, linePointers); + stats_->processLine0(frame, yEnd, linePointers, &statsBuffer_); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -724,7 +724,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) for (unsigned int y = 0; y < window_.height; y += 4) { shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine0(frame, y, linePointers); + stats_->processLine0(frame, y, linePointers, &statsBuffer_); (this->*debayer0_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -737,7 +737,7 @@ void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst) shiftLinePointers(linePointers, src); memcpyNextLine(linePointers); - stats_->processLine2(frame, y, linePointers); + stats_->processLine2(frame, y, linePointers, &statsBuffer_); (this->*debayer2_)(dst, linePointers); src += inputConfig_.stride; dst += outputConfig_.stride; @@ -866,7 +866,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output return; } - stats_->startFrame(frame); + stats_->startFrame(frame, &statsBuffer_, 1); if (inputConfig_.patternSize.height == 2) process2(frame, in.planes()[0].data(), out.planes()[0].data()); @@ -885,7 +885,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output * * \todo Pass real bufferId once stats buffer passing is changed. */ - stats_->finishFrame(frame, 0); + stats_->finishFrame(frame, 0, &statsBuffer_, 1); outputBufferReady.emit(output); inputBufferReady.emit(input); } diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h index 7a651746..8abf5168 100644 --- a/src/libcamera/software_isp/debayer_cpu.h +++ b/src/libcamera/software_isp/debayer_cpu.h @@ -135,6 +135,7 @@ private: LookupTable gammaLut_; bool ccmEnabled_; DebayerParams params_; + SwIspStats statsBuffer_; debayerFn debayer0_; debayerFn debayer1_; diff --git a/src/libcamera/software_isp/swstats_cpu.cpp b/src/libcamera/software_isp/swstats_cpu.cpp index 5c3011a7..23842f6c 100644 --- a/src/libcamera/software_isp/swstats_cpu.cpp +++ b/src/libcamera/software_isp/swstats_cpu.cpp @@ -182,14 +182,14 @@ static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */ yVal = r * kRedYMul; \ yVal += g * kGreenYMul; \ yVal += b * kBlueYMul; \ - stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; + stats->yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; #define SWSTATS_FINISH_LINE_STATS() \ - stats_.sum_.r() += sumR; \ - stats_.sum_.g() += sumG; \ - stats_.sum_.b() += sumB; + stats->sum_.r() += sumR; \ + stats->sum_.g() += sumG; \ + stats->sum_.b() += sumB; -void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) +void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats *stats) { const uint8_t *src0 = src[1] + window_.x; const uint8_t *src1 = src[2] + window_.x; @@ -214,7 +214,7 @@ void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) SWSTATS_FINISH_LINE_STATS() } -void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) +void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats *stats) { const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; @@ -240,7 +240,7 @@ void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) SWSTATS_FINISH_LINE_STATS() } -void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) +void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats *stats) { const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; @@ -266,7 +266,7 @@ void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) SWSTATS_FINISH_LINE_STATS() } -void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) +void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats *stats) { const uint8_t *src0 = src[1] + window_.x * 5 / 4; const uint8_t *src1 = src[2] + window_.x * 5 / 4; @@ -292,7 +292,7 @@ void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) SWSTATS_FINISH_LINE_STATS() } -void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[]) +void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats *stats) { const uint8_t *src0 = src[1] + window_.x * 5 / 4; const uint8_t *src1 = src[2] + window_.x * 5 / 4; @@ -321,10 +321,13 @@ void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[]) /** * \brief Reset state to start statistics gathering for a new frame * \param[in] frame The frame number + * \param[in] statsBuffer Array of buffers storing stats + * \param[in] statsBufferCount number of buffers in the statsBuffer array * * This may only be called after a successful setWindow() call. */ -void SwStatsCpu::startFrame(uint32_t frame) +void SwStatsCpu::startFrame(uint32_t frame, + struct SwIspStats statsBuffer[], unsigned int statsBufferCount) { if (frame % kStatPerNumFrames) return; @@ -332,21 +335,39 @@ void SwStatsCpu::startFrame(uint32_t frame) if (window_.width == 0) LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()"; - stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 }); - stats_.yHistogram.fill(0); + for (unsigned int i = 0; i < statsBufferCount; i++) { + statsBuffer[i].sum_ = RGB<uint64_t>({ 0, 0, 0 }); + statsBuffer[i].yHistogram.fill(0); + } } /** * \brief Finish statistics calculation for the current frame * \param[in] frame The frame number * \param[in] bufferId ID of the statistics buffer + * \param[in] statsBuffer Array of buffers storing stats + * \param[in] statsBufferCount number of buffers in the statsBuffer array * * This may only be called after a successful setWindow() call. */ -void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId) +void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId, + struct SwIspStats statsBuffer[], unsigned int statsBufferCount) { - stats_.valid = frame % kStatPerNumFrames == 0; - *sharedStats_ = stats_; + if (frame % kStatPerNumFrames) { + sharedStats_->valid = false; + statsReady.emit(frame, bufferId); + return; + } + + sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 }); + sharedStats_->yHistogram.fill(0); + for (unsigned int i = 0; i < statsBufferCount; i++) { + sharedStats_->sum_ += statsBuffer[i].sum_; + for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++) + sharedStats_->yHistogram[j] += statsBuffer[i].yHistogram[j]; + } + + sharedStats_->valid = true; statsReady.emit(frame, bufferId); } @@ -487,7 +508,7 @@ void SwStatsCpu::setWindow(const Rectangle &window) window_.height &= ~(patternSize_.height - 1); } -void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) +void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in, SwIspStats *stats) { const uint8_t *src = in.planes()[0].data(); const uint8_t *linePointers[3]; @@ -504,7 +525,7 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) /* linePointers[0] is not used by any stats0_ functions */ linePointers[1] = src; linePointers[2] = src + stride_; - (this->*stats0_)(linePointers); + (this->*stats0_)(linePointers, stats); src += stride_ * 2; } } @@ -520,12 +541,14 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) void SwStatsCpu::processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input) { if (frame % kStatPerNumFrames) { - finishFrame(frame, bufferId); + finishFrame(frame, bufferId, NULL, 0); return; } + SwIspStats stats; + bench_.startFrame(); - startFrame(frame); + startFrame(frame, &stats, 1); MappedFrameBuffer in(input, MappedFrameBuffer::MapFlag::Read); if (!in.isValid()) { @@ -533,8 +556,8 @@ void SwStatsCpu::processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *in return; } - (this->*processFrame_)(in); - finishFrame(frame, bufferId); + (this->*processFrame_)(in, &stats); + finishFrame(frame, bufferId, &stats, 1); bench_.finishFrame(); }
Move the storage used to accumulate the RGB sums and the Y histogram out of the SwStatsCpu class and into the callers. The idea is to allow a single SwStatsCpu object to be shared between multiple threads each processing part of the image, with finishFrame() accumulating the per thread data into the final stats for the entire frame. This is a preparation patch for making DebayerCpu support multi-threading and this could also be used to make processFrame() multi-threaded. Benchmarking with the GPU-ISP which does separate swstats benchmarking, on the Uno-Q which has a weak CPU which is good for performance testing, shows 20-21ms to generate stats for a 3272x2464 frame both before and after this change. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- .../internal/software_isp/swstats_cpu.h | 29 ++++----- src/libcamera/software_isp/debayer_cpu.cpp | 12 ++-- src/libcamera/software_isp/debayer_cpu.h | 1 + src/libcamera/software_isp/swstats_cpu.cpp | 65 +++++++++++++------ 4 files changed, 65 insertions(+), 42 deletions(-)