| Message ID | 20260223160930.27913-2-johannes.goede@oss.qualcomm.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series |
|
| Related | show |
Hi Hans, thank you for the update. Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > Make the storage used to accumulate the RGB sums and the Y histogram > value a vector of SwIspStats objects instead of a single object so > that when using multi-threading every thread can use its own storage to > collect intermediate stats to avoid cache-line bouncing. > > Benchmarking with the GPU-ISP which does separate swstats benchmarking, > on the Arduino Uno-Q which has a weak CPU which is good for performance > testing, shows 20ms to generate stats for a 3272x2464 frame both before > and after this change. > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > Changes in v2: > - Move the allocation of the vector of SwIspStats objects to inside > the SwStatsCpu class, controlled by a configure() arguments instead > of making the caller allocate the objects > --- > .../internal/software_isp/swstats_cpu.h | 25 ++++----- > src/libcamera/software_isp/swstats_cpu.cpp | 54 +++++++++++++------ > 2 files changed, 50 insertions(+), 29 deletions(-) > > diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h > index 64b3e23f5..feee92f99 100644 > --- a/include/libcamera/internal/software_isp/swstats_cpu.h > +++ b/include/libcamera/internal/software_isp/swstats_cpu.h > @@ -12,6 +12,7 @@ > #pragma once > > #include <stdint.h> > +#include <vector> > > #include <libcamera/base/signal.h> > > @@ -51,13 +52,13 @@ public: > > const Size &patternSize() { return patternSize_; } > > - int configure(const StreamConfiguration &inputCfg); > + int configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount = 1); > void setWindow(const Rectangle &window); > void startFrame(uint32_t frame); > void finishFrame(uint32_t frame, uint32_t bufferId); > void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input); > > - void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[]) > + void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0) > { > if (frame % kStatPerNumFrames) > return; > @@ -66,10 +67,10 @@ public: > y >= (window_.y + window_.height)) > return; > > - (this->*stats0_)(src); > + (this->*stats0_)(src, stats_[statsBufferIndex]); > } > > - void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[]) > + void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0) > { > if (frame % kStatPerNumFrames) > return; > @@ -78,25 +79,25 @@ public: > y >= (window_.y + window_.height)) > return; > > - (this->*stats2_)(src); > + (this->*stats2_)(src, stats_[statsBufferIndex]); > } > > Signal<uint32_t, uint32_t> statsReady; > > private: > - using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]); > + using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats &stats); > using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in); > > int setupStandardBayerOrder(BayerFormat::Order order); > /* Bayer 8 bpp unpacked */ > - void statsBGGR8Line0(const uint8_t *src[]); > + void statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats); > /* Bayer 10 bpp unpacked */ > - void statsBGGR10Line0(const uint8_t *src[]); > + void statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats); > /* Bayer 12 bpp unpacked */ > - void statsBGGR12Line0(const uint8_t *src[]); > + void statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats); > /* Bayer 10 bpp packed */ > - void statsBGGR10PLine0(const uint8_t *src[]); > - void statsGBRG10PLine0(const uint8_t *src[]); > + void statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats); > + void statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats); > > void processBayerFrame2(MappedFrameBuffer &in); > > @@ -116,8 +117,8 @@ private: > unsigned int xShift_; > unsigned int stride_; > > + std::vector<SwIspStats> stats_; > SharedMemObject<SwIspStats> sharedStats_; > - SwIspStats stats_; > Benchmark bench_; > }; > > diff --git a/src/libcamera/software_isp/swstats_cpu.cpp b/src/libcamera/software_isp/swstats_cpu.cpp > index 1cedcfbc1..7c71aed96 100644 > --- a/src/libcamera/software_isp/swstats_cpu.cpp > +++ b/src/libcamera/software_isp/swstats_cpu.cpp > @@ -74,11 +74,12 @@ namespace libcamera { > */ > > /** > - * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[]) > + * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0) > * \brief Process line 0 > * \param[in] frame The frame number > * \param[in] y The y coordinate. > * \param[in] src The input data. > + * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading. > * > * This function processes line 0 for input formats with > * patternSize height == 1. > @@ -97,14 +98,18 @@ namespace libcamera { > * to the line in plane 0, etc. > * > * For non Bayer single plane input data only a single src pointer is required. > + * > + * The statsBufferIndex value must be less than the statsBufferCount value passed > + * to configure(). > */ > > /** > - * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[]) > + * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0) > * \brief Process line 2 and 3 > * \param[in] frame The frame number > * \param[in] y The y coordinate. > * \param[in] src The input data. > + * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading. > * > * This function processes line 2 and 3 for input formats with > * patternSize height == 4. > @@ -182,14 +187,14 @@ static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */ > yVal = r * kRedYMul; \ > yVal += g * kGreenYMul; \ > yVal += b * kBlueYMul; \ > - stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; > + stats.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; > > #define SWSTATS_FINISH_LINE_STATS() \ > - stats_.sum_.r() += sumR; \ > - stats_.sum_.g() += sumG; \ > - stats_.sum_.b() += sumB; > + stats.sum_.r() += sumR; \ > + stats.sum_.g() += sumG; \ > + stats.sum_.b() += sumB; > > -void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) > +void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats) > { > const uint8_t *src0 = src[1] + window_.x; > const uint8_t *src1 = src[2] + window_.x; > @@ -214,7 +219,7 @@ void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) > SWSTATS_FINISH_LINE_STATS() > } > > -void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) > +void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats) > { > const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; > const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; > @@ -240,7 +245,7 @@ void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) > SWSTATS_FINISH_LINE_STATS() > } > > -void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) > +void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats) > { > const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; > const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; > @@ -266,7 +271,7 @@ void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) > SWSTATS_FINISH_LINE_STATS() > } > > -void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) > +void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats) > { > const uint8_t *src0 = src[1] + window_.x * 5 / 4; > const uint8_t *src1 = src[2] + window_.x * 5 / 4; > @@ -292,7 +297,7 @@ void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) > SWSTATS_FINISH_LINE_STATS() > } > > -void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[]) > +void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats) > { > const uint8_t *src0 = src[1] + window_.x * 5 / 4; > const uint8_t *src1 = src[2] + window_.x * 5 / 4; > @@ -332,8 +337,10 @@ void SwStatsCpu::startFrame(uint32_t frame) > if (window_.width == 0) > LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()"; > > - stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 }); > - stats_.yHistogram.fill(0); > + for (unsigned int i = 0; i < stats_.size(); i++) { How about: for (auto &s : stats_) > + stats_[i].sum_ = RGB<uint64_t>({ 0, 0, 0 }); > + stats_[i].yHistogram.fill(0); > + } > } > > /** > @@ -345,8 +352,19 @@ void SwStatsCpu::startFrame(uint32_t frame) > */ > void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId) > { > - stats_.valid = frame % kStatPerNumFrames == 0; > - *sharedStats_ = stats_; > + bool valid = frame % kStatPerNumFrames == 0; > + > + if (valid) { > + sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 }); > + sharedStats_->yHistogram.fill(0); > + for (unsigned int i = 0; i < stats_.size(); i++) { The same here. Other than these, the change looks good to me. > + sharedStats_->sum_ += stats_[i].sum_; > + for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++) > + sharedStats_->yHistogram[j] += stats_[i].yHistogram[j]; > + } > + } > + > + sharedStats_->valid = valid; > statsReady.emit(frame, bufferId); > } > > @@ -389,12 +407,14 @@ int SwStatsCpu::setupStandardBayerOrder(BayerFormat::Order order) > /** > * \brief Configure the statistics object for the passed in input format > * \param[in] inputCfg The input format > + * \param[in] statsBufferCount number of internal stats buffers to use for multi-threading > * > * \return 0 on success, a negative errno value on failure > */ > -int SwStatsCpu::configure(const StreamConfiguration &inputCfg) > +int SwStatsCpu::configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount) > { > stride_ = inputCfg.stride; > + stats_.resize(statsBufferCount); > > BayerFormat bayerFormat = > BayerFormat::fromPixelFormat(inputCfg.pixelFormat); > @@ -504,7 +524,7 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) > /* linePointers[0] is not used by any stats0_ functions */ > linePointers[1] = src; > linePointers[2] = src + stride_; > - (this->*stats0_)(linePointers); > + (this->*stats0_)(linePointers, stats_[0]); > src += stride_ * 2; > } > }
diff --git a/include/libcamera/internal/software_isp/swstats_cpu.h b/include/libcamera/internal/software_isp/swstats_cpu.h index 64b3e23f5..feee92f99 100644 --- a/include/libcamera/internal/software_isp/swstats_cpu.h +++ b/include/libcamera/internal/software_isp/swstats_cpu.h @@ -12,6 +12,7 @@ #pragma once #include <stdint.h> +#include <vector> #include <libcamera/base/signal.h> @@ -51,13 +52,13 @@ public: const Size &patternSize() { return patternSize_; } - int configure(const StreamConfiguration &inputCfg); + int configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount = 1); void setWindow(const Rectangle &window); void startFrame(uint32_t frame); void finishFrame(uint32_t frame, uint32_t bufferId); void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input); - void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[]) + void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0) { if (frame % kStatPerNumFrames) return; @@ -66,10 +67,10 @@ public: y >= (window_.y + window_.height)) return; - (this->*stats0_)(src); + (this->*stats0_)(src, stats_[statsBufferIndex]); } - void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[]) + void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0) { if (frame % kStatPerNumFrames) return; @@ -78,25 +79,25 @@ public: y >= (window_.y + window_.height)) return; - (this->*stats2_)(src); + (this->*stats2_)(src, stats_[statsBufferIndex]); } Signal<uint32_t, uint32_t> statsReady; private: - using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]); + using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats &stats); using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in); int setupStandardBayerOrder(BayerFormat::Order order); /* Bayer 8 bpp unpacked */ - void statsBGGR8Line0(const uint8_t *src[]); + void statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats); /* Bayer 10 bpp unpacked */ - void statsBGGR10Line0(const uint8_t *src[]); + void statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats); /* Bayer 12 bpp unpacked */ - void statsBGGR12Line0(const uint8_t *src[]); + void statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats); /* Bayer 10 bpp packed */ - void statsBGGR10PLine0(const uint8_t *src[]); - void statsGBRG10PLine0(const uint8_t *src[]); + void statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats); + void statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats); void processBayerFrame2(MappedFrameBuffer &in); @@ -116,8 +117,8 @@ private: unsigned int xShift_; unsigned int stride_; + std::vector<SwIspStats> stats_; SharedMemObject<SwIspStats> sharedStats_; - SwIspStats stats_; Benchmark bench_; }; diff --git a/src/libcamera/software_isp/swstats_cpu.cpp b/src/libcamera/software_isp/swstats_cpu.cpp index 1cedcfbc1..7c71aed96 100644 --- a/src/libcamera/software_isp/swstats_cpu.cpp +++ b/src/libcamera/software_isp/swstats_cpu.cpp @@ -74,11 +74,12 @@ namespace libcamera { */ /** - * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[]) + * \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0) * \brief Process line 0 * \param[in] frame The frame number * \param[in] y The y coordinate. * \param[in] src The input data. + * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading. * * This function processes line 0 for input formats with * patternSize height == 1. @@ -97,14 +98,18 @@ namespace libcamera { * to the line in plane 0, etc. * * For non Bayer single plane input data only a single src pointer is required. + * + * The statsBufferIndex value must be less than the statsBufferCount value passed + * to configure(). */ /** - * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[]) + * \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0) * \brief Process line 2 and 3 * \param[in] frame The frame number * \param[in] y The y coordinate. * \param[in] src The input data. + * \param[in] statsBufferIndex Index of stats buffer to use for multi-threading. * * This function processes line 2 and 3 for input formats with * patternSize height == 4. @@ -182,14 +187,14 @@ static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */ yVal = r * kRedYMul; \ yVal += g * kGreenYMul; \ yVal += b * kBlueYMul; \ - stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; + stats.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++; #define SWSTATS_FINISH_LINE_STATS() \ - stats_.sum_.r() += sumR; \ - stats_.sum_.g() += sumG; \ - stats_.sum_.b() += sumB; + stats.sum_.r() += sumR; \ + stats.sum_.g() += sumG; \ + stats.sum_.b() += sumB; -void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) +void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats) { const uint8_t *src0 = src[1] + window_.x; const uint8_t *src1 = src[2] + window_.x; @@ -214,7 +219,7 @@ void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[]) SWSTATS_FINISH_LINE_STATS() } -void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) +void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats) { const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; @@ -240,7 +245,7 @@ void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[]) SWSTATS_FINISH_LINE_STATS() } -void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) +void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats) { const uint16_t *src0 = (const uint16_t *)src[1] + window_.x; const uint16_t *src1 = (const uint16_t *)src[2] + window_.x; @@ -266,7 +271,7 @@ void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[]) SWSTATS_FINISH_LINE_STATS() } -void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) +void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats) { const uint8_t *src0 = src[1] + window_.x * 5 / 4; const uint8_t *src1 = src[2] + window_.x * 5 / 4; @@ -292,7 +297,7 @@ void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[]) SWSTATS_FINISH_LINE_STATS() } -void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[]) +void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats) { const uint8_t *src0 = src[1] + window_.x * 5 / 4; const uint8_t *src1 = src[2] + window_.x * 5 / 4; @@ -332,8 +337,10 @@ void SwStatsCpu::startFrame(uint32_t frame) if (window_.width == 0) LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()"; - stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 }); - stats_.yHistogram.fill(0); + for (unsigned int i = 0; i < stats_.size(); i++) { + stats_[i].sum_ = RGB<uint64_t>({ 0, 0, 0 }); + stats_[i].yHistogram.fill(0); + } } /** @@ -345,8 +352,19 @@ void SwStatsCpu::startFrame(uint32_t frame) */ void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId) { - stats_.valid = frame % kStatPerNumFrames == 0; - *sharedStats_ = stats_; + bool valid = frame % kStatPerNumFrames == 0; + + if (valid) { + sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 }); + sharedStats_->yHistogram.fill(0); + for (unsigned int i = 0; i < stats_.size(); i++) { + sharedStats_->sum_ += stats_[i].sum_; + for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++) + sharedStats_->yHistogram[j] += stats_[i].yHistogram[j]; + } + } + + sharedStats_->valid = valid; statsReady.emit(frame, bufferId); } @@ -389,12 +407,14 @@ int SwStatsCpu::setupStandardBayerOrder(BayerFormat::Order order) /** * \brief Configure the statistics object for the passed in input format * \param[in] inputCfg The input format + * \param[in] statsBufferCount number of internal stats buffers to use for multi-threading * * \return 0 on success, a negative errno value on failure */ -int SwStatsCpu::configure(const StreamConfiguration &inputCfg) +int SwStatsCpu::configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount) { stride_ = inputCfg.stride; + stats_.resize(statsBufferCount); BayerFormat bayerFormat = BayerFormat::fromPixelFormat(inputCfg.pixelFormat); @@ -504,7 +524,7 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in) /* linePointers[0] is not used by any stats0_ functions */ linePointers[1] = src; linePointers[2] = src + stride_; - (this->*stats0_)(linePointers); + (this->*stats0_)(linePointers, stats_[0]); src += stride_ * 2; } }
Make the storage used to accumulate the RGB sums and the Y histogram value a vector of SwIspStats objects instead of a single object so that when using multi-threading every thread can use its own storage to collect intermediate stats to avoid cache-line bouncing. Benchmarking with the GPU-ISP which does separate swstats benchmarking, on the Arduino Uno-Q which has a weak CPU which is good for performance testing, shows 20ms to generate stats for a 3272x2464 frame both before and after this change. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- Changes in v2: - Move the allocation of the vector of SwIspStats objects to inside the SwStatsCpu class, controlled by a configure() arguments instead of making the caller allocate the objects --- .../internal/software_isp/swstats_cpu.h | 25 ++++----- src/libcamera/software_isp/swstats_cpu.cpp | 54 +++++++++++++------ 2 files changed, 50 insertions(+), 29 deletions(-)