| Message ID | 20260223160930.27913-4-johannes.goede@oss.qualcomm.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series |
|
| Related | show |
Hi 2026. 02. 23. 17:09 keltezéssel, Hans de Goede írta: > Add CPU soft ISP multi-threading support. > > Benchmark results for the Arduino Uno-Q with a weak CPU which is good for > performance testing, all numbers with an IMX219 running at > 3280x2464 -> 3272x2464: > > 1 thread : 147ms / frame, ~6.5 fps > 2 threads: 80ms / frame, ~12.5 fps > 3 threads: 65ms / frame, ~15 fps > > Adding a 4th thread does not improve performance. > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > Changes in v2: > - Adjust to use the new DebayerCpuThread class introduced in the v2 patch-series > - Re-use threads instead of starting new threads every frame > --- > src/libcamera/software_isp/debayer_cpu.cpp | 53 ++++++++++++++++++++-- > src/libcamera/software_isp/debayer_cpu.h | 6 +++ > 2 files changed, 55 insertions(+), 4 deletions(-) > > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > index 122bfbb05..ea1b17c1f 100644 > --- a/src/libcamera/software_isp/debayer_cpu.cpp > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > @@ -18,6 +18,8 @@ > > #include <linux/dma-buf.h> > > +#include <libcamera/base/thread.h> > + > #include <libcamera/formats.h> > > #include "libcamera/internal/bayer_format.h" > @@ -50,13 +52,15 @@ public: > unsigned int lineBufferIndex_; > std::vector<uint8_t> lineBuffers_[DebayerCpu::kMaxLineBuffers]; > bool enableInputMemcpy_; > + Thread worker_; Alternatively you can inherit it: `class DebayerCpuThread : public Thread, public Object { ...` (Given that type has "Thread" in its name.) (See e.g. the type `VirtualCameraData`.) > }; > > DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, > bool enableInputMemcpy) > : debayer_(debayer), threadIndex_(threadIndex), > - enableInputMemcpy_(enableInputMemcpy) > + enableInputMemcpy_(enableInputMemcpy), worker_("DebayerWorker") Could you add the index to the name, e.g. `"DebayerCpu:" + std::to_string(threadIndex)` or similar? > { > + this->moveToThread(&worker_); > } > > /** > @@ -88,8 +92,10 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat > bool enableInputMemcpy = > configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); > > - /* Just one thread object for now, which will be called inline rather than async */ > - threads_.resize(1); > + unsigned int threadCount = > + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(2); > + threadCount = std::clamp(threadCount, 1u, 8u); > + threads_.resize(threadCount); > > for (unsigned int i = 0; i < threads_.size(); i++) > threads_[i] = new DebayerCpuThread(this, i, enableInputMemcpy); > @@ -714,6 +720,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) > process2(frame, src, dst); > else > process4(frame, src, dst); > + > + debayer_->workPendingMutex_.lock(); > + debayer_->workPending_ &= ~(1 << threadIndex_); > + debayer_->workPendingMutex_.unlock(); > + debayer_->workPendingCv_.notify_one(); > } > > void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > @@ -953,7 +964,24 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > > stats_->startFrame(frame); > > - threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); > + workPendingMutex_.lock(); Is the above locking needed? > + workPending_ = (1 << threads_.size()) - 1; Why not just have it be `thread_.size()`? And then subtract one in the worker when done? > + workPendingMutex_.unlock(); > + > + for (unsigned int i = 0; i < threads_.size(); i++) > + threads_[i]->invokeMethod(&DebayerCpuThread::process, > + ConnectionTypeQueued, frame, > + in.planes()[0].data(), out.planes()[0].data()); > + > + { > + MutexLocker locker(workPendingMutex_); > + > + auto workPending = ([&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) { ^ I think you can drop the extra `()`. > + return workPending_ == 0; > + }); > + > + workPendingCv_.wait(locker, workPending); Actually, I would probably inline the lambda here. > + } > > metadata.planes()[0].bytesused = out.planes()[0].size(); > > @@ -972,6 +1000,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > inputBufferReady.emit(input); > } > > +int DebayerCpu::start() > +{ > + for (unsigned int i = 0; i < threads_.size(); i++) > + threads_[i]->worker_.start(); > + > + return 0; > +} > + > +void DebayerCpu::stop() > +{ > + for (unsigned int i = 0; i < threads_.size(); i++) > + threads_[i]->worker_.exit(); > + > + for (unsigned int i = 0; i < threads_.size(); i++) Please for (auto &thr : threads_) here and above. > + threads_[i]->worker_.wait(); > +} > + > SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize) > { > Size patternSize = this->patternSize(inputFormat); > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > index 7196dcdd0..2c84f8e40 100644 > --- a/src/libcamera/software_isp/debayer_cpu.h > +++ b/src/libcamera/software_isp/debayer_cpu.h > @@ -16,6 +16,7 @@ > #include <vector> > > #include <libcamera/base/object.h> > +#include <libcamera/base/mutex.h> > > #include "libcamera/internal/bayer_format.h" > #include "libcamera/internal/global_configuration.h" > @@ -41,6 +42,8 @@ public: > std::tuple<unsigned int, unsigned int> > strideAndFrameSize(const PixelFormat &outputFormat, const Size &size); > void process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms); > + int start(); > + void stop(); > SizeRange sizes(PixelFormat inputFormat, const Size &inputSize); > const SharedFD &getStatsFD() { return stats_->getStatsFD(); } > > @@ -147,6 +150,9 @@ private: > std::unique_ptr<SwStatsCpu> stats_; > unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ > > + unsigned int workPending_ LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_); > + Mutex workPendingMutex_; > + ConditionVariable workPendingCv_; > std::vector<DebayerCpuThread *>threads_; > }; >
Hi, On 23-Feb-26 17:33, Barnabás Pőcze wrote: > Hi > > 2026. 02. 23. 17:09 keltezéssel, Hans de Goede írta: >> Add CPU soft ISP multi-threading support. >> >> Benchmark results for the Arduino Uno-Q with a weak CPU which is good for >> performance testing, all numbers with an IMX219 running at >> 3280x2464 -> 3272x2464: >> >> 1 thread : 147ms / frame, ~6.5 fps >> 2 threads: 80ms / frame, ~12.5 fps >> 3 threads: 65ms / frame, ~15 fps >> >> Adding a 4th thread does not improve performance. >> >> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> >> --- >> Changes in v2: >> - Adjust to use the new DebayerCpuThread class introduced in the v2 patch-series >> - Re-use threads instead of starting new threads every frame >> --- >> src/libcamera/software_isp/debayer_cpu.cpp | 53 ++++++++++++++++++++-- >> src/libcamera/software_isp/debayer_cpu.h | 6 +++ >> 2 files changed, 55 insertions(+), 4 deletions(-) >> >> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp >> index 122bfbb05..ea1b17c1f 100644 >> --- a/src/libcamera/software_isp/debayer_cpu.cpp >> +++ b/src/libcamera/software_isp/debayer_cpu.cpp >> @@ -18,6 +18,8 @@ >> #include <linux/dma-buf.h> >> +#include <libcamera/base/thread.h> >> + >> #include <libcamera/formats.h> >> #include "libcamera/internal/bayer_format.h" >> @@ -50,13 +52,15 @@ public: >> unsigned int lineBufferIndex_; >> std::vector<uint8_t> lineBuffers_[DebayerCpu::kMaxLineBuffers]; >> bool enableInputMemcpy_; >> + Thread worker_; > > Alternatively you can inherit it: `class DebayerCpuThread : public Thread, public Object { ...` > (Given that type has "Thread" in its name.) (See e.g. the type `VirtualCameraData`.) Good idea, will do for v3. > > >> }; >> DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, >> bool enableInputMemcpy) >> : debayer_(debayer), threadIndex_(threadIndex), >> - enableInputMemcpy_(enableInputMemcpy) >> + enableInputMemcpy_(enableInputMemcpy), worker_("DebayerWorker") > > Could you add the index to the name, e.g. `"DebayerCpu:" + std::to_string(threadIndex)` or similar? Ack, will do for v3. >> { >> + this->moveToThread(&worker_); >> } >> /** >> @@ -88,8 +92,10 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat >> bool enableInputMemcpy = >> configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); >> - /* Just one thread object for now, which will be called inline rather than async */ >> - threads_.resize(1); >> + unsigned int threadCount = >> + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(2); >> + threadCount = std::clamp(threadCount, 1u, 8u); >> + threads_.resize(threadCount); >> for (unsigned int i = 0; i < threads_.size(); i++) >> threads_[i] = new DebayerCpuThread(this, i, enableInputMemcpy); >> @@ -714,6 +720,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) >> process2(frame, src, dst); >> else >> process4(frame, src, dst); >> + >> + debayer_->workPendingMutex_.lock(); >> + debayer_->workPending_ &= ~(1 << threadIndex_); >> + debayer_->workPendingMutex_.unlock(); >> + debayer_->workPendingCv_.notify_one(); >> } >> void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) >> @@ -953,7 +964,24 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output >> stats_->startFrame(frame); >> - threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); >> + workPendingMutex_.lock(); > > Is the above locking needed? It is the correct thing to do and necessary to not get warnings because of workPending_ being marked as LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_) > > >> + workPending_ = (1 << threads_.size()) - 1; > > Why not just have it be `thread_.size()`? And then subtract one in the worker when done? It feels more correct to me to have one pending bit per part of the image being processed and clear those. >> + workPendingMutex_.unlock(); >> + >> + for (unsigned int i = 0; i < threads_.size(); i++) >> + threads_[i]->invokeMethod(&DebayerCpuThread::process, >> + ConnectionTypeQueued, frame, >> + in.planes()[0].data(), out.planes()[0].data()); >> + >> + { >> + MutexLocker locker(workPendingMutex_); >> + >> + auto workPending = ([&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) { > ^ > > I think you can drop the extra `()`. > > >> + return workPending_ == 0; >> + }); >> + >> + workPendingCv_.wait(locker, workPending); > > Actually, I would probably inline the lambda here. Ack. > > >> + } >> metadata.planes()[0].bytesused = out.planes()[0].size(); >> @@ -972,6 +1000,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output >> inputBufferReady.emit(input); >> } >> +int DebayerCpu::start() >> +{ >> + for (unsigned int i = 0; i < threads_.size(); i++) >> + threads_[i]->worker_.start(); >> + >> + return 0; >> +} >> + >> +void DebayerCpu::stop() >> +{ >> + for (unsigned int i = 0; i < threads_.size(); i++) >> + threads_[i]->worker_.exit(); >> + >> + for (unsigned int i = 0; i < threads_.size(); i++) > > Please > > for (auto &thr : threads_) > > here and above. Ack. Regards, Hans
2026. 02. 24. 13:44 keltezéssel, Hans de Goede írta: > Hi, > > On 23-Feb-26 17:33, Barnabás Pőcze wrote: >> Hi >> >> 2026. 02. 23. 17:09 keltezéssel, Hans de Goede írta: >>> Add CPU soft ISP multi-threading support. >>> >>> Benchmark results for the Arduino Uno-Q with a weak CPU which is good for >>> performance testing, all numbers with an IMX219 running at >>> 3280x2464 -> 3272x2464: >>> >>> 1 thread : 147ms / frame, ~6.5 fps >>> 2 threads: 80ms / frame, ~12.5 fps >>> 3 threads: 65ms / frame, ~15 fps >>> >>> Adding a 4th thread does not improve performance. >>> >>> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> >>> --- >>> Changes in v2: >>> - Adjust to use the new DebayerCpuThread class introduced in the v2 patch-series >>> - Re-use threads instead of starting new threads every frame >>> --- >>> src/libcamera/software_isp/debayer_cpu.cpp | 53 ++++++++++++++++++++-- >>> src/libcamera/software_isp/debayer_cpu.h | 6 +++ >>> 2 files changed, 55 insertions(+), 4 deletions(-) >>> >>> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp >>> index 122bfbb05..ea1b17c1f 100644 >>> --- a/src/libcamera/software_isp/debayer_cpu.cpp >>> +++ b/src/libcamera/software_isp/debayer_cpu.cpp > [...] >>> @@ -714,6 +720,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) >>> process2(frame, src, dst); >>> else >>> process4(frame, src, dst); >>> + >>> + debayer_->workPendingMutex_.lock(); >>> + debayer_->workPending_ &= ~(1 << threadIndex_); >>> + debayer_->workPendingMutex_.unlock(); >>> + debayer_->workPendingCv_.notify_one(); >>> } >>> void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) >>> @@ -953,7 +964,24 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output >>> stats_->startFrame(frame); >>> - threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); >>> + workPendingMutex_.lock(); >> >> Is the above locking needed? > > It is the correct thing to do and necessary to not get warnings because of > workPending_ being marked as LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_) > The way I see it, no thread may be executing `DebayerCpuThread::process` when this runs, so it should not be necessary. But I have indeed missed the tsa annotation. > >> >> >>> + workPending_ = (1 << threads_.size()) - 1; >> >> Why not just have it be `thread_.size()`? And then subtract one in the worker when done? > > It feels more correct to me to have one pending bit per part of > the image being processed and clear those. Interesting, to me a simple counter looks best; but I suppose either works. > [...]
diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp index 122bfbb05..ea1b17c1f 100644 --- a/src/libcamera/software_isp/debayer_cpu.cpp +++ b/src/libcamera/software_isp/debayer_cpu.cpp @@ -18,6 +18,8 @@ #include <linux/dma-buf.h> +#include <libcamera/base/thread.h> + #include <libcamera/formats.h> #include "libcamera/internal/bayer_format.h" @@ -50,13 +52,15 @@ public: unsigned int lineBufferIndex_; std::vector<uint8_t> lineBuffers_[DebayerCpu::kMaxLineBuffers]; bool enableInputMemcpy_; + Thread worker_; }; DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex, bool enableInputMemcpy) : debayer_(debayer), threadIndex_(threadIndex), - enableInputMemcpy_(enableInputMemcpy) + enableInputMemcpy_(enableInputMemcpy), worker_("DebayerWorker") { + this->moveToThread(&worker_); } /** @@ -88,8 +92,10 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat bool enableInputMemcpy = configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); - /* Just one thread object for now, which will be called inline rather than async */ - threads_.resize(1); + unsigned int threadCount = + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(2); + threadCount = std::clamp(threadCount, 1u, 8u); + threads_.resize(threadCount); for (unsigned int i = 0; i < threads_.size(); i++) threads_[i] = new DebayerCpuThread(this, i, enableInputMemcpy); @@ -714,6 +720,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) process2(frame, src, dst); else process4(frame, src, dst); + + debayer_->workPendingMutex_.lock(); + debayer_->workPending_ &= ~(1 << threadIndex_); + debayer_->workPendingMutex_.unlock(); + debayer_->workPendingCv_.notify_one(); } void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) @@ -953,7 +964,24 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output stats_->startFrame(frame); - threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); + workPendingMutex_.lock(); + workPending_ = (1 << threads_.size()) - 1; + workPendingMutex_.unlock(); + + for (unsigned int i = 0; i < threads_.size(); i++) + threads_[i]->invokeMethod(&DebayerCpuThread::process, + ConnectionTypeQueued, frame, + in.planes()[0].data(), out.planes()[0].data()); + + { + MutexLocker locker(workPendingMutex_); + + auto workPending = ([&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) { + return workPending_ == 0; + }); + + workPendingCv_.wait(locker, workPending); + } metadata.planes()[0].bytesused = out.planes()[0].size(); @@ -972,6 +1000,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output inputBufferReady.emit(input); } +int DebayerCpu::start() +{ + for (unsigned int i = 0; i < threads_.size(); i++) + threads_[i]->worker_.start(); + + return 0; +} + +void DebayerCpu::stop() +{ + for (unsigned int i = 0; i < threads_.size(); i++) + threads_[i]->worker_.exit(); + + for (unsigned int i = 0; i < threads_.size(); i++) + threads_[i]->worker_.wait(); +} + SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize) { Size patternSize = this->patternSize(inputFormat); diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h index 7196dcdd0..2c84f8e40 100644 --- a/src/libcamera/software_isp/debayer_cpu.h +++ b/src/libcamera/software_isp/debayer_cpu.h @@ -16,6 +16,7 @@ #include <vector> #include <libcamera/base/object.h> +#include <libcamera/base/mutex.h> #include "libcamera/internal/bayer_format.h" #include "libcamera/internal/global_configuration.h" @@ -41,6 +42,8 @@ public: std::tuple<unsigned int, unsigned int> strideAndFrameSize(const PixelFormat &outputFormat, const Size &size); void process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms); + int start(); + void stop(); SizeRange sizes(PixelFormat inputFormat, const Size &inputSize); const SharedFD &getStatsFD() { return stats_->getStatsFD(); } @@ -147,6 +150,9 @@ private: std::unique_ptr<SwStatsCpu> stats_; unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ + unsigned int workPending_ LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_); + Mutex workPendingMutex_; + ConditionVariable workPendingCv_; std::vector<DebayerCpuThread *>threads_; };
Add CPU soft ISP multi-threading support. Benchmark results for the Arduino Uno-Q with a weak CPU which is good for performance testing, all numbers with an IMX219 running at 3280x2464 -> 3272x2464: 1 thread : 147ms / frame, ~6.5 fps 2 threads: 80ms / frame, ~12.5 fps 3 threads: 65ms / frame, ~15 fps Adding a 4th thread does not improve performance. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- Changes in v2: - Adjust to use the new DebayerCpuThread class introduced in the v2 patch-series - Re-use threads instead of starting new threads every frame --- src/libcamera/software_isp/debayer_cpu.cpp | 53 ++++++++++++++++++++-- src/libcamera/software_isp/debayer_cpu.h | 6 +++ 2 files changed, 55 insertions(+), 4 deletions(-)