| Message ID | 20260303111741.17417-4-johannes.goede@oss.qualcomm.com |
|---|---|
| State | New |
| Headers | show |
| Series |
|
| Related | show |
Hi Hans, thank you for the update. Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > Add CPU soft ISP multi-threading support. > > Benchmark results for the Arduino Uno-Q with a weak CPU which is good for > performance testing, all numbers with an IMX219 running at > 3280x2464 -> 3272x2464: > > 1 thread : 147ms / frame, ~6.5 fps > 2 threads: 80ms / frame, ~12.5 fps > 3 threads: 65ms / frame, ~15 fps > > Adding a 4th thread does not improve performance. > > Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> > --- > Changes in v4: > - Document "software_isp", "threads" option in runtime_configuration.rst > - Add an use constants for min/max/default number of threads > > Changes in v3: > - Adjust for DebayerCpuThread now inheriting from Thread > - Use for (auto &thread : threads_) > > Changes in v2: > - Adjust to use the new DebayerCpuThread class introduced in the v2 patch-series > - Re-use threads instead of starting new threads every frame > --- > Documentation/runtime_configuration.rst | 1 + > src/libcamera/software_isp/debayer_cpu.cpp | 45 ++++++++++++++++++++-- > src/libcamera/software_isp/debayer_cpu.h | 10 +++++ > 3 files changed, 53 insertions(+), 3 deletions(-) > > diff --git a/Documentation/runtime_configuration.rst b/Documentation/runtime_configuration.rst > index e99ef2fb9..07d1a9123 100644 > --- a/Documentation/runtime_configuration.rst > +++ b/Documentation/runtime_configuration.rst > @@ -51,6 +51,7 @@ file structure: > measure: > skip: # non-negative integer, frames to skip initially > number: # non-negative integer, frames to measure > + threads: # integer >= 1, number of render threads to use, default 2 In addition to being added to this overview, the option should be documented in "List of variables and configuration options" later in the file. The limits on the value range can be described there. > Configuration file example > -------------------------- > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp > index d57d640df..f7c3e1751 100644 > --- a/src/libcamera/software_isp/debayer_cpu.cpp > +++ b/src/libcamera/software_isp/debayer_cpu.cpp > @@ -76,6 +76,7 @@ DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex > debayer_(debayer), threadIndex_(threadIndex), > enableInputMemcpy_(enableInputMemcpy) > { > + moveToThread(this); > } > > /** > @@ -107,8 +108,10 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat > bool enableInputMemcpy = > configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); > > - /* Just one thread object for now, which will be called inline rather than async */ > - threads_.resize(1); > + unsigned int threadCount = > + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(kDefaultThreads); > + threadCount = std::clamp(threadCount, kMinThreads, kMaxThreads); Should there be an INFO log message in case the specified number of threads gets changed due to the clamping? > + threads_.resize(threadCount); > > for (unsigned int i = 0; i < threads_.size(); i++) > threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy); > @@ -746,6 +749,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) > process2(frame, src, dst); > else > process4(frame, src, dst); > + > + debayer_->workPendingMutex_.lock(); > + debayer_->workPending_ &= ~(1 << threadIndex_); > + debayer_->workPendingMutex_.unlock(); > + debayer_->workPendingCv_.notify_one(); > } > > void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) > @@ -985,7 +993,21 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > > stats_->startFrame(frame); > > - threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); > + workPendingMutex_.lock(); > + workPending_ = (1 << threads_.size()) - 1; > + workPendingMutex_.unlock(); > + > + for (auto &thread : threads_) > + thread->invokeMethod(&DebayerCpuThread::process, > + ConnectionTypeQueued, frame, > + in.planes()[0].data(), out.planes()[0].data()); > + > + { > + MutexLocker locker(workPendingMutex_); > + workPendingCv_.wait(locker, [&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) { > + return workPending_ == 0; > + }); > + } > > metadata.planes()[0].bytesused = out.planes()[0].size(); > > @@ -1004,6 +1026,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output > inputBufferReady.emit(input); > } > > +int DebayerCpu::start() > +{ > + for (auto &thread : threads_) > + thread->start(); > + > + return 0; > +} > + > +void DebayerCpu::stop() > +{ > + for (auto &thread : threads_) > + thread->exit(); > + > + for (auto &thread : threads_) > + thread->wait(); > +} > + > SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize) > { > Size patternSize = this->patternSize(inputFormat); > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h > index 780576090..11280c0a1 100644 > --- a/src/libcamera/software_isp/debayer_cpu.h > +++ b/src/libcamera/software_isp/debayer_cpu.h > @@ -16,6 +16,7 @@ > #include <vector> > > #include <libcamera/base/object.h> > +#include <libcamera/base/mutex.h> > > #include "libcamera/internal/bayer_format.h" > #include "libcamera/internal/global_configuration.h" > @@ -41,6 +42,8 @@ public: > std::tuple<unsigned int, unsigned int> > strideAndFrameSize(const PixelFormat &outputFormat, const Size &size); > void process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms); > + int start(); > + void stop(); > SizeRange sizes(PixelFormat inputFormat, const Size &inputSize); > const SharedFD &getStatsFD() { return stats_->getStatsFD(); } > > @@ -144,6 +147,13 @@ private: > std::unique_ptr<SwStatsCpu> stats_; > unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ > > + static constexpr unsigned int kMinThreads = 1; > + static constexpr unsigned int kMaxThreads = 8; > + static constexpr unsigned int kDefaultThreads = 2; > + > + unsigned int workPending_ LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_); > + Mutex workPendingMutex_; > + ConditionVariable workPendingCv_; > std::vector<std::unique_ptr<DebayerCpuThread>>threads_; > };
Hi, On 3-Mar-26 14:26, Milan Zamazal wrote: > Hi Hans, > > thank you for the update. > > Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > >> Add CPU soft ISP multi-threading support. >> >> Benchmark results for the Arduino Uno-Q with a weak CPU which is good for >> performance testing, all numbers with an IMX219 running at >> 3280x2464 -> 3272x2464: >> >> 1 thread : 147ms / frame, ~6.5 fps >> 2 threads: 80ms / frame, ~12.5 fps >> 3 threads: 65ms / frame, ~15 fps >> >> Adding a 4th thread does not improve performance. >> >> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> >> --- >> Changes in v4: >> - Document "software_isp", "threads" option in runtime_configuration.rst >> - Add an use constants for min/max/default number of threads >> >> Changes in v3: >> - Adjust for DebayerCpuThread now inheriting from Thread >> - Use for (auto &thread : threads_) >> >> Changes in v2: >> - Adjust to use the new DebayerCpuThread class introduced in the v2 patch-series >> - Re-use threads instead of starting new threads every frame >> --- >> Documentation/runtime_configuration.rst | 1 + >> src/libcamera/software_isp/debayer_cpu.cpp | 45 ++++++++++++++++++++-- >> src/libcamera/software_isp/debayer_cpu.h | 10 +++++ >> 3 files changed, 53 insertions(+), 3 deletions(-) >> >> diff --git a/Documentation/runtime_configuration.rst b/Documentation/runtime_configuration.rst >> index e99ef2fb9..07d1a9123 100644 >> --- a/Documentation/runtime_configuration.rst >> +++ b/Documentation/runtime_configuration.rst >> @@ -51,6 +51,7 @@ file structure: >> measure: >> skip: # non-negative integer, frames to skip initially >> number: # non-negative integer, frames to measure >> + threads: # integer >= 1, number of render threads to use, default 2 > > In addition to being added to this overview, the option should be > documented in "List of variables and configuration options" later in the > file. The limits on the value range can be described there. Ack, will fix for v5. >> Configuration file example >> -------------------------- >> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp >> index d57d640df..f7c3e1751 100644 >> --- a/src/libcamera/software_isp/debayer_cpu.cpp >> +++ b/src/libcamera/software_isp/debayer_cpu.cpp >> @@ -76,6 +76,7 @@ DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex >> debayer_(debayer), threadIndex_(threadIndex), >> enableInputMemcpy_(enableInputMemcpy) >> { >> + moveToThread(this); >> } >> >> /** >> @@ -107,8 +108,10 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat >> bool enableInputMemcpy = >> configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); >> >> - /* Just one thread object for now, which will be called inline rather than async */ >> - threads_.resize(1); >> + unsigned int threadCount = >> + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(kDefaultThreads); >> + threadCount = std::clamp(threadCount, kMinThreads, kMaxThreads); > > Should there be an INFO log message in case the specified number of > threads gets changed due to the clamping? I did consider that when writing this, but I consider it unnecessary verbose, unnecessary complication of the code to add this. Still I can add it if you wish. If you let me know if you want an Info message or ok with omitting it then I'll prepare a v5 addressing your other comment. Regards, Hans > >> + threads_.resize(threadCount); >> >> for (unsigned int i = 0; i < threads_.size(); i++) >> threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy); >> @@ -746,6 +749,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) >> process2(frame, src, dst); >> else >> process4(frame, src, dst); >> + >> + debayer_->workPendingMutex_.lock(); >> + debayer_->workPending_ &= ~(1 << threadIndex_); >> + debayer_->workPendingMutex_.unlock(); >> + debayer_->workPendingCv_.notify_one(); >> } >> >> void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) >> @@ -985,7 +993,21 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output >> >> stats_->startFrame(frame); >> >> - threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); >> + workPendingMutex_.lock(); >> + workPending_ = (1 << threads_.size()) - 1; >> + workPendingMutex_.unlock(); >> + >> + for (auto &thread : threads_) >> + thread->invokeMethod(&DebayerCpuThread::process, >> + ConnectionTypeQueued, frame, >> + in.planes()[0].data(), out.planes()[0].data()); >> + >> + { >> + MutexLocker locker(workPendingMutex_); >> + workPendingCv_.wait(locker, [&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) { >> + return workPending_ == 0; >> + }); >> + } >> >> metadata.planes()[0].bytesused = out.planes()[0].size(); >> >> @@ -1004,6 +1026,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output >> inputBufferReady.emit(input); >> } >> >> +int DebayerCpu::start() >> +{ >> + for (auto &thread : threads_) >> + thread->start(); >> + >> + return 0; >> +} >> + >> +void DebayerCpu::stop() >> +{ >> + for (auto &thread : threads_) >> + thread->exit(); >> + >> + for (auto &thread : threads_) >> + thread->wait(); >> +} >> + >> SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize) >> { >> Size patternSize = this->patternSize(inputFormat); >> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h >> index 780576090..11280c0a1 100644 >> --- a/src/libcamera/software_isp/debayer_cpu.h >> +++ b/src/libcamera/software_isp/debayer_cpu.h >> @@ -16,6 +16,7 @@ >> #include <vector> >> >> #include <libcamera/base/object.h> >> +#include <libcamera/base/mutex.h> >> >> #include "libcamera/internal/bayer_format.h" >> #include "libcamera/internal/global_configuration.h" >> @@ -41,6 +42,8 @@ public: >> std::tuple<unsigned int, unsigned int> >> strideAndFrameSize(const PixelFormat &outputFormat, const Size &size); >> void process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms); >> + int start(); >> + void stop(); >> SizeRange sizes(PixelFormat inputFormat, const Size &inputSize); >> const SharedFD &getStatsFD() { return stats_->getStatsFD(); } >> >> @@ -144,6 +147,13 @@ private: >> std::unique_ptr<SwStatsCpu> stats_; >> unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ >> >> + static constexpr unsigned int kMinThreads = 1; >> + static constexpr unsigned int kMaxThreads = 8; >> + static constexpr unsigned int kDefaultThreads = 2; >> + >> + unsigned int workPending_ LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_); >> + Mutex workPendingMutex_; >> + ConditionVariable workPendingCv_; >> std::vector<std::unique_ptr<DebayerCpuThread>>threads_; >> }; >
Hans de Goede <johannes.goede@oss.qualcomm.com> writes: > Hi, > > On 3-Mar-26 14:26, Milan Zamazal wrote: >> Hi Hans, >> >> thank you for the update. >> >> Hans de Goede <johannes.goede@oss.qualcomm.com> writes: >> >>> Add CPU soft ISP multi-threading support. >>> >>> Benchmark results for the Arduino Uno-Q with a weak CPU which is good for >>> performance testing, all numbers with an IMX219 running at >>> 3280x2464 -> 3272x2464: >>> >>> 1 thread : 147ms / frame, ~6.5 fps >>> 2 threads: 80ms / frame, ~12.5 fps >>> 3 threads: 65ms / frame, ~15 fps >>> >>> Adding a 4th thread does not improve performance. >>> >>> Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> >>> --- >>> Changes in v4: >>> - Document "software_isp", "threads" option in runtime_configuration.rst >>> - Add an use constants for min/max/default number of threads >>> >>> Changes in v3: >>> - Adjust for DebayerCpuThread now inheriting from Thread >>> - Use for (auto &thread : threads_) >>> >>> Changes in v2: >>> - Adjust to use the new DebayerCpuThread class introduced in the v2 patch-series >>> - Re-use threads instead of starting new threads every frame >>> --- >>> Documentation/runtime_configuration.rst | 1 + >>> src/libcamera/software_isp/debayer_cpu.cpp | 45 ++++++++++++++++++++-- >>> src/libcamera/software_isp/debayer_cpu.h | 10 +++++ >>> 3 files changed, 53 insertions(+), 3 deletions(-) >>> >>> diff --git a/Documentation/runtime_configuration.rst b/Documentation/runtime_configuration.rst >>> index e99ef2fb9..07d1a9123 100644 >>> --- a/Documentation/runtime_configuration.rst >>> +++ b/Documentation/runtime_configuration.rst >>> @@ -51,6 +51,7 @@ file structure: >>> measure: >>> skip: # non-negative integer, frames to skip initially >>> number: # non-negative integer, frames to measure >>> + threads: # integer >= 1, number of render threads to use, default 2 >> >> In addition to being added to this overview, the option should be >> documented in "List of variables and configuration options" later in the >> file. The limits on the value range can be described there. > > Ack, will fix for v5. > >>> Configuration file example >>> -------------------------- >>> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp >>> index d57d640df..f7c3e1751 100644 >>> --- a/src/libcamera/software_isp/debayer_cpu.cpp >>> +++ b/src/libcamera/software_isp/debayer_cpu.cpp >>> @@ -76,6 +76,7 @@ DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex >>> debayer_(debayer), threadIndex_(threadIndex), >>> enableInputMemcpy_(enableInputMemcpy) >>> { >>> + moveToThread(this); >>> } >>> >>> /** >>> @@ -107,8 +108,10 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat >>> bool enableInputMemcpy = >>> configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); >>> >>> - /* Just one thread object for now, which will be called inline rather than async */ >>> - threads_.resize(1); >>> + unsigned int threadCount = >>> + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(kDefaultThreads); >>> + threadCount = std::clamp(threadCount, kMinThreads, kMaxThreads); >> >> Should there be an INFO log message in case the specified number of >> threads gets changed due to the clamping? > > I did consider that when writing this, but I consider it unnecessary verbose, > unnecessary complication of the code to add this. > > Still I can add it if you wish. If you let me know if you want an Info message > or ok with omitting it then I'll prepare a v5 addressing your other comment. I'm OK with omitting the log message if the clamping is clearly documented in the option description. > Regards, > > Hans > > > > >> >>> + threads_.resize(threadCount); >>> >>> for (unsigned int i = 0; i < threads_.size(); i++) >>> threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy); >>> @@ -746,6 +749,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) >>> process2(frame, src, dst); >>> else >>> process4(frame, src, dst); >>> + >>> + debayer_->workPendingMutex_.lock(); >>> + debayer_->workPending_ &= ~(1 << threadIndex_); >>> + debayer_->workPendingMutex_.unlock(); >>> + debayer_->workPendingCv_.notify_one(); >>> } >>> >>> void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) >>> @@ -985,7 +993,21 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output >>> >>> stats_->startFrame(frame); >>> >>> - threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); >>> + workPendingMutex_.lock(); >>> + workPending_ = (1 << threads_.size()) - 1; >>> + workPendingMutex_.unlock(); >>> + >>> + for (auto &thread : threads_) >>> + thread->invokeMethod(&DebayerCpuThread::process, >>> + ConnectionTypeQueued, frame, >>> + in.planes()[0].data(), out.planes()[0].data()); >>> + >>> + { >>> + MutexLocker locker(workPendingMutex_); >>> + workPendingCv_.wait(locker, [&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) { >>> + return workPending_ == 0; >>> + }); >>> + } >>> >>> metadata.planes()[0].bytesused = out.planes()[0].size(); >>> >>> @@ -1004,6 +1026,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output >>> inputBufferReady.emit(input); >>> } >>> >>> +int DebayerCpu::start() >>> +{ >>> + for (auto &thread : threads_) >>> + thread->start(); >>> + >>> + return 0; >>> +} >>> + >>> +void DebayerCpu::stop() >>> +{ >>> + for (auto &thread : threads_) >>> + thread->exit(); >>> + >>> + for (auto &thread : threads_) >>> + thread->wait(); >>> +} >>> + >>> SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize) >>> { >>> Size patternSize = this->patternSize(inputFormat); >>> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h >>> index 780576090..11280c0a1 100644 >>> --- a/src/libcamera/software_isp/debayer_cpu.h >>> +++ b/src/libcamera/software_isp/debayer_cpu.h >>> @@ -16,6 +16,7 @@ >>> #include <vector> >>> >>> #include <libcamera/base/object.h> >>> +#include <libcamera/base/mutex.h> >>> >>> #include "libcamera/internal/bayer_format.h" >>> #include "libcamera/internal/global_configuration.h" >>> @@ -41,6 +42,8 @@ public: >>> std::tuple<unsigned int, unsigned int> >>> strideAndFrameSize(const PixelFormat &outputFormat, const Size &size); >>> void process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms); >>> + int start(); >>> + void stop(); >>> SizeRange sizes(PixelFormat inputFormat, const Size &inputSize); >>> const SharedFD &getStatsFD() { return stats_->getStatsFD(); } >>> >>> @@ -144,6 +147,13 @@ private: >>> std::unique_ptr<SwStatsCpu> stats_; >>> unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ >>> >>> + static constexpr unsigned int kMinThreads = 1; >>> + static constexpr unsigned int kMaxThreads = 8; >>> + static constexpr unsigned int kDefaultThreads = 2; >>> + >>> + unsigned int workPending_ LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_); >>> + Mutex workPendingMutex_; >>> + ConditionVariable workPendingCv_; >>> std::vector<std::unique_ptr<DebayerCpuThread>>threads_; >>> }; >>
diff --git a/Documentation/runtime_configuration.rst b/Documentation/runtime_configuration.rst index e99ef2fb9..07d1a9123 100644 --- a/Documentation/runtime_configuration.rst +++ b/Documentation/runtime_configuration.rst @@ -51,6 +51,7 @@ file structure: measure: skip: # non-negative integer, frames to skip initially number: # non-negative integer, frames to measure + threads: # integer >= 1, number of render threads to use, default 2 Configuration file example -------------------------- diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp index d57d640df..f7c3e1751 100644 --- a/src/libcamera/software_isp/debayer_cpu.cpp +++ b/src/libcamera/software_isp/debayer_cpu.cpp @@ -76,6 +76,7 @@ DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex debayer_(debayer), threadIndex_(threadIndex), enableInputMemcpy_(enableInputMemcpy) { + moveToThread(this); } /** @@ -107,8 +108,10 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat bool enableInputMemcpy = configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true); - /* Just one thread object for now, which will be called inline rather than async */ - threads_.resize(1); + unsigned int threadCount = + configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(kDefaultThreads); + threadCount = std::clamp(threadCount, kMinThreads, kMaxThreads); + threads_.resize(threadCount); for (unsigned int i = 0; i < threads_.size(); i++) threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy); @@ -746,6 +749,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst) process2(frame, src, dst); else process4(frame, src, dst); + + debayer_->workPendingMutex_.lock(); + debayer_->workPending_ &= ~(1 << threadIndex_); + debayer_->workPendingMutex_.unlock(); + debayer_->workPendingCv_.notify_one(); } void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst) @@ -985,7 +993,21 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output stats_->startFrame(frame); - threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data()); + workPendingMutex_.lock(); + workPending_ = (1 << threads_.size()) - 1; + workPendingMutex_.unlock(); + + for (auto &thread : threads_) + thread->invokeMethod(&DebayerCpuThread::process, + ConnectionTypeQueued, frame, + in.planes()[0].data(), out.planes()[0].data()); + + { + MutexLocker locker(workPendingMutex_); + workPendingCv_.wait(locker, [&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) { + return workPending_ == 0; + }); + } metadata.planes()[0].bytesused = out.planes()[0].size(); @@ -1004,6 +1026,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output inputBufferReady.emit(input); } +int DebayerCpu::start() +{ + for (auto &thread : threads_) + thread->start(); + + return 0; +} + +void DebayerCpu::stop() +{ + for (auto &thread : threads_) + thread->exit(); + + for (auto &thread : threads_) + thread->wait(); +} + SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize) { Size patternSize = this->patternSize(inputFormat); diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h index 780576090..11280c0a1 100644 --- a/src/libcamera/software_isp/debayer_cpu.h +++ b/src/libcamera/software_isp/debayer_cpu.h @@ -16,6 +16,7 @@ #include <vector> #include <libcamera/base/object.h> +#include <libcamera/base/mutex.h> #include "libcamera/internal/bayer_format.h" #include "libcamera/internal/global_configuration.h" @@ -41,6 +42,8 @@ public: std::tuple<unsigned int, unsigned int> strideAndFrameSize(const PixelFormat &outputFormat, const Size &size); void process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams ¶ms); + int start(); + void stop(); SizeRange sizes(PixelFormat inputFormat, const Size &inputSize); const SharedFD &getStatsFD() { return stats_->getStatsFD(); } @@ -144,6 +147,13 @@ private: std::unique_ptr<SwStatsCpu> stats_; unsigned int xShift_; /* Offset of 0/1 applied to window_.x */ + static constexpr unsigned int kMinThreads = 1; + static constexpr unsigned int kMaxThreads = 8; + static constexpr unsigned int kDefaultThreads = 2; + + unsigned int workPending_ LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_); + Mutex workPendingMutex_; + ConditionVariable workPendingCv_; std::vector<std::unique_ptr<DebayerCpuThread>>threads_; };
Add CPU soft ISP multi-threading support. Benchmark results for the Arduino Uno-Q with a weak CPU which is good for performance testing, all numbers with an IMX219 running at 3280x2464 -> 3272x2464: 1 thread : 147ms / frame, ~6.5 fps 2 threads: 80ms / frame, ~12.5 fps 3 threads: 65ms / frame, ~15 fps Adding a 4th thread does not improve performance. Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com> --- Changes in v4: - Document "software_isp", "threads" option in runtime_configuration.rst - Add an use constants for min/max/default number of threads Changes in v3: - Adjust for DebayerCpuThread now inheriting from Thread - Use for (auto &thread : threads_) Changes in v2: - Adjust to use the new DebayerCpuThread class introduced in the v2 patch-series - Re-use threads instead of starting new threads every frame --- Documentation/runtime_configuration.rst | 1 + src/libcamera/software_isp/debayer_cpu.cpp | 45 ++++++++++++++++++++-- src/libcamera/software_isp/debayer_cpu.h | 10 +++++ 3 files changed, 53 insertions(+), 3 deletions(-)