[v5,1/1] libcamera: add method to set thread affinity
diff mbox series

Message ID 20241016084449.3566589-2-chenghaoyang@chromium.org
State Superseded
Headers show
Series
  • Add thread affinity
Related show

Commit Message

Cheng-Hao Yang Oct. 16, 2024, 8:43 a.m. UTC
From: Han-Lin Chen <hanlinchen@chromium.org>

Add method to set thread affinity to Thread class.

Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
---
 include/libcamera/base/thread.h |  5 ++++
 src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

Comments

Laurent Pinchart Oct. 16, 2024, 10:08 p.m. UTC | #1
Hi Harvey, Han-Lin,

Thank you for the patch. This is taking shape.

On Wed, Oct 16, 2024 at 08:43:44AM +0000, Harvey Yang wrote:
> From: Han-Lin Chen <hanlinchen@chromium.org>
> 
> Add method to set thread affinity to Thread class.
> 
> Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
> Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
> Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
> ---
>  include/libcamera/base/thread.h |  5 ++++
>  src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
>  2 files changed, 51 insertions(+)
> 
> diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
> index 4f33de63d..3209d4f7c 100644
> --- a/include/libcamera/base/thread.h
> +++ b/include/libcamera/base/thread.h
> @@ -15,6 +15,7 @@
>  
>  #include <libcamera/base/message.h>
>  #include <libcamera/base/signal.h>
> +#include <libcamera/base/span.h>
>  #include <libcamera/base/utils.h>
>  
>  namespace libcamera {
> @@ -35,6 +36,8 @@ public:
>  	void exit(int code = 0);
>  	bool wait(utils::duration duration = utils::duration::max());
>  
> +	int setThreadAffinity(const Span<const unsigned int> &cpus);
> +
>  	bool isRunning();
>  
>  	Signal<> finished;
> @@ -54,6 +57,8 @@ private:
>  	void startThread();
>  	void finishThread();
>  
> +	void setThreadAffinityInternal();
> +
>  	void postMessage(std::unique_ptr<Message> msg, Object *receiver);
>  	void removeMessages(Object *receiver);
>  
> diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
> index 8735670b8..5dc787dba 100644
> --- a/src/libcamera/base/thread.cpp
> +++ b/src/libcamera/base/thread.cpp
> @@ -9,6 +9,7 @@
>  
>  #include <atomic>
>  #include <list>
> +#include <optional>
>  #include <sys/syscall.h>
>  #include <sys/types.h>
>  #include <unistd.h>
> @@ -128,6 +129,8 @@ private:
>  	int exitCode_;
>  
>  	MessageQueue messages_;
> +
> +	std::optional<cpu_set_t> cpuset_ = std::nullopt;

Is there a need to initialize this to std::nullopt, as that's what the
default constructor does ?

>  };
>  
>  /**
> @@ -254,6 +257,8 @@ void Thread::start()
>  	data_->exit_.store(false, std::memory_order_relaxed);
>  
>  	thread_ = std::thread(&Thread::startThread, this);
> +
> +	setThreadAffinityInternal();
>  }
>  
>  void Thread::startThread()
> @@ -410,6 +415,47 @@ bool Thread::wait(utils::duration duration)
>  	return hasFinished;
>  }
>  
> +/**
> + * \brief Set the CPU affinity mask of the thread
> + * \param[in] cpus The list of CPU indices that the thread is set affinity to
> + *
> + * The CPU indices should be within [0, std::thread::hardware_concurrency()).
> + * If any index is invalid, no cpu indices will be set.

 * If any index is invalid, this function won't modify the thread affinity and
 * will return an error.

> + *
> + * \return 0 if all indices are valid, -EINVAL otherwise
> + */
> +int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
> +{
> +	const unsigned int numCpus = std::thread::hardware_concurrency();

https://en.cppreference.com/w/cpp/thread/thread/hardware_concurrency
says:

  Returns the number of concurrent threads supported by the
  implementation. The value should be considered only a hint.

  [snip]

  Return value

  Number of concurrent threads supported. If the value is not well
  defined or not computable, returns 0.

libcxx seems to have the following implementation:

unsigned thread::hardware_concurrency() noexcept {
#if defined(_SC_NPROCESSORS_ONLN)
  long result = sysconf(_SC_NPROCESSORS_ONLN);
  // sysconf returns -1 if the name is invalid, the option does not exist or
  // does not have a definite limit.
  // if sysconf returns some other negative number, we have no idea
  // what is going on. Default to something safe.
  if (result < 0)
    return 0;
  return static_cast<unsigned>(result);
#elif defined(_LIBCPP_WIN32API)
  SYSTEM_INFO info;
  GetSystemInfo(&info);
  return info.dwNumberOfProcessors;
#else // defined(CTL_HW) && defined(HW_NCPU)
  // TODO: grovel through /proc or check cpuid on x86 and similar
  // instructions on other architectures.
#  if defined(_LIBCPP_WARNING)
  _LIBCPP_WARNING("hardware_concurrency not yet implemented")
#  else
#    warning hardware_concurrency not yet implemented
#  endif
  return 0; // Means not computable [thread.thread.static]
#endif // defined(CTL_HW) && defined(HW_NCPU)
}

while libstdc++ uses

#if defined(_GLIBCXX_USE_GET_NPROCS)
# include <sys/sysinfo.h>
# define _GLIBCXX_NPROCS get_nprocs()
#elif defined(_GLIBCXX_USE_PTHREADS_NUM_PROCESSORS_NP)
# define _GLIBCXX_NPROCS pthread_num_processors_np()
#elif defined(_GLIBCXX_USE_SYSCTL_HW_NCPU)
# include <stddef.h>
# include <sys/sysctl.h>
static inline int get_nprocs()
{
 int count;
 size_t size = sizeof(count);
 int mib[] = { CTL_HW, HW_NCPU };
 if (!sysctl(mib, 2, &count, &size, NULL, 0))
   return count;
 return 0;
}
# define _GLIBCXX_NPROCS get_nprocs()
#elif defined(_GLIBCXX_USE_SC_NPROCESSORS_ONLN)
# include <unistd.h>
# define _GLIBCXX_NPROCS sysconf(_SC_NPROCESSORS_ONLN)
#elif defined(_GLIBCXX_USE_SC_NPROC_ONLN)
# include <unistd.h>
# define _GLIBCXX_NPROCS sysconf(_SC_NPROC_ONLN)
#else
# define _GLIBCXX_NPROCS 0
#endif

  unsigned int
  thread::hardware_concurrency() noexcept
  {
    int __n = _GLIBCXX_NPROCS;
    if (__n < 0)
      __n = 0;
    return __n;
  }

get_nprocs() and sysconf(_SC_NPROCESSORS_ONLN) are identical (the latter
is a wrapper around the former). So it seems that, while the C++ API
doesn't guarantee that thread::hardware_concurrency() will provide an
accurate value, the implementations we have will do the right thing.
They're however a bit expensive, as they open a file in /sys, but that's
probably the best we can do.

> +
> +	MutexLocker locker(data_->mutex_);
> +	data_->cpuset_ = cpu_set_t();
> +	CPU_ZERO(&data_->cpuset_.value());
> +
> +	for (const unsigned int &cpu : cpus) {
> +		if (cpu >= numCpus) {
> +			LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;

It's not ignored anymore

			LOG(Thread, Error)
				<< "Invalid CPU " << cpu << " for thread affinity";

> +			return -EINVAL;
> +		}
> +
> +		CPU_SET(cpu, &data_->cpuset_.value());
> +	}
> +
> +	if (data_->running_)
> +		setThreadAffinityInternal();
> +
> +	return 0;
> +}
> +
> +void Thread::setThreadAffinityInternal()
> +{
> +	if (!data_->cpuset_)
> +		return;
> +
> +	const cpu_set_t &cpuset = data_->cpuset_.value();
> +	pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);

With this implementation, it's not possible to set the thread's CPU
affinity before starting the thread. That's a limitation of the C++
std::thread API, it could be done if we used the native pthread API
instead. Is that a problem ? Could you actually point me to code that
uses Thread::setThreadAffinity() ?

Even without the ability to set the affinity before starting the thread,
we could set it from within the thread before calling the run()
function. This would avoid the main workload being run on incorrect
CPUs. Is that something that would be useful ?

Regardless of which option we implement, Thread::setThreadAffinity()
should document what happens if it's called before calling
Thread::start().

> +}
> +
>  /**
>   * \brief Check if the thread is running
>   *
Cheng-Hao Yang Oct. 17, 2024, 5:59 a.m. UTC | #2
Hi Laurent,

I'll update a new version when all changes are applied.

On Thu, Oct 17, 2024 at 6:08 AM Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
>
> Hi Harvey, Han-Lin,
>
> Thank you for the patch. This is taking shape.
>
> On Wed, Oct 16, 2024 at 08:43:44AM +0000, Harvey Yang wrote:
> > From: Han-Lin Chen <hanlinchen@chromium.org>
> >
> > Add method to set thread affinity to Thread class.
> >
> > Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
> > Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
> > Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
> > ---
> >  include/libcamera/base/thread.h |  5 ++++
> >  src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
> >  2 files changed, 51 insertions(+)
> >
> > diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
> > index 4f33de63d..3209d4f7c 100644
> > --- a/include/libcamera/base/thread.h
> > +++ b/include/libcamera/base/thread.h
> > @@ -15,6 +15,7 @@
> >
> >  #include <libcamera/base/message.h>
> >  #include <libcamera/base/signal.h>
> > +#include <libcamera/base/span.h>
> >  #include <libcamera/base/utils.h>
> >
> >  namespace libcamera {
> > @@ -35,6 +36,8 @@ public:
> >       void exit(int code = 0);
> >       bool wait(utils::duration duration = utils::duration::max());
> >
> > +     int setThreadAffinity(const Span<const unsigned int> &cpus);
> > +
> >       bool isRunning();
> >
> >       Signal<> finished;
> > @@ -54,6 +57,8 @@ private:
> >       void startThread();
> >       void finishThread();
> >
> > +     void setThreadAffinityInternal();
> > +
> >       void postMessage(std::unique_ptr<Message> msg, Object *receiver);
> >       void removeMessages(Object *receiver);
> >
> > diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
> > index 8735670b8..5dc787dba 100644
> > --- a/src/libcamera/base/thread.cpp
> > +++ b/src/libcamera/base/thread.cpp
> > @@ -9,6 +9,7 @@
> >
> >  #include <atomic>
> >  #include <list>
> > +#include <optional>
> >  #include <sys/syscall.h>
> >  #include <sys/types.h>
> >  #include <unistd.h>
> > @@ -128,6 +129,8 @@ private:
> >       int exitCode_;
> >
> >       MessageQueue messages_;
> > +
> > +     std::optional<cpu_set_t> cpuset_ = std::nullopt;
>
> Is there a need to initialize this to std::nullopt, as that's what the
> default constructor does ?

Removed.

>
> >  };
> >
> >  /**
> > @@ -254,6 +257,8 @@ void Thread::start()
> >       data_->exit_.store(false, std::memory_order_relaxed);
> >
> >       thread_ = std::thread(&Thread::startThread, this);
> > +
> > +     setThreadAffinityInternal();
> >  }
> >
> >  void Thread::startThread()
> > @@ -410,6 +415,47 @@ bool Thread::wait(utils::duration duration)
> >       return hasFinished;
> >  }
> >
> > +/**
> > + * \brief Set the CPU affinity mask of the thread
> > + * \param[in] cpus The list of CPU indices that the thread is set affinity to
> > + *
> > + * The CPU indices should be within [0, std::thread::hardware_concurrency()).
> > + * If any index is invalid, no cpu indices will be set.
>
>  * If any index is invalid, this function won't modify the thread affinity and
>  * will return an error.

Done.

>
> > + *
> > + * \return 0 if all indices are valid, -EINVAL otherwise
> > + */
> > +int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
> > +{
> > +     const unsigned int numCpus = std::thread::hardware_concurrency();
>
> https://en.cppreference.com/w/cpp/thread/thread/hardware_concurrency
> says:
>
>   Returns the number of concurrent threads supported by the
>   implementation. The value should be considered only a hint.
>
>   [snip]
>
>   Return value
>
>   Number of concurrent threads supported. If the value is not well
>   defined or not computable, returns 0.
>
> libcxx seems to have the following implementation:
>
> unsigned thread::hardware_concurrency() noexcept {
> #if defined(_SC_NPROCESSORS_ONLN)
>   long result = sysconf(_SC_NPROCESSORS_ONLN);
>   // sysconf returns -1 if the name is invalid, the option does not exist or
>   // does not have a definite limit.
>   // if sysconf returns some other negative number, we have no idea
>   // what is going on. Default to something safe.
>   if (result < 0)
>     return 0;
>   return static_cast<unsigned>(result);
> #elif defined(_LIBCPP_WIN32API)
>   SYSTEM_INFO info;
>   GetSystemInfo(&info);
>   return info.dwNumberOfProcessors;
> #else // defined(CTL_HW) && defined(HW_NCPU)
>   // TODO: grovel through /proc or check cpuid on x86 and similar
>   // instructions on other architectures.
> #  if defined(_LIBCPP_WARNING)
>   _LIBCPP_WARNING("hardware_concurrency not yet implemented")
> #  else
> #    warning hardware_concurrency not yet implemented
> #  endif
>   return 0; // Means not computable [thread.thread.static]
> #endif // defined(CTL_HW) && defined(HW_NCPU)
> }
>
> while libstdc++ uses
>
> #if defined(_GLIBCXX_USE_GET_NPROCS)
> # include <sys/sysinfo.h>
> # define _GLIBCXX_NPROCS get_nprocs()
> #elif defined(_GLIBCXX_USE_PTHREADS_NUM_PROCESSORS_NP)
> # define _GLIBCXX_NPROCS pthread_num_processors_np()
> #elif defined(_GLIBCXX_USE_SYSCTL_HW_NCPU)
> # include <stddef.h>
> # include <sys/sysctl.h>
> static inline int get_nprocs()
> {
>  int count;
>  size_t size = sizeof(count);
>  int mib[] = { CTL_HW, HW_NCPU };
>  if (!sysctl(mib, 2, &count, &size, NULL, 0))
>    return count;
>  return 0;
> }
> # define _GLIBCXX_NPROCS get_nprocs()
> #elif defined(_GLIBCXX_USE_SC_NPROCESSORS_ONLN)
> # include <unistd.h>
> # define _GLIBCXX_NPROCS sysconf(_SC_NPROCESSORS_ONLN)
> #elif defined(_GLIBCXX_USE_SC_NPROC_ONLN)
> # include <unistd.h>
> # define _GLIBCXX_NPROCS sysconf(_SC_NPROC_ONLN)
> #else
> # define _GLIBCXX_NPROCS 0
> #endif
>
>   unsigned int
>   thread::hardware_concurrency() noexcept
>   {
>     int __n = _GLIBCXX_NPROCS;
>     if (__n < 0)
>       __n = 0;
>     return __n;
>   }
>
> get_nprocs() and sysconf(_SC_NPROCESSORS_ONLN) are identical (the latter
> is a wrapper around the former). So it seems that, while the C++ API
> doesn't guarantee that thread::hardware_concurrency() will provide an
> accurate value, the implementations we have will do the right thing.
> They're however a bit expensive, as they open a file in /sys, but that's
> probably the best we can do.
>
> > +
> > +     MutexLocker locker(data_->mutex_);
> > +     data_->cpuset_ = cpu_set_t();
> > +     CPU_ZERO(&data_->cpuset_.value());
> > +
> > +     for (const unsigned int &cpu : cpus) {
> > +             if (cpu >= numCpus) {
> > +                     LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;
>
> It's not ignored anymore
>
>                         LOG(Thread, Error)
>                                 << "Invalid CPU " << cpu << " for thread affinity";
>

Right, updated.

> > +                     return -EINVAL;
> > +             }
> > +
> > +             CPU_SET(cpu, &data_->cpuset_.value());
> > +     }
> > +
> > +     if (data_->running_)
> > +             setThreadAffinityInternal();
> > +
> > +     return 0;
> > +}
> > +
> > +void Thread::setThreadAffinityInternal()
> > +{
> > +     if (!data_->cpuset_)
> > +             return;
> > +
> > +     const cpu_set_t &cpuset = data_->cpuset_.value();
> > +     pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);
>
> With this implementation, it's not possible to set the thread's CPU
> affinity before starting the thread. That's a limitation of the C++
> std::thread API, it could be done if we used the native pthread API
> instead. Is that a problem ? Could you actually point me to code that
> uses Thread::setThreadAffinity() ?

There are two main usages in mtkisp7:
1. Setting pipeline handler's thread [1].
2. Setting IPA's two new threads [2].

The first one sets thread affinity to a thread that has already been
started, while the second one set new threads.

By "the native pthread API", you mean `pthread_attr_setaffinity_np`?
This means that we need to implement libcamera::Thread with pthread,
instead of std::thread?

[1]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/libcamera/pipeline/mtkisp7/mtkisp7.cpp;l=511
[2]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/ipa/mtkisp7/mtkisp7.cpp;l=119

>
> Even without the ability to set the affinity before starting the thread,
> we could set it from within the thread before calling the run()
> function. This would avoid the main workload being run on incorrect
> CPUs. Is that something that would be useful ?

You mean calling `setThreadAffinityInterna()l` in `startThread()`
(before `run()`), instead of at the end of `start()`?

>
> Regardless of which option we implement, Thread::setThreadAffinity()
> should document what happens if it's called before calling
> Thread::start().
>
> > +}
> > +
> >  /**
> >   * \brief Check if the thread is running
> >   *
>
> --
> Regards,
>
> Laurent Pinchart

BR,
Harvey
Laurent Pinchart Oct. 22, 2024, 10:49 p.m. UTC | #3
On Thu, Oct 17, 2024 at 01:59:58PM +0800, Cheng-Hao Yang wrote:
> Hi Laurent,
> 
> I'll update a new version when all changes are applied.

Yes, let's finalize the discussion first :-)

> On Thu, Oct 17, 2024 at 6:08 AM Laurent Pinchart wrote:
> > On Wed, Oct 16, 2024 at 08:43:44AM +0000, Harvey Yang wrote:
> > > From: Han-Lin Chen <hanlinchen@chromium.org>
> > >
> > > Add method to set thread affinity to Thread class.
> > >
> > > Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
> > > Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
> > > Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
> > > ---
> > >  include/libcamera/base/thread.h |  5 ++++
> > >  src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
> > >  2 files changed, 51 insertions(+)
> > >
> > > diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
> > > index 4f33de63d..3209d4f7c 100644
> > > --- a/include/libcamera/base/thread.h
> > > +++ b/include/libcamera/base/thread.h
> > > @@ -15,6 +15,7 @@
> > >
> > >  #include <libcamera/base/message.h>
> > >  #include <libcamera/base/signal.h>
> > > +#include <libcamera/base/span.h>
> > >  #include <libcamera/base/utils.h>
> > >
> > >  namespace libcamera {
> > > @@ -35,6 +36,8 @@ public:
> > >       void exit(int code = 0);
> > >       bool wait(utils::duration duration = utils::duration::max());
> > >
> > > +     int setThreadAffinity(const Span<const unsigned int> &cpus);
> > > +
> > >       bool isRunning();
> > >
> > >       Signal<> finished;
> > > @@ -54,6 +57,8 @@ private:
> > >       void startThread();
> > >       void finishThread();
> > >
> > > +     void setThreadAffinityInternal();
> > > +
> > >       void postMessage(std::unique_ptr<Message> msg, Object *receiver);
> > >       void removeMessages(Object *receiver);
> > >
> > > diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
> > > index 8735670b8..5dc787dba 100644
> > > --- a/src/libcamera/base/thread.cpp
> > > +++ b/src/libcamera/base/thread.cpp
> > > @@ -9,6 +9,7 @@
> > >
> > >  #include <atomic>
> > >  #include <list>
> > > +#include <optional>
> > >  #include <sys/syscall.h>
> > >  #include <sys/types.h>
> > >  #include <unistd.h>
> > > @@ -128,6 +129,8 @@ private:
> > >       int exitCode_;
> > >
> > >       MessageQueue messages_;
> > > +
> > > +     std::optional<cpu_set_t> cpuset_ = std::nullopt;
> >
> > Is there a need to initialize this to std::nullopt, as that's what the
> > default constructor does ?
> 
> Removed.
> 
> > >  };
> > >
> > >  /**
> > > @@ -254,6 +257,8 @@ void Thread::start()
> > >       data_->exit_.store(false, std::memory_order_relaxed);
> > >
> > >       thread_ = std::thread(&Thread::startThread, this);
> > > +
> > > +     setThreadAffinityInternal();
> > >  }
> > >
> > >  void Thread::startThread()
> > > @@ -410,6 +415,47 @@ bool Thread::wait(utils::duration duration)
> > >       return hasFinished;
> > >  }
> > >
> > > +/**
> > > + * \brief Set the CPU affinity mask of the thread
> > > + * \param[in] cpus The list of CPU indices that the thread is set affinity to
> > > + *
> > > + * The CPU indices should be within [0, std::thread::hardware_concurrency()).
> > > + * If any index is invalid, no cpu indices will be set.
> >
> >  * If any index is invalid, this function won't modify the thread affinity and
> >  * will return an error.
> 
> Done.
> 
> > > + *
> > > + * \return 0 if all indices are valid, -EINVAL otherwise
> > > + */
> > > +int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
> > > +{
> > > +     const unsigned int numCpus = std::thread::hardware_concurrency();
> >
> > https://en.cppreference.com/w/cpp/thread/thread/hardware_concurrency
> > says:
> >
> >   Returns the number of concurrent threads supported by the
> >   implementation. The value should be considered only a hint.
> >
> >   [snip]
> >
> >   Return value
> >
> >   Number of concurrent threads supported. If the value is not well
> >   defined or not computable, returns 0.
> >
> > libcxx seems to have the following implementation:
> >
> > unsigned thread::hardware_concurrency() noexcept {
> > #if defined(_SC_NPROCESSORS_ONLN)
> >   long result = sysconf(_SC_NPROCESSORS_ONLN);
> >   // sysconf returns -1 if the name is invalid, the option does not exist or
> >   // does not have a definite limit.
> >   // if sysconf returns some other negative number, we have no idea
> >   // what is going on. Default to something safe.
> >   if (result < 0)
> >     return 0;
> >   return static_cast<unsigned>(result);
> > #elif defined(_LIBCPP_WIN32API)
> >   SYSTEM_INFO info;
> >   GetSystemInfo(&info);
> >   return info.dwNumberOfProcessors;
> > #else // defined(CTL_HW) && defined(HW_NCPU)
> >   // TODO: grovel through /proc or check cpuid on x86 and similar
> >   // instructions on other architectures.
> > #  if defined(_LIBCPP_WARNING)
> >   _LIBCPP_WARNING("hardware_concurrency not yet implemented")
> > #  else
> > #    warning hardware_concurrency not yet implemented
> > #  endif
> >   return 0; // Means not computable [thread.thread.static]
> > #endif // defined(CTL_HW) && defined(HW_NCPU)
> > }
> >
> > while libstdc++ uses
> >
> > #if defined(_GLIBCXX_USE_GET_NPROCS)
> > # include <sys/sysinfo.h>
> > # define _GLIBCXX_NPROCS get_nprocs()
> > #elif defined(_GLIBCXX_USE_PTHREADS_NUM_PROCESSORS_NP)
> > # define _GLIBCXX_NPROCS pthread_num_processors_np()
> > #elif defined(_GLIBCXX_USE_SYSCTL_HW_NCPU)
> > # include <stddef.h>
> > # include <sys/sysctl.h>
> > static inline int get_nprocs()
> > {
> >  int count;
> >  size_t size = sizeof(count);
> >  int mib[] = { CTL_HW, HW_NCPU };
> >  if (!sysctl(mib, 2, &count, &size, NULL, 0))
> >    return count;
> >  return 0;
> > }
> > # define _GLIBCXX_NPROCS get_nprocs()
> > #elif defined(_GLIBCXX_USE_SC_NPROCESSORS_ONLN)
> > # include <unistd.h>
> > # define _GLIBCXX_NPROCS sysconf(_SC_NPROCESSORS_ONLN)
> > #elif defined(_GLIBCXX_USE_SC_NPROC_ONLN)
> > # include <unistd.h>
> > # define _GLIBCXX_NPROCS sysconf(_SC_NPROC_ONLN)
> > #else
> > # define _GLIBCXX_NPROCS 0
> > #endif
> >
> >   unsigned int
> >   thread::hardware_concurrency() noexcept
> >   {
> >     int __n = _GLIBCXX_NPROCS;
> >     if (__n < 0)
> >       __n = 0;
> >     return __n;
> >   }
> >
> > get_nprocs() and sysconf(_SC_NPROCESSORS_ONLN) are identical (the latter
> > is a wrapper around the former). So it seems that, while the C++ API
> > doesn't guarantee that thread::hardware_concurrency() will provide an
> > accurate value, the implementations we have will do the right thing.
> > They're however a bit expensive, as they open a file in /sys, but that's
> > probably the best we can do.
> >
> > > +
> > > +     MutexLocker locker(data_->mutex_);
> > > +     data_->cpuset_ = cpu_set_t();
> > > +     CPU_ZERO(&data_->cpuset_.value());
> > > +
> > > +     for (const unsigned int &cpu : cpus) {
> > > +             if (cpu >= numCpus) {
> > > +                     LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;
> >
> > It's not ignored anymore
> >
> >                         LOG(Thread, Error)
> >                                 << "Invalid CPU " << cpu << " for thread affinity";
> >
> 
> Right, updated.
> 
> > > +                     return -EINVAL;
> > > +             }
> > > +
> > > +             CPU_SET(cpu, &data_->cpuset_.value());
> > > +     }
> > > +
> > > +     if (data_->running_)
> > > +             setThreadAffinityInternal();
> > > +
> > > +     return 0;
> > > +}
> > > +
> > > +void Thread::setThreadAffinityInternal()
> > > +{
> > > +     if (!data_->cpuset_)
> > > +             return;
> > > +
> > > +     const cpu_set_t &cpuset = data_->cpuset_.value();
> > > +     pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);
> >
> > With this implementation, it's not possible to set the thread's CPU
> > affinity before starting the thread. That's a limitation of the C++
> > std::thread API, it could be done if we used the native pthread API
> > instead. Is that a problem ? Could you actually point me to code that
> > uses Thread::setThreadAffinity() ?
> 
> There are two main usages in mtkisp7:
> 1. Setting pipeline handler's thread [1].

Heads up on that one, we probably want to handle this in the libcamera
core. Multiple pipeline handlers can run on the same platform, currently
sharing a single camera manager thread. If they all tried to set thread
affinity, that would be messy. I'd like to explore how we could
coordinate that in a cleaner way.

> 2. Setting IPA's two new threads [2].
> 
> The first one sets thread affinity to a thread that has already been
> started, while the second one set new threads.

Out of curiosity, have you measured the difference in performance ? Is
it a matter of pinning the threads to big cores to ensure they have
enough CPU time ? Or a matter of avoiding latency caused by migration ?
Or something else ?

> By "the native pthread API", you mean `pthread_attr_setaffinity_np`?
> This means that we need to implement libcamera::Thread with pthread,
> instead of std::thread?

Yes. It doesn't mean we have to do that, I'm only asking if you think
that would be a good idea, or overkill.

> [1]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/libcamera/pipeline/mtkisp7/mtkisp7.cpp;l=511
> [2]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/ipa/mtkisp7/mtkisp7.cpp;l=119
> 
> > Even without the ability to set the affinity before starting the thread,
> > we could set it from within the thread before calling the run()
> > function. This would avoid the main workload being run on incorrect
> > CPUs. Is that something that would be useful ?
> 
> You mean calling `setThreadAffinityInterna()l` in `startThread()`
> (before `run()`), instead of at the end of `start()`?

Yes. Again, I'm not saing it has to be done, just exploring options.

> > Regardless of which option we implement, Thread::setThreadAffinity()
> > should document what happens if it's called before calling
> > Thread::start().
> >
> > > +}
> > > +
> > >  /**
> > >   * \brief Check if the thread is running
> > >   *
Cheng-Hao Yang Oct. 23, 2024, 6:46 a.m. UTC | #4
Hi Laurent,

On Wed, Oct 23, 2024 at 6:49 AM Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
>
> On Thu, Oct 17, 2024 at 01:59:58PM +0800, Cheng-Hao Yang wrote:
> > Hi Laurent,
> >
> > I'll update a new version when all changes are applied.
>
> Yes, let's finalize the discussion first :-)
>
> > On Thu, Oct 17, 2024 at 6:08 AM Laurent Pinchart wrote:
> > > On Wed, Oct 16, 2024 at 08:43:44AM +0000, Harvey Yang wrote:
> > > > From: Han-Lin Chen <hanlinchen@chromium.org>
> > > >
> > > > Add method to set thread affinity to Thread class.
> > > >
> > > > Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
> > > > Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > ---
> > > >  include/libcamera/base/thread.h |  5 ++++
> > > >  src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
> > > >  2 files changed, 51 insertions(+)
> > > >
> > > > diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
> > > > index 4f33de63d..3209d4f7c 100644
> > > > --- a/include/libcamera/base/thread.h
> > > > +++ b/include/libcamera/base/thread.h
> > > > @@ -15,6 +15,7 @@
> > > >
> > > >  #include <libcamera/base/message.h>
> > > >  #include <libcamera/base/signal.h>
> > > > +#include <libcamera/base/span.h>
> > > >  #include <libcamera/base/utils.h>
> > > >
> > > >  namespace libcamera {
> > > > @@ -35,6 +36,8 @@ public:
> > > >       void exit(int code = 0);
> > > >       bool wait(utils::duration duration = utils::duration::max());
> > > >
> > > > +     int setThreadAffinity(const Span<const unsigned int> &cpus);
> > > > +
> > > >       bool isRunning();
> > > >
> > > >       Signal<> finished;
> > > > @@ -54,6 +57,8 @@ private:
> > > >       void startThread();
> > > >       void finishThread();
> > > >
> > > > +     void setThreadAffinityInternal();
> > > > +
> > > >       void postMessage(std::unique_ptr<Message> msg, Object *receiver);
> > > >       void removeMessages(Object *receiver);
> > > >
> > > > diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
> > > > index 8735670b8..5dc787dba 100644
> > > > --- a/src/libcamera/base/thread.cpp
> > > > +++ b/src/libcamera/base/thread.cpp
> > > > @@ -9,6 +9,7 @@
> > > >
> > > >  #include <atomic>
> > > >  #include <list>
> > > > +#include <optional>
> > > >  #include <sys/syscall.h>
> > > >  #include <sys/types.h>
> > > >  #include <unistd.h>
> > > > @@ -128,6 +129,8 @@ private:
> > > >       int exitCode_;
> > > >
> > > >       MessageQueue messages_;
> > > > +
> > > > +     std::optional<cpu_set_t> cpuset_ = std::nullopt;
> > >
> > > Is there a need to initialize this to std::nullopt, as that's what the
> > > default constructor does ?
> >
> > Removed.
> >
> > > >  };
> > > >
> > > >  /**
> > > > @@ -254,6 +257,8 @@ void Thread::start()
> > > >       data_->exit_.store(false, std::memory_order_relaxed);
> > > >
> > > >       thread_ = std::thread(&Thread::startThread, this);
> > > > +
> > > > +     setThreadAffinityInternal();
> > > >  }
> > > >
> > > >  void Thread::startThread()
> > > > @@ -410,6 +415,47 @@ bool Thread::wait(utils::duration duration)
> > > >       return hasFinished;
> > > >  }
> > > >
> > > > +/**
> > > > + * \brief Set the CPU affinity mask of the thread
> > > > + * \param[in] cpus The list of CPU indices that the thread is set affinity to
> > > > + *
> > > > + * The CPU indices should be within [0, std::thread::hardware_concurrency()).
> > > > + * If any index is invalid, no cpu indices will be set.
> > >
> > >  * If any index is invalid, this function won't modify the thread affinity and
> > >  * will return an error.
> >
> > Done.
> >
> > > > + *
> > > > + * \return 0 if all indices are valid, -EINVAL otherwise
> > > > + */
> > > > +int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
> > > > +{
> > > > +     const unsigned int numCpus = std::thread::hardware_concurrency();
> > >
> > > https://en.cppreference.com/w/cpp/thread/thread/hardware_concurrency
> > > says:
> > >
> > >   Returns the number of concurrent threads supported by the
> > >   implementation. The value should be considered only a hint.
> > >
> > >   [snip]
> > >
> > >   Return value
> > >
> > >   Number of concurrent threads supported. If the value is not well
> > >   defined or not computable, returns 0.
> > >
> > > libcxx seems to have the following implementation:
> > >
> > > unsigned thread::hardware_concurrency() noexcept {
> > > #if defined(_SC_NPROCESSORS_ONLN)
> > >   long result = sysconf(_SC_NPROCESSORS_ONLN);
> > >   // sysconf returns -1 if the name is invalid, the option does not exist or
> > >   // does not have a definite limit.
> > >   // if sysconf returns some other negative number, we have no idea
> > >   // what is going on. Default to something safe.
> > >   if (result < 0)
> > >     return 0;
> > >   return static_cast<unsigned>(result);
> > > #elif defined(_LIBCPP_WIN32API)
> > >   SYSTEM_INFO info;
> > >   GetSystemInfo(&info);
> > >   return info.dwNumberOfProcessors;
> > > #else // defined(CTL_HW) && defined(HW_NCPU)
> > >   // TODO: grovel through /proc or check cpuid on x86 and similar
> > >   // instructions on other architectures.
> > > #  if defined(_LIBCPP_WARNING)
> > >   _LIBCPP_WARNING("hardware_concurrency not yet implemented")
> > > #  else
> > > #    warning hardware_concurrency not yet implemented
> > > #  endif
> > >   return 0; // Means not computable [thread.thread.static]
> > > #endif // defined(CTL_HW) && defined(HW_NCPU)
> > > }
> > >
> > > while libstdc++ uses
> > >
> > > #if defined(_GLIBCXX_USE_GET_NPROCS)
> > > # include <sys/sysinfo.h>
> > > # define _GLIBCXX_NPROCS get_nprocs()
> > > #elif defined(_GLIBCXX_USE_PTHREADS_NUM_PROCESSORS_NP)
> > > # define _GLIBCXX_NPROCS pthread_num_processors_np()
> > > #elif defined(_GLIBCXX_USE_SYSCTL_HW_NCPU)
> > > # include <stddef.h>
> > > # include <sys/sysctl.h>
> > > static inline int get_nprocs()
> > > {
> > >  int count;
> > >  size_t size = sizeof(count);
> > >  int mib[] = { CTL_HW, HW_NCPU };
> > >  if (!sysctl(mib, 2, &count, &size, NULL, 0))
> > >    return count;
> > >  return 0;
> > > }
> > > # define _GLIBCXX_NPROCS get_nprocs()
> > > #elif defined(_GLIBCXX_USE_SC_NPROCESSORS_ONLN)
> > > # include <unistd.h>
> > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROCESSORS_ONLN)
> > > #elif defined(_GLIBCXX_USE_SC_NPROC_ONLN)
> > > # include <unistd.h>
> > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROC_ONLN)
> > > #else
> > > # define _GLIBCXX_NPROCS 0
> > > #endif
> > >
> > >   unsigned int
> > >   thread::hardware_concurrency() noexcept
> > >   {
> > >     int __n = _GLIBCXX_NPROCS;
> > >     if (__n < 0)
> > >       __n = 0;
> > >     return __n;
> > >   }
> > >
> > > get_nprocs() and sysconf(_SC_NPROCESSORS_ONLN) are identical (the latter
> > > is a wrapper around the former). So it seems that, while the C++ API
> > > doesn't guarantee that thread::hardware_concurrency() will provide an
> > > accurate value, the implementations we have will do the right thing.
> > > They're however a bit expensive, as they open a file in /sys, but that's
> > > probably the best we can do.
> > >
> > > > +
> > > > +     MutexLocker locker(data_->mutex_);
> > > > +     data_->cpuset_ = cpu_set_t();
> > > > +     CPU_ZERO(&data_->cpuset_.value());
> > > > +
> > > > +     for (const unsigned int &cpu : cpus) {
> > > > +             if (cpu >= numCpus) {
> > > > +                     LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;
> > >
> > > It's not ignored anymore
> > >
> > >                         LOG(Thread, Error)
> > >                                 << "Invalid CPU " << cpu << " for thread affinity";
> > >
> >
> > Right, updated.
> >
> > > > +                     return -EINVAL;
> > > > +             }
> > > > +
> > > > +             CPU_SET(cpu, &data_->cpuset_.value());
> > > > +     }
> > > > +
> > > > +     if (data_->running_)
> > > > +             setThreadAffinityInternal();
> > > > +
> > > > +     return 0;
> > > > +}
> > > > +
> > > > +void Thread::setThreadAffinityInternal()
> > > > +{
> > > > +     if (!data_->cpuset_)
> > > > +             return;
> > > > +
> > > > +     const cpu_set_t &cpuset = data_->cpuset_.value();
> > > > +     pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);
> > >
> > > With this implementation, it's not possible to set the thread's CPU
> > > affinity before starting the thread. That's a limitation of the C++
> > > std::thread API, it could be done if we used the native pthread API
> > > instead. Is that a problem ? Could you actually point me to code that
> > > uses Thread::setThreadAffinity() ?
> >
> > There are two main usages in mtkisp7:
> > 1. Setting pipeline handler's thread [1].
>
> Heads up on that one, we probably want to handle this in the libcamera
> core. Multiple pipeline handlers can run on the same platform, currently
> sharing a single camera manager thread. If they all tried to set thread
> affinity, that would be messy. I'd like to explore how we could
> coordinate that in a cleaner way.

Yes, your concern makes a lot of sense. If we want to process requests
on a different CPU core in pipeline handlers, would it be better to create
another thread in each pipeline handler? That involves an extra context
switch per frame though...

>
> > 2. Setting IPA's two new threads [2].
> >
> > The first one sets thread affinity to a thread that has already been
> > started, while the second one set new threads.
>
> Out of curiosity, have you measured the difference in performance ? Is
> it a matter of pinning the threads to big cores to ensure they have
> enough CPU time ? Or a matter of avoiding latency caused by migration ?
> Or something else ?

We run CTS, and the current config passes the performance tests with
sufficient fps and low enough latencies.

Without this functionality though, I think we can still somehow pass CTS
with more retries. If it's controversial, we can skip this for now.

We can also allow setting thread affinity when the thread hasn't been
started yet, which at least fits the second use case in mtkisp7.

>
> > By "the native pthread API", you mean `pthread_attr_setaffinity_np`?
> > This means that we need to implement libcamera::Thread with pthread,
> > instead of std::thread?
>
> Yes. It doesn't mean we have to do that, I'm only asking if you think
> that would be a good idea, or overkill.

I find it a bit an overkill :)
The approach below should be much easier.

>
> > [1]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/libcamera/pipeline/mtkisp7/mtkisp7.cpp;l=511
> > [2]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/ipa/mtkisp7/mtkisp7.cpp;l=119
> >
> > > Even without the ability to set the affinity before starting the thread,
> > > we could set it from within the thread before calling the run()
> > > function. This would avoid the main workload being run on incorrect
> > > CPUs. Is that something that would be useful ?
> >
> > You mean calling `setThreadAffinityInterna()l` in `startThread()`
> > (before `run()`), instead of at the end of `start()`?
>
> Yes. Again, I'm not saing it has to be done, just exploring options.

That makes a lot of sense. It works on ciri as well in my trial.

BR,
Harvey

>
> > > Regardless of which option we implement, Thread::setThreadAffinity()
> > > should document what happens if it's called before calling
> > > Thread::start().
> > >
> > > > +}
> > > > +
> > > >  /**
> > > >   * \brief Check if the thread is running
> > > >   *
>
> --
> Regards,
>
> Laurent Pinchart
Laurent Pinchart Oct. 23, 2024, 11:43 p.m. UTC | #5
On Wed, Oct 23, 2024 at 02:46:38PM +0800, Cheng-Hao Yang wrote:
> On Wed, Oct 23, 2024 at 6:49 AM Laurent Pinchart wrote:
> > On Thu, Oct 17, 2024 at 01:59:58PM +0800, Cheng-Hao Yang wrote:
> > > Hi Laurent,
> > >
> > > I'll update a new version when all changes are applied.
> >
> > Yes, let's finalize the discussion first :-)
> >
> > > On Thu, Oct 17, 2024 at 6:08 AM Laurent Pinchart wrote:
> > > > On Wed, Oct 16, 2024 at 08:43:44AM +0000, Harvey Yang wrote:
> > > > > From: Han-Lin Chen <hanlinchen@chromium.org>
> > > > >
> > > > > Add method to set thread affinity to Thread class.
> > > > >
> > > > > Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
> > > > > Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > > Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > > ---
> > > > >  include/libcamera/base/thread.h |  5 ++++
> > > > >  src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
> > > > >  2 files changed, 51 insertions(+)
> > > > >
> > > > > diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
> > > > > index 4f33de63d..3209d4f7c 100644
> > > > > --- a/include/libcamera/base/thread.h
> > > > > +++ b/include/libcamera/base/thread.h
> > > > > @@ -15,6 +15,7 @@
> > > > >
> > > > >  #include <libcamera/base/message.h>
> > > > >  #include <libcamera/base/signal.h>
> > > > > +#include <libcamera/base/span.h>
> > > > >  #include <libcamera/base/utils.h>
> > > > >
> > > > >  namespace libcamera {
> > > > > @@ -35,6 +36,8 @@ public:
> > > > >       void exit(int code = 0);
> > > > >       bool wait(utils::duration duration = utils::duration::max());
> > > > >
> > > > > +     int setThreadAffinity(const Span<const unsigned int> &cpus);
> > > > > +
> > > > >       bool isRunning();
> > > > >
> > > > >       Signal<> finished;
> > > > > @@ -54,6 +57,8 @@ private:
> > > > >       void startThread();
> > > > >       void finishThread();
> > > > >
> > > > > +     void setThreadAffinityInternal();
> > > > > +
> > > > >       void postMessage(std::unique_ptr<Message> msg, Object *receiver);
> > > > >       void removeMessages(Object *receiver);
> > > > >
> > > > > diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
> > > > > index 8735670b8..5dc787dba 100644
> > > > > --- a/src/libcamera/base/thread.cpp
> > > > > +++ b/src/libcamera/base/thread.cpp
> > > > > @@ -9,6 +9,7 @@
> > > > >
> > > > >  #include <atomic>
> > > > >  #include <list>
> > > > > +#include <optional>
> > > > >  #include <sys/syscall.h>
> > > > >  #include <sys/types.h>
> > > > >  #include <unistd.h>
> > > > > @@ -128,6 +129,8 @@ private:
> > > > >       int exitCode_;
> > > > >
> > > > >       MessageQueue messages_;
> > > > > +
> > > > > +     std::optional<cpu_set_t> cpuset_ = std::nullopt;
> > > >
> > > > Is there a need to initialize this to std::nullopt, as that's what the
> > > > default constructor does ?
> > >
> > > Removed.
> > >
> > > > >  };
> > > > >
> > > > >  /**
> > > > > @@ -254,6 +257,8 @@ void Thread::start()
> > > > >       data_->exit_.store(false, std::memory_order_relaxed);
> > > > >
> > > > >       thread_ = std::thread(&Thread::startThread, this);
> > > > > +
> > > > > +     setThreadAffinityInternal();
> > > > >  }
> > > > >
> > > > >  void Thread::startThread()
> > > > > @@ -410,6 +415,47 @@ bool Thread::wait(utils::duration duration)
> > > > >       return hasFinished;
> > > > >  }
> > > > >
> > > > > +/**
> > > > > + * \brief Set the CPU affinity mask of the thread
> > > > > + * \param[in] cpus The list of CPU indices that the thread is set affinity to
> > > > > + *
> > > > > + * The CPU indices should be within [0, std::thread::hardware_concurrency()).
> > > > > + * If any index is invalid, no cpu indices will be set.
> > > >
> > > >  * If any index is invalid, this function won't modify the thread affinity and
> > > >  * will return an error.
> > >
> > > Done.
> > >
> > > > > + *
> > > > > + * \return 0 if all indices are valid, -EINVAL otherwise
> > > > > + */
> > > > > +int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
> > > > > +{
> > > > > +     const unsigned int numCpus = std::thread::hardware_concurrency();
> > > >
> > > > https://en.cppreference.com/w/cpp/thread/thread/hardware_concurrency
> > > > says:
> > > >
> > > >   Returns the number of concurrent threads supported by the
> > > >   implementation. The value should be considered only a hint.
> > > >
> > > >   [snip]
> > > >
> > > >   Return value
> > > >
> > > >   Number of concurrent threads supported. If the value is not well
> > > >   defined or not computable, returns 0.
> > > >
> > > > libcxx seems to have the following implementation:
> > > >
> > > > unsigned thread::hardware_concurrency() noexcept {
> > > > #if defined(_SC_NPROCESSORS_ONLN)
> > > >   long result = sysconf(_SC_NPROCESSORS_ONLN);
> > > >   // sysconf returns -1 if the name is invalid, the option does not exist or
> > > >   // does not have a definite limit.
> > > >   // if sysconf returns some other negative number, we have no idea
> > > >   // what is going on. Default to something safe.
> > > >   if (result < 0)
> > > >     return 0;
> > > >   return static_cast<unsigned>(result);
> > > > #elif defined(_LIBCPP_WIN32API)
> > > >   SYSTEM_INFO info;
> > > >   GetSystemInfo(&info);
> > > >   return info.dwNumberOfProcessors;
> > > > #else // defined(CTL_HW) && defined(HW_NCPU)
> > > >   // TODO: grovel through /proc or check cpuid on x86 and similar
> > > >   // instructions on other architectures.
> > > > #  if defined(_LIBCPP_WARNING)
> > > >   _LIBCPP_WARNING("hardware_concurrency not yet implemented")
> > > > #  else
> > > > #    warning hardware_concurrency not yet implemented
> > > > #  endif
> > > >   return 0; // Means not computable [thread.thread.static]
> > > > #endif // defined(CTL_HW) && defined(HW_NCPU)
> > > > }
> > > >
> > > > while libstdc++ uses
> > > >
> > > > #if defined(_GLIBCXX_USE_GET_NPROCS)
> > > > # include <sys/sysinfo.h>
> > > > # define _GLIBCXX_NPROCS get_nprocs()
> > > > #elif defined(_GLIBCXX_USE_PTHREADS_NUM_PROCESSORS_NP)
> > > > # define _GLIBCXX_NPROCS pthread_num_processors_np()
> > > > #elif defined(_GLIBCXX_USE_SYSCTL_HW_NCPU)
> > > > # include <stddef.h>
> > > > # include <sys/sysctl.h>
> > > > static inline int get_nprocs()
> > > > {
> > > >  int count;
> > > >  size_t size = sizeof(count);
> > > >  int mib[] = { CTL_HW, HW_NCPU };
> > > >  if (!sysctl(mib, 2, &count, &size, NULL, 0))
> > > >    return count;
> > > >  return 0;
> > > > }
> > > > # define _GLIBCXX_NPROCS get_nprocs()
> > > > #elif defined(_GLIBCXX_USE_SC_NPROCESSORS_ONLN)
> > > > # include <unistd.h>
> > > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROCESSORS_ONLN)
> > > > #elif defined(_GLIBCXX_USE_SC_NPROC_ONLN)
> > > > # include <unistd.h>
> > > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROC_ONLN)
> > > > #else
> > > > # define _GLIBCXX_NPROCS 0
> > > > #endif
> > > >
> > > >   unsigned int
> > > >   thread::hardware_concurrency() noexcept
> > > >   {
> > > >     int __n = _GLIBCXX_NPROCS;
> > > >     if (__n < 0)
> > > >       __n = 0;
> > > >     return __n;
> > > >   }
> > > >
> > > > get_nprocs() and sysconf(_SC_NPROCESSORS_ONLN) are identical (the latter
> > > > is a wrapper around the former). So it seems that, while the C++ API
> > > > doesn't guarantee that thread::hardware_concurrency() will provide an
> > > > accurate value, the implementations we have will do the right thing.
> > > > They're however a bit expensive, as they open a file in /sys, but that's
> > > > probably the best we can do.
> > > >
> > > > > +
> > > > > +     MutexLocker locker(data_->mutex_);
> > > > > +     data_->cpuset_ = cpu_set_t();
> > > > > +     CPU_ZERO(&data_->cpuset_.value());
> > > > > +
> > > > > +     for (const unsigned int &cpu : cpus) {
> > > > > +             if (cpu >= numCpus) {
> > > > > +                     LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;
> > > >
> > > > It's not ignored anymore
> > > >
> > > >                         LOG(Thread, Error)
> > > >                                 << "Invalid CPU " << cpu << " for thread affinity";
> > > >
> > >
> > > Right, updated.
> > >
> > > > > +                     return -EINVAL;
> > > > > +             }
> > > > > +
> > > > > +             CPU_SET(cpu, &data_->cpuset_.value());
> > > > > +     }
> > > > > +
> > > > > +     if (data_->running_)
> > > > > +             setThreadAffinityInternal();
> > > > > +
> > > > > +     return 0;
> > > > > +}
> > > > > +
> > > > > +void Thread::setThreadAffinityInternal()
> > > > > +{
> > > > > +     if (!data_->cpuset_)
> > > > > +             return;
> > > > > +
> > > > > +     const cpu_set_t &cpuset = data_->cpuset_.value();
> > > > > +     pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);
> > > >
> > > > With this implementation, it's not possible to set the thread's CPU
> > > > affinity before starting the thread. That's a limitation of the C++
> > > > std::thread API, it could be done if we used the native pthread API
> > > > instead. Is that a problem ? Could you actually point me to code that
> > > > uses Thread::setThreadAffinity() ?
> > >
> > > There are two main usages in mtkisp7:
> > > 1. Setting pipeline handler's thread [1].
> >
> > Heads up on that one, we probably want to handle this in the libcamera
> > core. Multiple pipeline handlers can run on the same platform, currently
> > sharing a single camera manager thread. If they all tried to set thread
> > affinity, that would be messy. I'd like to explore how we could
> > coordinate that in a cleaner way.
> 
> Yes, your concern makes a lot of sense. If we want to process requests
> on a different CPU core in pipeline handlers, would it be better to create
> another thread in each pipeline handler? That involves an extra context
> switch per frame though...

One thread per pipeline handler is an option, yes. I don't think it
requires a context switch though, we don't have to go from the
application thread to the camera manager thread to the pipeline handler
thread, we could bypass the camera manager thread.

Another option that was floated around was to use a thread pool in case
one thread per pipeline handler would create too many threads. It's very
theoretical at this point whether or not this would help improve
performance.

I'm fine keeping a single internal thread in libcamera, but I'd like its
affinity to be managed in a way that doesn't let different pipeline
handlers compete to set different affinities.

> > > 2. Setting IPA's two new threads [2].
> > >
> > > The first one sets thread affinity to a thread that has already been
> > > started, while the second one set new threads.
> >
> > Out of curiosity, have you measured the difference in performance ? Is
> > it a matter of pinning the threads to big cores to ensure they have
> > enough CPU time ? Or a matter of avoiding latency caused by migration ?
> > Or something else ?
> 
> We run CTS, and the current config passes the performance tests with
> sufficient fps and low enough latencies.
> 
> Without this functionality though, I think we can still somehow pass CTS
> with more retries.

It would be interesting to see what kind of time targets are missed
without this change.

> If it's controversial, we can skip this for now.

No, it's not controversial, I'd just like to design it right :-)

> We can also allow setting thread affinity when the thread hasn't been
> started yet, which at least fits the second use case in mtkisp7.
> 
> > > By "the native pthread API", you mean `pthread_attr_setaffinity_np`?
> > > This means that we need to implement libcamera::Thread with pthread,
> > > instead of std::thread?
> >
> > Yes. It doesn't mean we have to do that, I'm only asking if you think
> > that would be a good idea, or overkill.
> 
> I find it a bit an overkill :)
> The approach below should be much easier.

Fine with me.

> > > [1]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/libcamera/pipeline/mtkisp7/mtkisp7.cpp;l=511
> > > [2]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/ipa/mtkisp7/mtkisp7.cpp;l=119
> > >
> > > > Even without the ability to set the affinity before starting the thread,
> > > > we could set it from within the thread before calling the run()
> > > > function. This would avoid the main workload being run on incorrect
> > > > CPUs. Is that something that would be useful ?
> > >
> > > You mean calling `setThreadAffinityInterna()l` in `startThread()`
> > > (before `run()`), instead of at the end of `start()`?
> >
> > Yes. Again, I'm not saing it has to be done, just exploring options.
> 
> That makes a lot of sense. It works on ciri as well in my trial.

Will you submit a new patch with this ?

> > > > Regardless of which option we implement, Thread::setThreadAffinity()
> > > > should document what happens if it's called before calling
> > > > Thread::start().
> > > >
> > > > > +}
> > > > > +
> > > > >  /**
> > > > >   * \brief Check if the thread is running
> > > > >   *
Cheng-Hao Yang Oct. 24, 2024, 8:16 a.m. UTC | #6
Hi Laurent,

On Thu, Oct 24, 2024 at 7:43 AM Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
>
> On Wed, Oct 23, 2024 at 02:46:38PM +0800, Cheng-Hao Yang wrote:
> > On Wed, Oct 23, 2024 at 6:49 AM Laurent Pinchart wrote:
> > > On Thu, Oct 17, 2024 at 01:59:58PM +0800, Cheng-Hao Yang wrote:
> > > > Hi Laurent,
> > > >
> > > > I'll update a new version when all changes are applied.
> > >
> > > Yes, let's finalize the discussion first :-)
> > >
> > > > On Thu, Oct 17, 2024 at 6:08 AM Laurent Pinchart wrote:
> > > > > On Wed, Oct 16, 2024 at 08:43:44AM +0000, Harvey Yang wrote:
> > > > > > From: Han-Lin Chen <hanlinchen@chromium.org>
> > > > > >
> > > > > > Add method to set thread affinity to Thread class.
> > > > > >
> > > > > > Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
> > > > > > Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > > > Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > > > ---
> > > > > >  include/libcamera/base/thread.h |  5 ++++
> > > > > >  src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
> > > > > >  2 files changed, 51 insertions(+)
> > > > > >
> > > > > > diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
> > > > > > index 4f33de63d..3209d4f7c 100644
> > > > > > --- a/include/libcamera/base/thread.h
> > > > > > +++ b/include/libcamera/base/thread.h
> > > > > > @@ -15,6 +15,7 @@
> > > > > >
> > > > > >  #include <libcamera/base/message.h>
> > > > > >  #include <libcamera/base/signal.h>
> > > > > > +#include <libcamera/base/span.h>
> > > > > >  #include <libcamera/base/utils.h>
> > > > > >
> > > > > >  namespace libcamera {
> > > > > > @@ -35,6 +36,8 @@ public:
> > > > > >       void exit(int code = 0);
> > > > > >       bool wait(utils::duration duration = utils::duration::max());
> > > > > >
> > > > > > +     int setThreadAffinity(const Span<const unsigned int> &cpus);
> > > > > > +
> > > > > >       bool isRunning();
> > > > > >
> > > > > >       Signal<> finished;
> > > > > > @@ -54,6 +57,8 @@ private:
> > > > > >       void startThread();
> > > > > >       void finishThread();
> > > > > >
> > > > > > +     void setThreadAffinityInternal();
> > > > > > +
> > > > > >       void postMessage(std::unique_ptr<Message> msg, Object *receiver);
> > > > > >       void removeMessages(Object *receiver);
> > > > > >
> > > > > > diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
> > > > > > index 8735670b8..5dc787dba 100644
> > > > > > --- a/src/libcamera/base/thread.cpp
> > > > > > +++ b/src/libcamera/base/thread.cpp
> > > > > > @@ -9,6 +9,7 @@
> > > > > >
> > > > > >  #include <atomic>
> > > > > >  #include <list>
> > > > > > +#include <optional>
> > > > > >  #include <sys/syscall.h>
> > > > > >  #include <sys/types.h>
> > > > > >  #include <unistd.h>
> > > > > > @@ -128,6 +129,8 @@ private:
> > > > > >       int exitCode_;
> > > > > >
> > > > > >       MessageQueue messages_;
> > > > > > +
> > > > > > +     std::optional<cpu_set_t> cpuset_ = std::nullopt;
> > > > >
> > > > > Is there a need to initialize this to std::nullopt, as that's what the
> > > > > default constructor does ?
> > > >
> > > > Removed.
> > > >
> > > > > >  };
> > > > > >
> > > > > >  /**
> > > > > > @@ -254,6 +257,8 @@ void Thread::start()
> > > > > >       data_->exit_.store(false, std::memory_order_relaxed);
> > > > > >
> > > > > >       thread_ = std::thread(&Thread::startThread, this);
> > > > > > +
> > > > > > +     setThreadAffinityInternal();
> > > > > >  }
> > > > > >
> > > > > >  void Thread::startThread()
> > > > > > @@ -410,6 +415,47 @@ bool Thread::wait(utils::duration duration)
> > > > > >       return hasFinished;
> > > > > >  }
> > > > > >
> > > > > > +/**
> > > > > > + * \brief Set the CPU affinity mask of the thread
> > > > > > + * \param[in] cpus The list of CPU indices that the thread is set affinity to
> > > > > > + *
> > > > > > + * The CPU indices should be within [0, std::thread::hardware_concurrency()).
> > > > > > + * If any index is invalid, no cpu indices will be set.
> > > > >
> > > > >  * If any index is invalid, this function won't modify the thread affinity and
> > > > >  * will return an error.
> > > >
> > > > Done.
> > > >
> > > > > > + *
> > > > > > + * \return 0 if all indices are valid, -EINVAL otherwise
> > > > > > + */
> > > > > > +int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
> > > > > > +{
> > > > > > +     const unsigned int numCpus = std::thread::hardware_concurrency();
> > > > >
> > > > > https://en.cppreference.com/w/cpp/thread/thread/hardware_concurrency
> > > > > says:
> > > > >
> > > > >   Returns the number of concurrent threads supported by the
> > > > >   implementation. The value should be considered only a hint.
> > > > >
> > > > >   [snip]
> > > > >
> > > > >   Return value
> > > > >
> > > > >   Number of concurrent threads supported. If the value is not well
> > > > >   defined or not computable, returns 0.
> > > > >
> > > > > libcxx seems to have the following implementation:
> > > > >
> > > > > unsigned thread::hardware_concurrency() noexcept {
> > > > > #if defined(_SC_NPROCESSORS_ONLN)
> > > > >   long result = sysconf(_SC_NPROCESSORS_ONLN);
> > > > >   // sysconf returns -1 if the name is invalid, the option does not exist or
> > > > >   // does not have a definite limit.
> > > > >   // if sysconf returns some other negative number, we have no idea
> > > > >   // what is going on. Default to something safe.
> > > > >   if (result < 0)
> > > > >     return 0;
> > > > >   return static_cast<unsigned>(result);
> > > > > #elif defined(_LIBCPP_WIN32API)
> > > > >   SYSTEM_INFO info;
> > > > >   GetSystemInfo(&info);
> > > > >   return info.dwNumberOfProcessors;
> > > > > #else // defined(CTL_HW) && defined(HW_NCPU)
> > > > >   // TODO: grovel through /proc or check cpuid on x86 and similar
> > > > >   // instructions on other architectures.
> > > > > #  if defined(_LIBCPP_WARNING)
> > > > >   _LIBCPP_WARNING("hardware_concurrency not yet implemented")
> > > > > #  else
> > > > > #    warning hardware_concurrency not yet implemented
> > > > > #  endif
> > > > >   return 0; // Means not computable [thread.thread.static]
> > > > > #endif // defined(CTL_HW) && defined(HW_NCPU)
> > > > > }
> > > > >
> > > > > while libstdc++ uses
> > > > >
> > > > > #if defined(_GLIBCXX_USE_GET_NPROCS)
> > > > > # include <sys/sysinfo.h>
> > > > > # define _GLIBCXX_NPROCS get_nprocs()
> > > > > #elif defined(_GLIBCXX_USE_PTHREADS_NUM_PROCESSORS_NP)
> > > > > # define _GLIBCXX_NPROCS pthread_num_processors_np()
> > > > > #elif defined(_GLIBCXX_USE_SYSCTL_HW_NCPU)
> > > > > # include <stddef.h>
> > > > > # include <sys/sysctl.h>
> > > > > static inline int get_nprocs()
> > > > > {
> > > > >  int count;
> > > > >  size_t size = sizeof(count);
> > > > >  int mib[] = { CTL_HW, HW_NCPU };
> > > > >  if (!sysctl(mib, 2, &count, &size, NULL, 0))
> > > > >    return count;
> > > > >  return 0;
> > > > > }
> > > > > # define _GLIBCXX_NPROCS get_nprocs()
> > > > > #elif defined(_GLIBCXX_USE_SC_NPROCESSORS_ONLN)
> > > > > # include <unistd.h>
> > > > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROCESSORS_ONLN)
> > > > > #elif defined(_GLIBCXX_USE_SC_NPROC_ONLN)
> > > > > # include <unistd.h>
> > > > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROC_ONLN)
> > > > > #else
> > > > > # define _GLIBCXX_NPROCS 0
> > > > > #endif
> > > > >
> > > > >   unsigned int
> > > > >   thread::hardware_concurrency() noexcept
> > > > >   {
> > > > >     int __n = _GLIBCXX_NPROCS;
> > > > >     if (__n < 0)
> > > > >       __n = 0;
> > > > >     return __n;
> > > > >   }
> > > > >
> > > > > get_nprocs() and sysconf(_SC_NPROCESSORS_ONLN) are identical (the latter
> > > > > is a wrapper around the former). So it seems that, while the C++ API
> > > > > doesn't guarantee that thread::hardware_concurrency() will provide an
> > > > > accurate value, the implementations we have will do the right thing.
> > > > > They're however a bit expensive, as they open a file in /sys, but that's
> > > > > probably the best we can do.
> > > > >
> > > > > > +
> > > > > > +     MutexLocker locker(data_->mutex_);
> > > > > > +     data_->cpuset_ = cpu_set_t();
> > > > > > +     CPU_ZERO(&data_->cpuset_.value());
> > > > > > +
> > > > > > +     for (const unsigned int &cpu : cpus) {
> > > > > > +             if (cpu >= numCpus) {
> > > > > > +                     LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;
> > > > >
> > > > > It's not ignored anymore
> > > > >
> > > > >                         LOG(Thread, Error)
> > > > >                                 << "Invalid CPU " << cpu << " for thread affinity";
> > > > >
> > > >
> > > > Right, updated.
> > > >
> > > > > > +                     return -EINVAL;
> > > > > > +             }
> > > > > > +
> > > > > > +             CPU_SET(cpu, &data_->cpuset_.value());
> > > > > > +     }
> > > > > > +
> > > > > > +     if (data_->running_)
> > > > > > +             setThreadAffinityInternal();
> > > > > > +
> > > > > > +     return 0;
> > > > > > +}
> > > > > > +
> > > > > > +void Thread::setThreadAffinityInternal()
> > > > > > +{
> > > > > > +     if (!data_->cpuset_)
> > > > > > +             return;
> > > > > > +
> > > > > > +     const cpu_set_t &cpuset = data_->cpuset_.value();
> > > > > > +     pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);
> > > > >
> > > > > With this implementation, it's not possible to set the thread's CPU
> > > > > affinity before starting the thread. That's a limitation of the C++
> > > > > std::thread API, it could be done if we used the native pthread API
> > > > > instead. Is that a problem ? Could you actually point me to code that
> > > > > uses Thread::setThreadAffinity() ?
> > > >
> > > > There are two main usages in mtkisp7:
> > > > 1. Setting pipeline handler's thread [1].
> > >
> > > Heads up on that one, we probably want to handle this in the libcamera
> > > core. Multiple pipeline handlers can run on the same platform, currently
> > > sharing a single camera manager thread. If they all tried to set thread
> > > affinity, that would be messy. I'd like to explore how we could
> > > coordinate that in a cleaner way.
> >
> > Yes, your concern makes a lot of sense. If we want to process requests
> > on a different CPU core in pipeline handlers, would it be better to create
> > another thread in each pipeline handler? That involves an extra context
> > switch per frame though...
>
> One thread per pipeline handler is an option, yes. I don't think it
> requires a context switch though, we don't have to go from the
> application thread to the camera manager thread to the pipeline handler
> thread, we could bypass the camera manager thread.

Ah right, libcamera::Camera APIs are mostly thread-safe, I forgot.

>
> Another option that was floated around was to use a thread pool in case
> one thread per pipeline handler would create too many threads. It's very
> theoretical at this point whether or not this would help improve
> performance.

Yeah true. We have much work to do in this approach though.

>
> I'm fine keeping a single internal thread in libcamera, but I'd like its
> affinity to be managed in a way that doesn't let different pipeline
> handlers compete to set different affinities.

Definitely :)
Let me know if I should prevent setting thread affinity in a thread
that's already started.

>
> > > > 2. Setting IPA's two new threads [2].
> > > >
> > > > The first one sets thread affinity to a thread that has already been
> > > > started, while the second one set new threads.
> > >
> > > Out of curiosity, have you measured the difference in performance ? Is
> > > it a matter of pinning the threads to big cores to ensure they have
> > > enough CPU time ? Or a matter of avoiding latency caused by migration ?
> > > Or something else ?
> >
> > We run CTS, and the current config passes the performance tests with
> > sufficient fps and low enough latencies.
> >
> > Without this functionality though, I think we can still somehow pass CTS
> > with more retries.
>
> It would be interesting to see what kind of time targets are missed
> without this change.

For example:
```
android.camera.cts
android.hardware.camera2.cts.RecordingTest#testRecordingFromPersistentSurface[1]
FAILURE: junit.framewo
rk.AssertionFailedError: Camera 1: Video frame drop rate too high:
6.976744%, tolerance 5.000000%. Video size: 1280x720, expectedDuration
[2866.666504,2866.666504], expectedFrameDur
ation 33.333332, frameDropCnt 6, frameCount 86
```

>
> > If it's controversial, we can skip this for now.
>
> No, it's not controversial, I'd just like to design it right :-)
>
> > We can also allow setting thread affinity when the thread hasn't been
> > started yet, which at least fits the second use case in mtkisp7.
> >
> > > > By "the native pthread API", you mean `pthread_attr_setaffinity_np`?
> > > > This means that we need to implement libcamera::Thread with pthread,
> > > > instead of std::thread?
> > >
> > > Yes. It doesn't mean we have to do that, I'm only asking if you think
> > > that would be a good idea, or overkill.
> >
> > I find it a bit an overkill :)
> > The approach below should be much easier.
>
> Fine with me.
>
> > > > [1]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/libcamera/pipeline/mtkisp7/mtkisp7.cpp;l=511
> > > > [2]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/ipa/mtkisp7/mtkisp7.cpp;l=119
> > > >
> > > > > Even without the ability to set the affinity before starting the thread,
> > > > > we could set it from within the thread before calling the run()
> > > > > function. This would avoid the main workload being run on incorrect
> > > > > CPUs. Is that something that would be useful ?
> > > >
> > > > You mean calling `setThreadAffinityInterna()l` in `startThread()`
> > > > (before `run()`), instead of at the end of `start()`?
> > >
> > > Yes. Again, I'm not saing it has to be done, just exploring options.
> >
> > That makes a lot of sense. It works on ciri as well in my trial.
>
> Will you submit a new patch with this ?

Sure. Uploaded.

BR,
Harvey

>
> > > > > Regardless of which option we implement, Thread::setThreadAffinity()
> > > > > should document what happens if it's called before calling
> > > > > Thread::start().
> > > > >
> > > > > > +}
> > > > > > +
> > > > > >  /**
> > > > > >   * \brief Check if the thread is running
> > > > > >   *
>
> --
> Regards,
>
> Laurent Pinchart
Laurent Pinchart Oct. 24, 2024, 8:31 a.m. UTC | #7
On Thu, Oct 24, 2024 at 04:16:03PM +0800, Cheng-Hao Yang wrote:
> On Thu, Oct 24, 2024 at 7:43 AM Laurent Pinchart wrote:
> > On Wed, Oct 23, 2024 at 02:46:38PM +0800, Cheng-Hao Yang wrote:
> > > On Wed, Oct 23, 2024 at 6:49 AM Laurent Pinchart wrote:
> > > > On Thu, Oct 17, 2024 at 01:59:58PM +0800, Cheng-Hao Yang wrote:
> > > > > Hi Laurent,
> > > > >
> > > > > I'll update a new version when all changes are applied.
> > > >
> > > > Yes, let's finalize the discussion first :-)
> > > >
> > > > > On Thu, Oct 17, 2024 at 6:08 AM Laurent Pinchart wrote:
> > > > > > On Wed, Oct 16, 2024 at 08:43:44AM +0000, Harvey Yang wrote:
> > > > > > > From: Han-Lin Chen <hanlinchen@chromium.org>
> > > > > > >
> > > > > > > Add method to set thread affinity to Thread class.
> > > > > > >
> > > > > > > Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
> > > > > > > Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > > > > Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > > > > ---
> > > > > > >  include/libcamera/base/thread.h |  5 ++++
> > > > > > >  src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
> > > > > > >  2 files changed, 51 insertions(+)
> > > > > > >
> > > > > > > diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
> > > > > > > index 4f33de63d..3209d4f7c 100644
> > > > > > > --- a/include/libcamera/base/thread.h
> > > > > > > +++ b/include/libcamera/base/thread.h
> > > > > > > @@ -15,6 +15,7 @@
> > > > > > >
> > > > > > >  #include <libcamera/base/message.h>
> > > > > > >  #include <libcamera/base/signal.h>
> > > > > > > +#include <libcamera/base/span.h>
> > > > > > >  #include <libcamera/base/utils.h>
> > > > > > >
> > > > > > >  namespace libcamera {
> > > > > > > @@ -35,6 +36,8 @@ public:
> > > > > > >       void exit(int code = 0);
> > > > > > >       bool wait(utils::duration duration = utils::duration::max());
> > > > > > >
> > > > > > > +     int setThreadAffinity(const Span<const unsigned int> &cpus);
> > > > > > > +
> > > > > > >       bool isRunning();
> > > > > > >
> > > > > > >       Signal<> finished;
> > > > > > > @@ -54,6 +57,8 @@ private:
> > > > > > >       void startThread();
> > > > > > >       void finishThread();
> > > > > > >
> > > > > > > +     void setThreadAffinityInternal();
> > > > > > > +
> > > > > > >       void postMessage(std::unique_ptr<Message> msg, Object *receiver);
> > > > > > >       void removeMessages(Object *receiver);
> > > > > > >
> > > > > > > diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
> > > > > > > index 8735670b8..5dc787dba 100644
> > > > > > > --- a/src/libcamera/base/thread.cpp
> > > > > > > +++ b/src/libcamera/base/thread.cpp
> > > > > > > @@ -9,6 +9,7 @@
> > > > > > >
> > > > > > >  #include <atomic>
> > > > > > >  #include <list>
> > > > > > > +#include <optional>
> > > > > > >  #include <sys/syscall.h>
> > > > > > >  #include <sys/types.h>
> > > > > > >  #include <unistd.h>
> > > > > > > @@ -128,6 +129,8 @@ private:
> > > > > > >       int exitCode_;
> > > > > > >
> > > > > > >       MessageQueue messages_;
> > > > > > > +
> > > > > > > +     std::optional<cpu_set_t> cpuset_ = std::nullopt;
> > > > > >
> > > > > > Is there a need to initialize this to std::nullopt, as that's what the
> > > > > > default constructor does ?
> > > > >
> > > > > Removed.
> > > > >
> > > > > > >  };
> > > > > > >
> > > > > > >  /**
> > > > > > > @@ -254,6 +257,8 @@ void Thread::start()
> > > > > > >       data_->exit_.store(false, std::memory_order_relaxed);
> > > > > > >
> > > > > > >       thread_ = std::thread(&Thread::startThread, this);
> > > > > > > +
> > > > > > > +     setThreadAffinityInternal();
> > > > > > >  }
> > > > > > >
> > > > > > >  void Thread::startThread()
> > > > > > > @@ -410,6 +415,47 @@ bool Thread::wait(utils::duration duration)
> > > > > > >       return hasFinished;
> > > > > > >  }
> > > > > > >
> > > > > > > +/**
> > > > > > > + * \brief Set the CPU affinity mask of the thread
> > > > > > > + * \param[in] cpus The list of CPU indices that the thread is set affinity to
> > > > > > > + *
> > > > > > > + * The CPU indices should be within [0, std::thread::hardware_concurrency()).
> > > > > > > + * If any index is invalid, no cpu indices will be set.
> > > > > >
> > > > > >  * If any index is invalid, this function won't modify the thread affinity and
> > > > > >  * will return an error.
> > > > >
> > > > > Done.
> > > > >
> > > > > > > + *
> > > > > > > + * \return 0 if all indices are valid, -EINVAL otherwise
> > > > > > > + */
> > > > > > > +int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
> > > > > > > +{
> > > > > > > +     const unsigned int numCpus = std::thread::hardware_concurrency();
> > > > > >
> > > > > > https://en.cppreference.com/w/cpp/thread/thread/hardware_concurrency
> > > > > > says:
> > > > > >
> > > > > >   Returns the number of concurrent threads supported by the
> > > > > >   implementation. The value should be considered only a hint.
> > > > > >
> > > > > >   [snip]
> > > > > >
> > > > > >   Return value
> > > > > >
> > > > > >   Number of concurrent threads supported. If the value is not well
> > > > > >   defined or not computable, returns 0.
> > > > > >
> > > > > > libcxx seems to have the following implementation:
> > > > > >
> > > > > > unsigned thread::hardware_concurrency() noexcept {
> > > > > > #if defined(_SC_NPROCESSORS_ONLN)
> > > > > >   long result = sysconf(_SC_NPROCESSORS_ONLN);
> > > > > >   // sysconf returns -1 if the name is invalid, the option does not exist or
> > > > > >   // does not have a definite limit.
> > > > > >   // if sysconf returns some other negative number, we have no idea
> > > > > >   // what is going on. Default to something safe.
> > > > > >   if (result < 0)
> > > > > >     return 0;
> > > > > >   return static_cast<unsigned>(result);
> > > > > > #elif defined(_LIBCPP_WIN32API)
> > > > > >   SYSTEM_INFO info;
> > > > > >   GetSystemInfo(&info);
> > > > > >   return info.dwNumberOfProcessors;
> > > > > > #else // defined(CTL_HW) && defined(HW_NCPU)
> > > > > >   // TODO: grovel through /proc or check cpuid on x86 and similar
> > > > > >   // instructions on other architectures.
> > > > > > #  if defined(_LIBCPP_WARNING)
> > > > > >   _LIBCPP_WARNING("hardware_concurrency not yet implemented")
> > > > > > #  else
> > > > > > #    warning hardware_concurrency not yet implemented
> > > > > > #  endif
> > > > > >   return 0; // Means not computable [thread.thread.static]
> > > > > > #endif // defined(CTL_HW) && defined(HW_NCPU)
> > > > > > }
> > > > > >
> > > > > > while libstdc++ uses
> > > > > >
> > > > > > #if defined(_GLIBCXX_USE_GET_NPROCS)
> > > > > > # include <sys/sysinfo.h>
> > > > > > # define _GLIBCXX_NPROCS get_nprocs()
> > > > > > #elif defined(_GLIBCXX_USE_PTHREADS_NUM_PROCESSORS_NP)
> > > > > > # define _GLIBCXX_NPROCS pthread_num_processors_np()
> > > > > > #elif defined(_GLIBCXX_USE_SYSCTL_HW_NCPU)
> > > > > > # include <stddef.h>
> > > > > > # include <sys/sysctl.h>
> > > > > > static inline int get_nprocs()
> > > > > > {
> > > > > >  int count;
> > > > > >  size_t size = sizeof(count);
> > > > > >  int mib[] = { CTL_HW, HW_NCPU };
> > > > > >  if (!sysctl(mib, 2, &count, &size, NULL, 0))
> > > > > >    return count;
> > > > > >  return 0;
> > > > > > }
> > > > > > # define _GLIBCXX_NPROCS get_nprocs()
> > > > > > #elif defined(_GLIBCXX_USE_SC_NPROCESSORS_ONLN)
> > > > > > # include <unistd.h>
> > > > > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROCESSORS_ONLN)
> > > > > > #elif defined(_GLIBCXX_USE_SC_NPROC_ONLN)
> > > > > > # include <unistd.h>
> > > > > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROC_ONLN)
> > > > > > #else
> > > > > > # define _GLIBCXX_NPROCS 0
> > > > > > #endif
> > > > > >
> > > > > >   unsigned int
> > > > > >   thread::hardware_concurrency() noexcept
> > > > > >   {
> > > > > >     int __n = _GLIBCXX_NPROCS;
> > > > > >     if (__n < 0)
> > > > > >       __n = 0;
> > > > > >     return __n;
> > > > > >   }
> > > > > >
> > > > > > get_nprocs() and sysconf(_SC_NPROCESSORS_ONLN) are identical (the latter
> > > > > > is a wrapper around the former). So it seems that, while the C++ API
> > > > > > doesn't guarantee that thread::hardware_concurrency() will provide an
> > > > > > accurate value, the implementations we have will do the right thing.
> > > > > > They're however a bit expensive, as they open a file in /sys, but that's
> > > > > > probably the best we can do.
> > > > > >
> > > > > > > +
> > > > > > > +     MutexLocker locker(data_->mutex_);
> > > > > > > +     data_->cpuset_ = cpu_set_t();
> > > > > > > +     CPU_ZERO(&data_->cpuset_.value());
> > > > > > > +
> > > > > > > +     for (const unsigned int &cpu : cpus) {
> > > > > > > +             if (cpu >= numCpus) {
> > > > > > > +                     LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;
> > > > > >
> > > > > > It's not ignored anymore
> > > > > >
> > > > > >                         LOG(Thread, Error)
> > > > > >                                 << "Invalid CPU " << cpu << " for thread affinity";
> > > > > >
> > > > >
> > > > > Right, updated.
> > > > >
> > > > > > > +                     return -EINVAL;
> > > > > > > +             }
> > > > > > > +
> > > > > > > +             CPU_SET(cpu, &data_->cpuset_.value());
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     if (data_->running_)
> > > > > > > +             setThreadAffinityInternal();
> > > > > > > +
> > > > > > > +     return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void Thread::setThreadAffinityInternal()
> > > > > > > +{
> > > > > > > +     if (!data_->cpuset_)
> > > > > > > +             return;
> > > > > > > +
> > > > > > > +     const cpu_set_t &cpuset = data_->cpuset_.value();
> > > > > > > +     pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);
> > > > > >
> > > > > > With this implementation, it's not possible to set the thread's CPU
> > > > > > affinity before starting the thread. That's a limitation of the C++
> > > > > > std::thread API, it could be done if we used the native pthread API
> > > > > > instead. Is that a problem ? Could you actually point me to code that
> > > > > > uses Thread::setThreadAffinity() ?
> > > > >
> > > > > There are two main usages in mtkisp7:
> > > > > 1. Setting pipeline handler's thread [1].
> > > >
> > > > Heads up on that one, we probably want to handle this in the libcamera
> > > > core. Multiple pipeline handlers can run on the same platform, currently
> > > > sharing a single camera manager thread. If they all tried to set thread
> > > > affinity, that would be messy. I'd like to explore how we could
> > > > coordinate that in a cleaner way.
> > >
> > > Yes, your concern makes a lot of sense. If we want to process requests
> > > on a different CPU core in pipeline handlers, would it be better to create
> > > another thread in each pipeline handler? That involves an extra context
> > > switch per frame though...
> >
> > One thread per pipeline handler is an option, yes. I don't think it
> > requires a context switch though, we don't have to go from the
> > application thread to the camera manager thread to the pipeline handler
> > thread, we could bypass the camera manager thread.
> 
> Ah right, libcamera::Camera APIs are mostly thread-safe, I forgot.
> 
> >
> > Another option that was floated around was to use a thread pool in case
> > one thread per pipeline handler would create too many threads. It's very
> > theoretical at this point whether or not this would help improve
> > performance.
> 
> Yeah true. We have much work to do in this approach though.
> 
> >
> > I'm fine keeping a single internal thread in libcamera, but I'd like its
> > affinity to be managed in a way that doesn't let different pipeline
> > handlers compete to set different affinities.
> 
> Definitely :)
> Let me know if I should prevent setting thread affinity in a thread
> that's already started.

I was thinking about that. Can I ask you to weight the pros and cons and
make a proposal, with a rationale ? Generally speaking, this is what I
usually expect from patches for libcamera. When multiple options have
been considered, the reason why a particular one was picked is useful to
document, either in the commit message, or the cover letter. This speeds
up reviews as we then don't have to wonder why particular decisions were
made and investigate alternatives.

> > > > > 2. Setting IPA's two new threads [2].
> > > > >
> > > > > The first one sets thread affinity to a thread that has already been
> > > > > started, while the second one set new threads.
> > > >
> > > > Out of curiosity, have you measured the difference in performance ? Is
> > > > it a matter of pinning the threads to big cores to ensure they have
> > > > enough CPU time ? Or a matter of avoiding latency caused by migration ?
> > > > Or something else ?
> > >
> > > We run CTS, and the current config passes the performance tests with
> > > sufficient fps and low enough latencies.
> > >
> > > Without this functionality though, I think we can still somehow pass CTS
> > > with more retries.
> >
> > It would be interesting to see what kind of time targets are missed
> > without this change.
> 
> For example:
> ```
> android.camera.cts
> android.hardware.camera2.cts.RecordingTest#testRecordingFromPersistentSurface[1]
> FAILURE: junit.framewo
> rk.AssertionFailedError: Camera 1: Video frame drop rate too high:
> 6.976744%, tolerance 5.000000%. Video size: 1280x720, expectedDuration
> [2866.666504,2866.666504], expectedFrameDur
> ation 33.333332, frameDropCnt 6, frameCount 86
> ```

What I meant is information about why these failures occurred and how
this has been debugged. If you don't have that information that's fine,
sometimes one tries a semi-random change that fixes a problem and moves
on without further analysis of the issue.

> > > If it's controversial, we can skip this for now.
> >
> > No, it's not controversial, I'd just like to design it right :-)
> >
> > > We can also allow setting thread affinity when the thread hasn't been
> > > started yet, which at least fits the second use case in mtkisp7.
> > >
> > > > > By "the native pthread API", you mean `pthread_attr_setaffinity_np`?
> > > > > This means that we need to implement libcamera::Thread with pthread,
> > > > > instead of std::thread?
> > > >
> > > > Yes. It doesn't mean we have to do that, I'm only asking if you think
> > > > that would be a good idea, or overkill.
> > >
> > > I find it a bit an overkill :)
> > > The approach below should be much easier.
> >
> > Fine with me.
> >
> > > > > [1]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/libcamera/pipeline/mtkisp7/mtkisp7.cpp;l=511
> > > > > [2]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/ipa/mtkisp7/mtkisp7.cpp;l=119
> > > > >
> > > > > > Even without the ability to set the affinity before starting the thread,
> > > > > > we could set it from within the thread before calling the run()
> > > > > > function. This would avoid the main workload being run on incorrect
> > > > > > CPUs. Is that something that would be useful ?
> > > > >
> > > > > You mean calling `setThreadAffinityInterna()l` in `startThread()`
> > > > > (before `run()`), instead of at the end of `start()`?
> > > >
> > > > Yes. Again, I'm not saing it has to be done, just exploring options.
> > >
> > > That makes a lot of sense. It works on ciri as well in my trial.
> >
> > Will you submit a new patch with this ?
> 
> Sure. Uploaded.
> 
> > > > > > Regardless of which option we implement, Thread::setThreadAffinity()
> > > > > > should document what happens if it's called before calling
> > > > > > Thread::start().
> > > > > >
> > > > > > > +}
> > > > > > > +
> > > > > > >  /**
> > > > > > >   * \brief Check if the thread is running
> > > > > > >   *
Cheng-Hao Yang Oct. 24, 2024, 1:25 p.m. UTC | #8
Hi Laurent,

On Thu, Oct 24, 2024 at 4:31 PM Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
>
> On Thu, Oct 24, 2024 at 04:16:03PM +0800, Cheng-Hao Yang wrote:
> > On Thu, Oct 24, 2024 at 7:43 AM Laurent Pinchart wrote:
> > > On Wed, Oct 23, 2024 at 02:46:38PM +0800, Cheng-Hao Yang wrote:
> > > > On Wed, Oct 23, 2024 at 6:49 AM Laurent Pinchart wrote:
> > > > > On Thu, Oct 17, 2024 at 01:59:58PM +0800, Cheng-Hao Yang wrote:
> > > > > > Hi Laurent,
> > > > > >
> > > > > > I'll update a new version when all changes are applied.
> > > > >
> > > > > Yes, let's finalize the discussion first :-)
> > > > >
> > > > > > On Thu, Oct 17, 2024 at 6:08 AM Laurent Pinchart wrote:
> > > > > > > On Wed, Oct 16, 2024 at 08:43:44AM +0000, Harvey Yang wrote:
> > > > > > > > From: Han-Lin Chen <hanlinchen@chromium.org>
> > > > > > > >
> > > > > > > > Add method to set thread affinity to Thread class.
> > > > > > > >
> > > > > > > > Signed-off-by: Han-Lin Chen <hanlinchen@chromium.org>
> > > > > > > > Co-developed-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > > > > > Signed-off-by: Harvey Yang <chenghaoyang@chromium.org>
> > > > > > > > ---
> > > > > > > >  include/libcamera/base/thread.h |  5 ++++
> > > > > > > >  src/libcamera/base/thread.cpp   | 46 +++++++++++++++++++++++++++++++++
> > > > > > > >  2 files changed, 51 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
> > > > > > > > index 4f33de63d..3209d4f7c 100644
> > > > > > > > --- a/include/libcamera/base/thread.h
> > > > > > > > +++ b/include/libcamera/base/thread.h
> > > > > > > > @@ -15,6 +15,7 @@
> > > > > > > >
> > > > > > > >  #include <libcamera/base/message.h>
> > > > > > > >  #include <libcamera/base/signal.h>
> > > > > > > > +#include <libcamera/base/span.h>
> > > > > > > >  #include <libcamera/base/utils.h>
> > > > > > > >
> > > > > > > >  namespace libcamera {
> > > > > > > > @@ -35,6 +36,8 @@ public:
> > > > > > > >       void exit(int code = 0);
> > > > > > > >       bool wait(utils::duration duration = utils::duration::max());
> > > > > > > >
> > > > > > > > +     int setThreadAffinity(const Span<const unsigned int> &cpus);
> > > > > > > > +
> > > > > > > >       bool isRunning();
> > > > > > > >
> > > > > > > >       Signal<> finished;
> > > > > > > > @@ -54,6 +57,8 @@ private:
> > > > > > > >       void startThread();
> > > > > > > >       void finishThread();
> > > > > > > >
> > > > > > > > +     void setThreadAffinityInternal();
> > > > > > > > +
> > > > > > > >       void postMessage(std::unique_ptr<Message> msg, Object *receiver);
> > > > > > > >       void removeMessages(Object *receiver);
> > > > > > > >
> > > > > > > > diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
> > > > > > > > index 8735670b8..5dc787dba 100644
> > > > > > > > --- a/src/libcamera/base/thread.cpp
> > > > > > > > +++ b/src/libcamera/base/thread.cpp
> > > > > > > > @@ -9,6 +9,7 @@
> > > > > > > >
> > > > > > > >  #include <atomic>
> > > > > > > >  #include <list>
> > > > > > > > +#include <optional>
> > > > > > > >  #include <sys/syscall.h>
> > > > > > > >  #include <sys/types.h>
> > > > > > > >  #include <unistd.h>
> > > > > > > > @@ -128,6 +129,8 @@ private:
> > > > > > > >       int exitCode_;
> > > > > > > >
> > > > > > > >       MessageQueue messages_;
> > > > > > > > +
> > > > > > > > +     std::optional<cpu_set_t> cpuset_ = std::nullopt;
> > > > > > >
> > > > > > > Is there a need to initialize this to std::nullopt, as that's what the
> > > > > > > default constructor does ?
> > > > > >
> > > > > > Removed.
> > > > > >
> > > > > > > >  };
> > > > > > > >
> > > > > > > >  /**
> > > > > > > > @@ -254,6 +257,8 @@ void Thread::start()
> > > > > > > >       data_->exit_.store(false, std::memory_order_relaxed);
> > > > > > > >
> > > > > > > >       thread_ = std::thread(&Thread::startThread, this);
> > > > > > > > +
> > > > > > > > +     setThreadAffinityInternal();
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  void Thread::startThread()
> > > > > > > > @@ -410,6 +415,47 @@ bool Thread::wait(utils::duration duration)
> > > > > > > >       return hasFinished;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +/**
> > > > > > > > + * \brief Set the CPU affinity mask of the thread
> > > > > > > > + * \param[in] cpus The list of CPU indices that the thread is set affinity to
> > > > > > > > + *
> > > > > > > > + * The CPU indices should be within [0, std::thread::hardware_concurrency()).
> > > > > > > > + * If any index is invalid, no cpu indices will be set.
> > > > > > >
> > > > > > >  * If any index is invalid, this function won't modify the thread affinity and
> > > > > > >  * will return an error.
> > > > > >
> > > > > > Done.
> > > > > >
> > > > > > > > + *
> > > > > > > > + * \return 0 if all indices are valid, -EINVAL otherwise
> > > > > > > > + */
> > > > > > > > +int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
> > > > > > > > +{
> > > > > > > > +     const unsigned int numCpus = std::thread::hardware_concurrency();
> > > > > > >
> > > > > > > https://en.cppreference.com/w/cpp/thread/thread/hardware_concurrency
> > > > > > > says:
> > > > > > >
> > > > > > >   Returns the number of concurrent threads supported by the
> > > > > > >   implementation. The value should be considered only a hint.
> > > > > > >
> > > > > > >   [snip]
> > > > > > >
> > > > > > >   Return value
> > > > > > >
> > > > > > >   Number of concurrent threads supported. If the value is not well
> > > > > > >   defined or not computable, returns 0.
> > > > > > >
> > > > > > > libcxx seems to have the following implementation:
> > > > > > >
> > > > > > > unsigned thread::hardware_concurrency() noexcept {
> > > > > > > #if defined(_SC_NPROCESSORS_ONLN)
> > > > > > >   long result = sysconf(_SC_NPROCESSORS_ONLN);
> > > > > > >   // sysconf returns -1 if the name is invalid, the option does not exist or
> > > > > > >   // does not have a definite limit.
> > > > > > >   // if sysconf returns some other negative number, we have no idea
> > > > > > >   // what is going on. Default to something safe.
> > > > > > >   if (result < 0)
> > > > > > >     return 0;
> > > > > > >   return static_cast<unsigned>(result);
> > > > > > > #elif defined(_LIBCPP_WIN32API)
> > > > > > >   SYSTEM_INFO info;
> > > > > > >   GetSystemInfo(&info);
> > > > > > >   return info.dwNumberOfProcessors;
> > > > > > > #else // defined(CTL_HW) && defined(HW_NCPU)
> > > > > > >   // TODO: grovel through /proc or check cpuid on x86 and similar
> > > > > > >   // instructions on other architectures.
> > > > > > > #  if defined(_LIBCPP_WARNING)
> > > > > > >   _LIBCPP_WARNING("hardware_concurrency not yet implemented")
> > > > > > > #  else
> > > > > > > #    warning hardware_concurrency not yet implemented
> > > > > > > #  endif
> > > > > > >   return 0; // Means not computable [thread.thread.static]
> > > > > > > #endif // defined(CTL_HW) && defined(HW_NCPU)
> > > > > > > }
> > > > > > >
> > > > > > > while libstdc++ uses
> > > > > > >
> > > > > > > #if defined(_GLIBCXX_USE_GET_NPROCS)
> > > > > > > # include <sys/sysinfo.h>
> > > > > > > # define _GLIBCXX_NPROCS get_nprocs()
> > > > > > > #elif defined(_GLIBCXX_USE_PTHREADS_NUM_PROCESSORS_NP)
> > > > > > > # define _GLIBCXX_NPROCS pthread_num_processors_np()
> > > > > > > #elif defined(_GLIBCXX_USE_SYSCTL_HW_NCPU)
> > > > > > > # include <stddef.h>
> > > > > > > # include <sys/sysctl.h>
> > > > > > > static inline int get_nprocs()
> > > > > > > {
> > > > > > >  int count;
> > > > > > >  size_t size = sizeof(count);
> > > > > > >  int mib[] = { CTL_HW, HW_NCPU };
> > > > > > >  if (!sysctl(mib, 2, &count, &size, NULL, 0))
> > > > > > >    return count;
> > > > > > >  return 0;
> > > > > > > }
> > > > > > > # define _GLIBCXX_NPROCS get_nprocs()
> > > > > > > #elif defined(_GLIBCXX_USE_SC_NPROCESSORS_ONLN)
> > > > > > > # include <unistd.h>
> > > > > > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROCESSORS_ONLN)
> > > > > > > #elif defined(_GLIBCXX_USE_SC_NPROC_ONLN)
> > > > > > > # include <unistd.h>
> > > > > > > # define _GLIBCXX_NPROCS sysconf(_SC_NPROC_ONLN)
> > > > > > > #else
> > > > > > > # define _GLIBCXX_NPROCS 0
> > > > > > > #endif
> > > > > > >
> > > > > > >   unsigned int
> > > > > > >   thread::hardware_concurrency() noexcept
> > > > > > >   {
> > > > > > >     int __n = _GLIBCXX_NPROCS;
> > > > > > >     if (__n < 0)
> > > > > > >       __n = 0;
> > > > > > >     return __n;
> > > > > > >   }
> > > > > > >
> > > > > > > get_nprocs() and sysconf(_SC_NPROCESSORS_ONLN) are identical (the latter
> > > > > > > is a wrapper around the former). So it seems that, while the C++ API
> > > > > > > doesn't guarantee that thread::hardware_concurrency() will provide an
> > > > > > > accurate value, the implementations we have will do the right thing.
> > > > > > > They're however a bit expensive, as they open a file in /sys, but that's
> > > > > > > probably the best we can do.
> > > > > > >
> > > > > > > > +
> > > > > > > > +     MutexLocker locker(data_->mutex_);
> > > > > > > > +     data_->cpuset_ = cpu_set_t();
> > > > > > > > +     CPU_ZERO(&data_->cpuset_.value());
> > > > > > > > +
> > > > > > > > +     for (const unsigned int &cpu : cpus) {
> > > > > > > > +             if (cpu >= numCpus) {
> > > > > > > > +                     LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;
> > > > > > >
> > > > > > > It's not ignored anymore
> > > > > > >
> > > > > > >                         LOG(Thread, Error)
> > > > > > >                                 << "Invalid CPU " << cpu << " for thread affinity";
> > > > > > >
> > > > > >
> > > > > > Right, updated.
> > > > > >
> > > > > > > > +                     return -EINVAL;
> > > > > > > > +             }
> > > > > > > > +
> > > > > > > > +             CPU_SET(cpu, &data_->cpuset_.value());
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > > +     if (data_->running_)
> > > > > > > > +             setThreadAffinityInternal();
> > > > > > > > +
> > > > > > > > +     return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void Thread::setThreadAffinityInternal()
> > > > > > > > +{
> > > > > > > > +     if (!data_->cpuset_)
> > > > > > > > +             return;
> > > > > > > > +
> > > > > > > > +     const cpu_set_t &cpuset = data_->cpuset_.value();
> > > > > > > > +     pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);
> > > > > > >
> > > > > > > With this implementation, it's not possible to set the thread's CPU
> > > > > > > affinity before starting the thread. That's a limitation of the C++
> > > > > > > std::thread API, it could be done if we used the native pthread API
> > > > > > > instead. Is that a problem ? Could you actually point me to code that
> > > > > > > uses Thread::setThreadAffinity() ?
> > > > > >
> > > > > > There are two main usages in mtkisp7:
> > > > > > 1. Setting pipeline handler's thread [1].
> > > > >
> > > > > Heads up on that one, we probably want to handle this in the libcamera
> > > > > core. Multiple pipeline handlers can run on the same platform, currently
> > > > > sharing a single camera manager thread. If they all tried to set thread
> > > > > affinity, that would be messy. I'd like to explore how we could
> > > > > coordinate that in a cleaner way.
> > > >
> > > > Yes, your concern makes a lot of sense. If we want to process requests
> > > > on a different CPU core in pipeline handlers, would it be better to create
> > > > another thread in each pipeline handler? That involves an extra context
> > > > switch per frame though...
> > >
> > > One thread per pipeline handler is an option, yes. I don't think it
> > > requires a context switch though, we don't have to go from the
> > > application thread to the camera manager thread to the pipeline handler
> > > thread, we could bypass the camera manager thread.
> >
> > Ah right, libcamera::Camera APIs are mostly thread-safe, I forgot.
> >
> > >
> > > Another option that was floated around was to use a thread pool in case
> > > one thread per pipeline handler would create too many threads. It's very
> > > theoretical at this point whether or not this would help improve
> > > performance.
> >
> > Yeah true. We have much work to do in this approach though.
> >
> > >
> > > I'm fine keeping a single internal thread in libcamera, but I'd like its
> > > affinity to be managed in a way that doesn't let different pipeline
> > > handlers compete to set different affinities.
> >
> > Definitely :)
> > Let me know if I should prevent setting thread affinity in a thread
> > that's already started.
>
> I was thinking about that. Can I ask you to weight the pros and cons and
> make a proposal, with a rationale ? Generally speaking, this is what I
> usually expect from patches for libcamera. When multiple options have
> been considered, the reason why a particular one was picked is useful to
> document, either in the commit message, or the cover letter. This speeds
> up reviews as we then don't have to wonder why particular decisions were
> made and investigate alternatives.

Sure, while I still have some questions on the other approach:
If we prevent setting thread affinity in a thread that's already started,
how could we set a pipeline handler's owner thread's affinity?

I assume we were discussing having a separate thread for each pipeline
handler instance. If this is the case, how could we maintain the lifetime
of the thread? (Keeping the thread as a member variable of PipelineHandler
doesn't seem to work) I guess we need to change the lifetime of
PipelineHandler as well, which is currently held by Camera::Private's
shared_ptr.
Otherwise, we need to have an internal thread in PipelineHandler that's not
the PipelineHandler instance (as an Object)'s owner thread.

WDYT?


>
> > > > > > 2. Setting IPA's two new threads [2].
> > > > > >
> > > > > > The first one sets thread affinity to a thread that has already been
> > > > > > started, while the second one set new threads.
> > > > >
> > > > > Out of curiosity, have you measured the difference in performance ? Is
> > > > > it a matter of pinning the threads to big cores to ensure they have
> > > > > enough CPU time ? Or a matter of avoiding latency caused by migration ?
> > > > > Or something else ?
> > > >
> > > > We run CTS, and the current config passes the performance tests with
> > > > sufficient fps and low enough latencies.
> > > >
> > > > Without this functionality though, I think we can still somehow pass CTS
> > > > with more retries.
> > >
> > > It would be interesting to see what kind of time targets are missed
> > > without this change.
> >
> > For example:
> > ```
> > android.camera.cts
> > android.hardware.camera2.cts.RecordingTest#testRecordingFromPersistentSurface[1]
> > FAILURE: junit.framewo
> > rk.AssertionFailedError: Camera 1: Video frame drop rate too high:
> > 6.976744%, tolerance 5.000000%. Video size: 1280x720, expectedDuration
> > [2866.666504,2866.666504], expectedFrameDur
> > ation 33.333332, frameDropCnt 6, frameCount 86
> > ```
>
> What I meant is information about why these failures occurred and how
> this has been debugged. If you don't have that information that's fine,
> sometimes one tries a semi-random change that fixes a problem and moves
> on without further analysis of the issue.

Hmm, I think the failures are quite straight-forward to me that the fps is too
low that there are more frame drops occurring. The fps is low because the tasks
take longer time to be processed.

But yeah, we didn't spend much time being very sure about that. We used big
cores and the issues were solved :)

BR,
Harvey

>
> > > > If it's controversial, we can skip this for now.
> > >
> > > No, it's not controversial, I'd just like to design it right :-)
> > >
> > > > We can also allow setting thread affinity when the thread hasn't been
> > > > started yet, which at least fits the second use case in mtkisp7.
> > > >
> > > > > > By "the native pthread API", you mean `pthread_attr_setaffinity_np`?
> > > > > > This means that we need to implement libcamera::Thread with pthread,
> > > > > > instead of std::thread?
> > > > >
> > > > > Yes. It doesn't mean we have to do that, I'm only asking if you think
> > > > > that would be a good idea, or overkill.
> > > >
> > > > I find it a bit an overkill :)
> > > > The approach below should be much easier.
> > >
> > > Fine with me.
> > >
> > > > > > [1]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/libcamera/pipeline/mtkisp7/mtkisp7.cpp;l=511
> > > > > > [2]: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/third_party/libcamera/mtkisp7/src/ipa/mtkisp7/mtkisp7.cpp;l=119
> > > > > >
> > > > > > > Even without the ability to set the affinity before starting the thread,
> > > > > > > we could set it from within the thread before calling the run()
> > > > > > > function. This would avoid the main workload being run on incorrect
> > > > > > > CPUs. Is that something that would be useful ?
> > > > > >
> > > > > > You mean calling `setThreadAffinityInterna()l` in `startThread()`
> > > > > > (before `run()`), instead of at the end of `start()`?
> > > > >
> > > > > Yes. Again, I'm not saing it has to be done, just exploring options.
> > > >
> > > > That makes a lot of sense. It works on ciri as well in my trial.
> > >
> > > Will you submit a new patch with this ?
> >
> > Sure. Uploaded.
> >
> > > > > > > Regardless of which option we implement, Thread::setThreadAffinity()
> > > > > > > should document what happens if it's called before calling
> > > > > > > Thread::start().
> > > > > > >
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /**
> > > > > > > >   * \brief Check if the thread is running
> > > > > > > >   *
>
> --
> Regards,
>
> Laurent Pinchart

Patch
diff mbox series

diff --git a/include/libcamera/base/thread.h b/include/libcamera/base/thread.h
index 4f33de63d..3209d4f7c 100644
--- a/include/libcamera/base/thread.h
+++ b/include/libcamera/base/thread.h
@@ -15,6 +15,7 @@ 
 
 #include <libcamera/base/message.h>
 #include <libcamera/base/signal.h>
+#include <libcamera/base/span.h>
 #include <libcamera/base/utils.h>
 
 namespace libcamera {
@@ -35,6 +36,8 @@  public:
 	void exit(int code = 0);
 	bool wait(utils::duration duration = utils::duration::max());
 
+	int setThreadAffinity(const Span<const unsigned int> &cpus);
+
 	bool isRunning();
 
 	Signal<> finished;
@@ -54,6 +57,8 @@  private:
 	void startThread();
 	void finishThread();
 
+	void setThreadAffinityInternal();
+
 	void postMessage(std::unique_ptr<Message> msg, Object *receiver);
 	void removeMessages(Object *receiver);
 
diff --git a/src/libcamera/base/thread.cpp b/src/libcamera/base/thread.cpp
index 8735670b8..5dc787dba 100644
--- a/src/libcamera/base/thread.cpp
+++ b/src/libcamera/base/thread.cpp
@@ -9,6 +9,7 @@ 
 
 #include <atomic>
 #include <list>
+#include <optional>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -128,6 +129,8 @@  private:
 	int exitCode_;
 
 	MessageQueue messages_;
+
+	std::optional<cpu_set_t> cpuset_ = std::nullopt;
 };
 
 /**
@@ -254,6 +257,8 @@  void Thread::start()
 	data_->exit_.store(false, std::memory_order_relaxed);
 
 	thread_ = std::thread(&Thread::startThread, this);
+
+	setThreadAffinityInternal();
 }
 
 void Thread::startThread()
@@ -410,6 +415,47 @@  bool Thread::wait(utils::duration duration)
 	return hasFinished;
 }
 
+/**
+ * \brief Set the CPU affinity mask of the thread
+ * \param[in] cpus The list of CPU indices that the thread is set affinity to
+ *
+ * The CPU indices should be within [0, std::thread::hardware_concurrency()).
+ * If any index is invalid, no cpu indices will be set.
+ *
+ * \return 0 if all indices are valid, -EINVAL otherwise
+ */
+int Thread::setThreadAffinity(const Span<const unsigned int> &cpus)
+{
+	const unsigned int numCpus = std::thread::hardware_concurrency();
+
+	MutexLocker locker(data_->mutex_);
+	data_->cpuset_ = cpu_set_t();
+	CPU_ZERO(&data_->cpuset_.value());
+
+	for (const unsigned int &cpu : cpus) {
+		if (cpu >= numCpus) {
+			LOG(Thread, Error) << "Ignore an invalid cpu index: " << cpu;
+			return -EINVAL;
+		}
+
+		CPU_SET(cpu, &data_->cpuset_.value());
+	}
+
+	if (data_->running_)
+		setThreadAffinityInternal();
+
+	return 0;
+}
+
+void Thread::setThreadAffinityInternal()
+{
+	if (!data_->cpuset_)
+		return;
+
+	const cpu_set_t &cpuset = data_->cpuset_.value();
+	pthread_setaffinity_np(thread_.native_handle(), sizeof(cpuset), &cpuset);
+}
+
 /**
  * \brief Check if the thread is running
  *