Message ID | 20210906140152.636883-5-jacopo@jmondi.org |
---|---|
State | Not Applicable |
Delegated to: | Jacopo Mondi |
Headers | show |
Series |
|
Related | show |
Hi Jacopo, thank you for the patch. On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote: > > The CameraDevice class stores one CaptureRequestDescriptor instance > for each capture request queued to the Camera. Such descriptors are > cleared when the Camera completes the Request in the > CameraDevice::requestComplete() slot. > > The CameraDevice class relies on an internal worker to off-load > waiting on synchronization fences, and does unconditionally store > the descriptor in the descriptors_ class member map to handle its > lifetime. > > If waiting on a fence fails, the created descriptor remains in the > descriptors_ map until the next call to stop() or close(), as > requestComplete() will never be called, and the camera > service is never notified about the queueing failure. > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > waiting on a fence has failed, by installing a new requestQueueFailed > Signal. > > The Signal is emitted by the CameraWorker internal worker if waiting on > a fence has failed. The CameraDevice is augmented with a slot connected > to the Signal that removes the descriptor from the map and notifies the > camera framework about the failure. > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> > --- > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > src/android/camera_device.h | 1 + > src/android/camera_worker.cpp | 10 ++++++---- > src/android/camera_worker.h | 9 +++++++-- > 4 files changed, 35 insertions(+), 7 deletions(-) > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > index a0ea138d9499..0ce9b699bfae 100644 > --- a/src/android/camera_device.cpp > +++ b/src/android/camera_device.cpp > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > * lifetime to the descriptor. > */ > - request_ = std::make_unique<CaptureRequest>(camera); > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > } > > /* > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > facing_(CAMERA_FACING_FRONT), orientation_(0) > { > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > maker_ = "libcamera"; > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > return 0; > } > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > +{ > + /* > + * Remove the descriptor from the map if the worker has failed > + * to process it and notify the camera service about the failure. > + */ > + MutexLocker descriptorsLock(descriptorsMutex_); > + auto it = descriptors_.find(request->cookie()); > + if (it == descriptors_.end()) { > + LOG(HAL, Fatal) If I understand correctly, this must not happen because requestQueueFailed() is invoked by worker::stop() and it is executed before descriptors_.clear() always. Correct? Reviewed-by: Hirokazu Honda <hiroh@chromium.org> -Hiro > + << "Unknown request: " << request->cookie(); > + return; > + } > + > + descriptors_.extract(it); > + > + abortRequest(request->camera3Request()); > +} > + > void CameraDevice::requestComplete(Request *request) > { > decltype(descriptors_)::node_type node; > diff --git a/src/android/camera_device.h b/src/android/camera_device.h > index 54c4cb9ab499..65456ecca7e3 100644 > --- a/src/android/camera_device.h > +++ b/src/android/camera_device.h > @@ -83,6 +83,7 @@ private: > std::vector<std::unique_ptr<libcamera::FrameBuffer>> frameBuffers_; > CameraMetadata settings_; > std::unique_ptr<CaptureRequest> request_; > + const camera3_capture_request_t *camera3Request_; > }; > > enum class State { > diff --git a/src/android/camera_worker.cpp b/src/android/camera_worker.cpp > index 98dddd9eb13b..7f39fab01638 100644 > --- a/src/android/camera_worker.cpp > +++ b/src/android/camera_worker.cpp > @@ -27,8 +27,9 @@ LOG_DECLARE_CATEGORY(HAL) > * by the CameraWorker which queues it to the libcamera::Camera after handling > * fences. > */ > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > - : camera_(camera) > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > + const camera3_capture_request_t *camera3Request) > + : camera_(camera), camera3Request_(camera3Request) > { > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > } > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > { > /* Async process the request on the worker which runs its own thread. */ > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > - request); > + this, request); > } > > /* > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > return -errno; > } > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > { > /* Wait on all fences before queuing the Request. */ > for (int fence : request->fences()) { > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > if (ret < 0) { > LOG(HAL, Error) << "Failed waiting for fence: " > << fence << ": " << strerror(-ret); > + context->requestQueueFailed.emit(request); > return; > } > } > diff --git a/src/android/camera_worker.h b/src/android/camera_worker.h > index 67ae50bd9288..86f20f741e54 100644 > --- a/src/android/camera_worker.h > +++ b/src/android/camera_worker.h > @@ -10,6 +10,7 @@ > #include <memory> > > #include <libcamera/base/object.h> > +#include <libcamera/base/signal.h> > #include <libcamera/base/thread.h> > > #include <libcamera/camera.h> > @@ -22,7 +23,8 @@ class CameraDevice; > class CaptureRequest > { > public: > - CaptureRequest(libcamera::Camera *camera); > + CaptureRequest(libcamera::Camera *camera, > + const camera3_capture_request_t *camera3Request); > > const std::vector<int> &fences() const { return acquireFences_; } > libcamera::ControlList &controls() { return request_->controls(); } > @@ -31,6 +33,7 @@ public: > return request_->metadata(); > } > unsigned long cookie() const { return request_->cookie(); } > + const camera3_capture_request_t *camera3Request() const { return camera3Request_; } > > void addBuffer(libcamera::Stream *stream, > libcamera::FrameBuffer *buffer, int fence); > @@ -40,6 +43,7 @@ private: > libcamera::Camera *camera_; > std::vector<int> acquireFences_; > std::unique_ptr<libcamera::Request> request_; > + const camera3_capture_request_t *camera3Request_; > }; > > class CameraWorker : private libcamera::Thread > @@ -51,6 +55,7 @@ public: > void stop(); > > void queueRequest(CaptureRequest *request); > + libcamera::Signal<CaptureRequest *> requestQueueFailed; > > protected: > void run() override; > @@ -59,7 +64,7 @@ private: > class Worker : public libcamera::Object > { > public: > - void processRequest(CaptureRequest *request); > + void processRequest(CameraWorker *context, CaptureRequest *request); > > private: > int waitFence(int fence); > -- > 2.32.0 >
Hi Hiro, On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote: > Hi Jacopo, thank you for the patch. > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > The CameraDevice class stores one CaptureRequestDescriptor instance > > for each capture request queued to the Camera. Such descriptors are > > cleared when the Camera completes the Request in the > > CameraDevice::requestComplete() slot. > > > > The CameraDevice class relies on an internal worker to off-load > > waiting on synchronization fences, and does unconditionally store > > the descriptor in the descriptors_ class member map to handle its > > lifetime. > > > > If waiting on a fence fails, the created descriptor remains in the > > descriptors_ map until the next call to stop() or close(), as > > requestComplete() will never be called, and the camera > > service is never notified about the queueing failure. > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > > waiting on a fence has failed, by installing a new requestQueueFailed > > Signal. > > > > The Signal is emitted by the CameraWorker internal worker if waiting on > > a fence has failed. The CameraDevice is augmented with a slot connected > > to the Signal that removes the descriptor from the map and notifies the > > camera framework about the failure. > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> > > --- > > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > > src/android/camera_device.h | 1 + > > src/android/camera_worker.cpp | 10 ++++++---- > > src/android/camera_worker.h | 9 +++++++-- > > 4 files changed, 35 insertions(+), 7 deletions(-) > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > > index a0ea138d9499..0ce9b699bfae 100644 > > --- a/src/android/camera_device.cpp > > +++ b/src/android/camera_device.cpp > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > > * lifetime to the descriptor. > > */ > > - request_ = std::make_unique<CaptureRequest>(camera); > > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > > } > > > > /* > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > > facing_(CAMERA_FACING_FRONT), orientation_(0) > > { > > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > > > maker_ = "libcamera"; > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > > return 0; > > } > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > > +{ > > + /* > > + * Remove the descriptor from the map if the worker has failed > > + * to process it and notify the camera service about the failure. > > + */ > > + MutexLocker descriptorsLock(descriptorsMutex_); > > + auto it = descriptors_.find(request->cookie()); > > + if (it == descriptors_.end()) { > > + LOG(HAL, Fatal) > > If I understand correctly, this must not happen because Do you mean the it == end() with "this" ? It should not happen as requestQueueFail is called by the worker if it fails to wait on a fence and the descriptor is already part of descriptors. Actually, there's a small window for this to race now that I look at it again -------------------------------------------------------------------------- CameraDevice CameraWorker -------------------------------------------------------------------------- worker_.queueRequest() -> worker.processRequest fence.wait() == error <- requestQueueFailed.emit() requestQueueFailed() it == descriptors_.end() abort(); descriptors_[req] = desc; -------------------------------------------------------------------------- While the intended order of execution is CameraDevice CameraWorker -------------------------------------------------------------------------- worker_.queueRequest() descriptors_[req] = desc; -> worker.processRequest fence.wait() == error <- requestQueueFailed.emit() requestQueueFailed() it == descriptors_.end() abort(); -------------------------------------------------------------------------- I'll make sure to add the descriptor to descriptors_ before calling worker_.queuRequest(). > requestQueueFailed() is invoked by worker::stop() and it is executed Why do you say it is invoked by stop() ? The signal is emitted if waiting on a fence fails as shown... > before descriptors_.clear() always. > Correct? > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org> > > */ > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > > - : camera_(camera) > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > > + const camera3_capture_request_t *camera3Request) > > + : camera_(camera), camera3Request_(camera3Request) > > { > > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > > } > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > > { > > /* Async process the request on the worker which runs its own thread. */ > > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > > - request); > > + this, request); > > } > > > > /* > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > > return -errno; > > } > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > > { > > /* Wait on all fences before queuing the Request. */ > > for (int fence : request->fences()) { > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > > if (ret < 0) { > > LOG(HAL, Error) << "Failed waiting for fence: " > > << fence << ": " << strerror(-ret); > > + context->requestQueueFailed.emit(request); .... Here Is your question about stop() related to dispatching messages before the thread has actually been stopped ? Isn't that been fixed some time ago by overriding CameraWorker::exec() ? Thanks j
Hi Jacopo, On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote: > > Hi Hiro, > > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote: > > Hi Jacopo, thank you for the patch. > > > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > The CameraDevice class stores one CaptureRequestDescriptor instance > > > for each capture request queued to the Camera. Such descriptors are > > > cleared when the Camera completes the Request in the > > > CameraDevice::requestComplete() slot. > > > > > > The CameraDevice class relies on an internal worker to off-load > > > waiting on synchronization fences, and does unconditionally store > > > the descriptor in the descriptors_ class member map to handle its > > > lifetime. > > > > > > If waiting on a fence fails, the created descriptor remains in the > > > descriptors_ map until the next call to stop() or close(), as > > > requestComplete() will never be called, and the camera > > > service is never notified about the queueing failure. > > > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > > > waiting on a fence has failed, by installing a new requestQueueFailed > > > Signal. > > > > > > The Signal is emitted by the CameraWorker internal worker if waiting on > > > a fence has failed. The CameraDevice is augmented with a slot connected > > > to the Signal that removes the descriptor from the map and notifies the > > > camera framework about the failure. > > > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> > > > --- > > > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > > > src/android/camera_device.h | 1 + > > > src/android/camera_worker.cpp | 10 ++++++---- > > > src/android/camera_worker.h | 9 +++++++-- > > > 4 files changed, 35 insertions(+), 7 deletions(-) > > > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > > > index a0ea138d9499..0ce9b699bfae 100644 > > > --- a/src/android/camera_device.cpp > > > +++ b/src/android/camera_device.cpp > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > > > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > > > * lifetime to the descriptor. > > > */ > > > - request_ = std::make_unique<CaptureRequest>(camera); > > > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > > > } > > > > > > /* > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > > > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > > > facing_(CAMERA_FACING_FRONT), orientation_(0) > > > { > > > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > > > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > > > > > maker_ = "libcamera"; > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > > > return 0; > > > } > > > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > > > +{ > > > + /* > > > + * Remove the descriptor from the map if the worker has failed > > > + * to process it and notify the camera service about the failure. > > > + */ > > > + MutexLocker descriptorsLock(descriptorsMutex_); > > > + auto it = descriptors_.find(request->cookie()); > > > + if (it == descriptors_.end()) { > > > + LOG(HAL, Fatal) > > > > If I understand correctly, this must not happen because > > Do you mean the it == end() with "this" ? > > It should not happen as requestQueueFail is called by the worker if it > fails to wait on a fence and the descriptor is already part of > descriptors. > > Actually, there's a small window for this to race now that I look at > it again > > -------------------------------------------------------------------------- > CameraDevice CameraWorker > -------------------------------------------------------------------------- > > worker_.queueRequest() -> > > worker.processRequest > fence.wait() == error > <- requestQueueFailed.emit() > requestQueueFailed() > it == descriptors_.end() > abort(); > > descriptors_[req] = desc; > -------------------------------------------------------------------------- > > While the intended order of execution is > > CameraDevice CameraWorker > -------------------------------------------------------------------------- > > worker_.queueRequest() > descriptors_[req] = desc; > -> > worker.processRequest > fence.wait() == error > <- requestQueueFailed.emit() > requestQueueFailed() > it == descriptors_.end() > abort(); > > -------------------------------------------------------------------------- > > I'll make sure to add the descriptor to descriptors_ before calling > worker_.queuRequest(). > > > requestQueueFailed() is invoked by worker::stop() and it is executed > > Why do you say it is invoked by stop() ? The signal is emitted if > waiting on a fence fails as shown... > I don't probably understand when fence waiting error happens. Could you explain when it happens? It looks like, in the next patch, descriptors_ must be empty in some places. How is it guaranteed that requestQueueFailed() is executed before descriptors_ is empty? -Hiro > > before descriptors_.clear() always. > > Correct? > > > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org> > > > > */ > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > > > - : camera_(camera) > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > > > + const camera3_capture_request_t *camera3Request) > > > + : camera_(camera), camera3Request_(camera3Request) > > > { > > > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > > > } > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > > > { > > > /* Async process the request on the worker which runs its own thread. */ > > > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > > > - request); > > > + this, request); > > > } > > > > > > /* > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > > > return -errno; > > > } > > > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > > > { > > > /* Wait on all fences before queuing the Request. */ > > > for (int fence : request->fences()) { > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > if (ret < 0) { > > > LOG(HAL, Error) << "Failed waiting for fence: " > > > << fence << ": " << strerror(-ret); > > > + context->requestQueueFailed.emit(request); > > .... Here > > Is your question about stop() related to dispatching messages before > the thread has actually been stopped ? Isn't that been fixed some time > ago by overriding CameraWorker::exec() ? > > Thanks > j >
Hi Hiro, On Tue, Sep 07, 2021 at 01:36:43AM +0900, Hirokazu Honda wrote: > Hi Jacopo, > > On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > Hi Hiro, > > > > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote: > > > Hi Jacopo, thank you for the patch. > > > > > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > > > The CameraDevice class stores one CaptureRequestDescriptor instance > > > > for each capture request queued to the Camera. Such descriptors are > > > > cleared when the Camera completes the Request in the > > > > CameraDevice::requestComplete() slot. > > > > > > > > The CameraDevice class relies on an internal worker to off-load > > > > waiting on synchronization fences, and does unconditionally store > > > > the descriptor in the descriptors_ class member map to handle its > > > > lifetime. > > > > > > > > If waiting on a fence fails, the created descriptor remains in the > > > > descriptors_ map until the next call to stop() or close(), as > > > > requestComplete() will never be called, and the camera > > > > service is never notified about the queueing failure. > > > > > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > > > > waiting on a fence has failed, by installing a new requestQueueFailed > > > > Signal. > > > > > > > > The Signal is emitted by the CameraWorker internal worker if waiting on > > > > a fence has failed. The CameraDevice is augmented with a slot connected > > > > to the Signal that removes the descriptor from the map and notifies the > > > > camera framework about the failure. > > > > > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> > > > > --- > > > > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > > > > src/android/camera_device.h | 1 + > > > > src/android/camera_worker.cpp | 10 ++++++---- > > > > src/android/camera_worker.h | 9 +++++++-- > > > > 4 files changed, 35 insertions(+), 7 deletions(-) > > > > > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > > > > index a0ea138d9499..0ce9b699bfae 100644 > > > > --- a/src/android/camera_device.cpp > > > > +++ b/src/android/camera_device.cpp > > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > > > > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > > > > * lifetime to the descriptor. > > > > */ > > > > - request_ = std::make_unique<CaptureRequest>(camera); > > > > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > > > > } > > > > > > > > /* > > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > > > > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > > > > facing_(CAMERA_FACING_FRONT), orientation_(0) > > > > { > > > > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > > > > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > > > > > > > maker_ = "libcamera"; > > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > > > > return 0; > > > > } > > > > > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > > > > +{ > > > > + /* > > > > + * Remove the descriptor from the map if the worker has failed > > > > + * to process it and notify the camera service about the failure. > > > > + */ > > > > + MutexLocker descriptorsLock(descriptorsMutex_); > > > > + auto it = descriptors_.find(request->cookie()); > > > > + if (it == descriptors_.end()) { > > > > + LOG(HAL, Fatal) > > > > > > If I understand correctly, this must not happen because > > > > Do you mean the it == end() with "this" ? > > > > It should not happen as requestQueueFail is called by the worker if it > > fails to wait on a fence and the descriptor is already part of > > descriptors. > > > > Actually, there's a small window for this to race now that I look at > > it again > > > > -------------------------------------------------------------------------- > > CameraDevice CameraWorker > > -------------------------------------------------------------------------- > > > > worker_.queueRequest() -> > > > > worker.processRequest > > fence.wait() == error > > <- requestQueueFailed.emit() > > requestQueueFailed() > > it == descriptors_.end() > > abort(); > > > > descriptors_[req] = desc; > > -------------------------------------------------------------------------- > > > > While the intended order of execution is > > > > CameraDevice CameraWorker > > -------------------------------------------------------------------------- > > > > worker_.queueRequest() > > descriptors_[req] = desc; > > -> > > worker.processRequest > > fence.wait() == error > > <- requestQueueFailed.emit() > > requestQueueFailed() > > it == descriptors_.end() > > abort(); > > > > -------------------------------------------------------------------------- > > > > I'll make sure to add the descriptor to descriptors_ before calling > > worker_.queuRequest(). > > > > > requestQueueFailed() is invoked by worker::stop() and it is executed > > > > Why do you say it is invoked by stop() ? The signal is emitted if > > waiting on a fence fails as shown... > > > > I don't probably understand when fence waiting error happens. > Could you explain when it happens? I wish that doesn't happen, but sometimes things break [1:44:39.146219593] [4470] ERROR HAL camera_worker.cpp:122 Failed waiting for fence: 80: Timer expired don't know why, we have an arbitrary timeout there, that should probably be better characterized. But it guess protecting against poll() errors on a fence is not crazy. > > It looks like, in the next patch, descriptors_ must be empty in some places. Not in some places. after the camera has been stopped it must be empty, in CameraDevice::stop() and CameraDevice::close(), for 2 reasons: 1) It validates the pipeline correctly returns all buffers at Camera::stop() time. If that doesn't happen we'll be missing error notifications to the camera service, I feel it's better to validate that early and ASSERT() during development than chaising errors across the IPC boundary 2) It validates that, if 1) holds, the HAL implementation correctly returns all requests to the camera service. As soon as I added the ASSERT() I stumbled upon a crash, due to the fence wait being failed as shown above. That lead to this patch. > How is it guaranteed that requestQueueFailed() is executed before > descriptors_ is empty? CameraDevice::processCaptureRequest CameraWorker A) worker_.queueRequest(req); B) descriptors_[req] = move(desc) -> queueRequest() processReques() wait(fence) == error requestQueueFailed.emit() <- C) descriptor.erase(req) As said, A) and B) should probably be inverted to make sure the descriptor is queued before calling worker_.queueRequest() (which can lead to emitting requestQueueFailed if waiting on a fence fails). In this version there's a tiny window where the control could be given to the CameraWorker and returned to the CameraDevice after the requestQueueFailed slot has been executed. In that case we would have A, C, B that would lead to the Fatal assertion being triggered. If we guarantee that (A, B) are executed atomically, or simply do B before A, then when the slot is called the descriptor is guaranteed to be in map. I might be missing some races though. Can you point me to the part that doesn't work for you ? Thanks j > -Hiro > > > before descriptors_.clear() always. > > > Correct? > > > > > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org> > > > > > > */ > > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > > > > - : camera_(camera) > > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > > > > + const camera3_capture_request_t *camera3Request) > > > > + : camera_(camera), camera3Request_(camera3Request) > > > > { > > > > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > > > > } > > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > > > > { > > > > /* Async process the request on the worker which runs its own thread. */ > > > > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > > > > - request); > > > > + this, request); > > > > } > > > > > > > > /* > > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > > > > return -errno; > > > > } > > > > > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > > > > { > > > > /* Wait on all fences before queuing the Request. */ > > > > for (int fence : request->fences()) { > > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > > if (ret < 0) { > > > > LOG(HAL, Error) << "Failed waiting for fence: " > > > > << fence << ": " << strerror(-ret); > > > > + context->requestQueueFailed.emit(request); > > > > .... Here > > > > Is your question about stop() related to dispatching messages before > > the thread has actually been stopped ? Isn't that been fixed some time > > ago by overriding CameraWorker::exec() ? > > > > Thanks > > j > >
Hi Jacopo, On Tue, Sep 7, 2021 at 2:04 AM Jacopo Mondi <jacopo@jmondi.org> wrote: > > Hi Hiro, > > On Tue, Sep 07, 2021 at 01:36:43AM +0900, Hirokazu Honda wrote: > > Hi Jacopo, > > > > On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > Hi Hiro, > > > > > > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote: > > > > Hi Jacopo, thank you for the patch. > > > > > > > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > > > > > The CameraDevice class stores one CaptureRequestDescriptor instance > > > > > for each capture request queued to the Camera. Such descriptors are > > > > > cleared when the Camera completes the Request in the > > > > > CameraDevice::requestComplete() slot. > > > > > > > > > > The CameraDevice class relies on an internal worker to off-load > > > > > waiting on synchronization fences, and does unconditionally store > > > > > the descriptor in the descriptors_ class member map to handle its > > > > > lifetime. > > > > > > > > > > If waiting on a fence fails, the created descriptor remains in the > > > > > descriptors_ map until the next call to stop() or close(), as > > > > > requestComplete() will never be called, and the camera > > > > > service is never notified about the queueing failure. > > > > > > > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > > > > > waiting on a fence has failed, by installing a new requestQueueFailed > > > > > Signal. > > > > > > > > > > The Signal is emitted by the CameraWorker internal worker if waiting on > > > > > a fence has failed. The CameraDevice is augmented with a slot connected > > > > > to the Signal that removes the descriptor from the map and notifies the > > > > > camera framework about the failure. > > > > > > > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> > > > > > --- > > > > > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > > > > > src/android/camera_device.h | 1 + > > > > > src/android/camera_worker.cpp | 10 ++++++---- > > > > > src/android/camera_worker.h | 9 +++++++-- > > > > > 4 files changed, 35 insertions(+), 7 deletions(-) > > > > > > > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > > > > > index a0ea138d9499..0ce9b699bfae 100644 > > > > > --- a/src/android/camera_device.cpp > > > > > +++ b/src/android/camera_device.cpp > > > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > > > > > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > > > > > * lifetime to the descriptor. > > > > > */ > > > > > - request_ = std::make_unique<CaptureRequest>(camera); > > > > > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > > > > > } > > > > > > > > > > /* > > > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > > > > > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > > > > > facing_(CAMERA_FACING_FRONT), orientation_(0) > > > > > { > > > > > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > > > > > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > > > > > > > > > maker_ = "libcamera"; > > > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > > > > > return 0; > > > > > } > > > > > > > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > > > > > +{ > > > > > + /* > > > > > + * Remove the descriptor from the map if the worker has failed > > > > > + * to process it and notify the camera service about the failure. > > > > > + */ > > > > > + MutexLocker descriptorsLock(descriptorsMutex_); > > > > > + auto it = descriptors_.find(request->cookie()); > > > > > + if (it == descriptors_.end()) { > > > > > + LOG(HAL, Fatal) > > > > > > > > If I understand correctly, this must not happen because > > > > > > Do you mean the it == end() with "this" ? > > > > > > It should not happen as requestQueueFail is called by the worker if it > > > fails to wait on a fence and the descriptor is already part of > > > descriptors. > > > > > > Actually, there's a small window for this to race now that I look at > > > it again > > > > > > -------------------------------------------------------------------------- > > > CameraDevice CameraWorker > > > -------------------------------------------------------------------------- > > > > > > worker_.queueRequest() -> > > > > > > worker.processRequest > > > fence.wait() == error > > > <- requestQueueFailed.emit() > > > requestQueueFailed() > > > it == descriptors_.end() > > > abort(); > > > > > > descriptors_[req] = desc; > > > -------------------------------------------------------------------------- > > > > > > While the intended order of execution is > > > > > > CameraDevice CameraWorker > > > -------------------------------------------------------------------------- > > > > > > worker_.queueRequest() > > > descriptors_[req] = desc; > > > -> > > > worker.processRequest > > > fence.wait() == error > > > <- requestQueueFailed.emit() > > > requestQueueFailed() > > > it == descriptors_.end() > > > abort(); > > > > > > -------------------------------------------------------------------------- > > > > > > I'll make sure to add the descriptor to descriptors_ before calling > > > worker_.queuRequest(). > > > > > > > requestQueueFailed() is invoked by worker::stop() and it is executed > > > > > > Why do you say it is invoked by stop() ? The signal is emitted if > > > waiting on a fence fails as shown... > > > > > > > I don't probably understand when fence waiting error happens. > > Could you explain when it happens? > > I wish that doesn't happen, but sometimes things break > > [1:44:39.146219593] [4470] ERROR HAL camera_worker.cpp:122 Failed waiting for fence: 80: Timer expired > > don't know why, we have an arbitrary timeout there, that should > probably be better characterized. But it guess protecting against > poll() errors on a fence is not crazy. > > > > > It looks like, in the next patch, descriptors_ must be empty in some places. > > Not in some places. after the camera has been stopped it must be > empty, in CameraDevice::stop() and CameraDevice::close(), for 2 reasons: > > 1) It validates the pipeline correctly returns all buffers at > Camera::stop() time. If that doesn't happen we'll be missing error > notifications to the camera service, I feel it's better to validate > that early and ASSERT() during development than chaising errors across > the IPC boundary > > 2) It validates that, if 1) holds, the HAL implementation > correctly returns all requests to the camera service. As soon as I > added the ASSERT() I stumbled upon a crash, due to the fence wait being > failed as shown above. That lead to this patch. > > > How is it guaranteed that requestQueueFailed() is executed before > > descriptors_ is empty? > > CameraDevice::processCaptureRequest CameraWorker > A) worker_.queueRequest(req); > B) descriptors_[req] = move(desc) > > -> queueRequest() > processReques() > > wait(fence) == error > requestQueueFailed.emit() > <- > C) descriptor.erase(req) > > > As said, A) and B) should probably be inverted to make sure the > descriptor is queued before calling worker_.queueRequest() (which can > lead to emitting requestQueueFailed if waiting on a fence fails). > > In this version there's a tiny window where the control could be given > to the CameraWorker and returned to the CameraDevice after the > requestQueueFailed slot has been executed. In that case we would have > A, C, B that would lead to the Fatal assertion being triggered. > > If we guarantee that (A, B) are executed atomically, or simply do B > before A, then when the slot is called the descriptor is guaranteed to > be in map. > > I might be missing some races though. Can you point me to the part > that doesn't work for you ? > I think, when CameraDevice::stop() and CameraDevice::flush() is called, there may be a pending request waiting on fence. So considering from the ASSERT, Worker::stop() causes requestQueueFailed() for the waiting request. But if I remember/understand correctly, Worker::stop() flushes pending requests in a worker queue, that is, requests for which processRequest() has not been called. Therefore descriptors_ might not be empty thanks to the request waiting on fence. I probably miss something here. -Hiro > Thanks > j > > > > > -Hiro > > > > before descriptors_.clear() always. > > > > Correct? > > > > > > > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org> > > > > > > > > */ > > > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > > > > > - : camera_(camera) > > > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > > > > > + const camera3_capture_request_t *camera3Request) > > > > > + : camera_(camera), camera3Request_(camera3Request) > > > > > { > > > > > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > > > > > } > > > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > > > > > { > > > > > /* Async process the request on the worker which runs its own thread. */ > > > > > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > > > > > - request); > > > > > + this, request); > > > > > } > > > > > > > > > > /* > > > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > > > > > return -errno; > > > > > } > > > > > > > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > > > > > { > > > > > /* Wait on all fences before queuing the Request. */ > > > > > for (int fence : request->fences()) { > > > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > > > if (ret < 0) { > > > > > LOG(HAL, Error) << "Failed waiting for fence: " > > > > > << fence << ": " << strerror(-ret); > > > > > + context->requestQueueFailed.emit(request); > > > > > > .... Here > > > > > > Is your question about stop() related to dispatching messages before > > > the thread has actually been stopped ? Isn't that been fixed some time > > > ago by overriding CameraWorker::exec() ? > > > > > > Thanks > > > j > > >
Hi Hiro, On Tue, Sep 07, 2021 at 05:32:29PM +0900, Hirokazu Honda wrote: > Hi Jacopo, > > On Tue, Sep 7, 2021 at 2:04 AM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > Hi Hiro, > > > > On Tue, Sep 07, 2021 at 01:36:43AM +0900, Hirokazu Honda wrote: > > > Hi Jacopo, > > > > > > On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > > > Hi Hiro, > > > > > > > > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote: > > > > > Hi Jacopo, thank you for the patch. > > > > > > > > > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > > > > > > > The CameraDevice class stores one CaptureRequestDescriptor instance > > > > > > for each capture request queued to the Camera. Such descriptors are > > > > > > cleared when the Camera completes the Request in the > > > > > > CameraDevice::requestComplete() slot. > > > > > > > > > > > > The CameraDevice class relies on an internal worker to off-load > > > > > > waiting on synchronization fences, and does unconditionally store > > > > > > the descriptor in the descriptors_ class member map to handle its > > > > > > lifetime. > > > > > > > > > > > > If waiting on a fence fails, the created descriptor remains in the > > > > > > descriptors_ map until the next call to stop() or close(), as > > > > > > requestComplete() will never be called, and the camera > > > > > > service is never notified about the queueing failure. > > > > > > > > > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > > > > > > waiting on a fence has failed, by installing a new requestQueueFailed > > > > > > Signal. > > > > > > > > > > > > The Signal is emitted by the CameraWorker internal worker if waiting on > > > > > > a fence has failed. The CameraDevice is augmented with a slot connected > > > > > > to the Signal that removes the descriptor from the map and notifies the > > > > > > camera framework about the failure. > > > > > > > > > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> > > > > > > --- > > > > > > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > > > > > > src/android/camera_device.h | 1 + > > > > > > src/android/camera_worker.cpp | 10 ++++++---- > > > > > > src/android/camera_worker.h | 9 +++++++-- > > > > > > 4 files changed, 35 insertions(+), 7 deletions(-) > > > > > > > > > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > > > > > > index a0ea138d9499..0ce9b699bfae 100644 > > > > > > --- a/src/android/camera_device.cpp > > > > > > +++ b/src/android/camera_device.cpp > > > > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > > > > > > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > > > > > > * lifetime to the descriptor. > > > > > > */ > > > > > > - request_ = std::make_unique<CaptureRequest>(camera); > > > > > > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > > > > > > } > > > > > > > > > > > > /* > > > > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > > > > > > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > > > > > > facing_(CAMERA_FACING_FRONT), orientation_(0) > > > > > > { > > > > > > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > > > > > > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > > > > > > > > > > > maker_ = "libcamera"; > > > > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > > > > > > return 0; > > > > > > } > > > > > > > > > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > > > > > > +{ > > > > > > + /* > > > > > > + * Remove the descriptor from the map if the worker has failed > > > > > > + * to process it and notify the camera service about the failure. > > > > > > + */ > > > > > > + MutexLocker descriptorsLock(descriptorsMutex_); > > > > > > + auto it = descriptors_.find(request->cookie()); > > > > > > + if (it == descriptors_.end()) { > > > > > > + LOG(HAL, Fatal) > > > > > > > > > > If I understand correctly, this must not happen because > > > > > > > > Do you mean the it == end() with "this" ? > > > > > > > > It should not happen as requestQueueFail is called by the worker if it > > > > fails to wait on a fence and the descriptor is already part of > > > > descriptors. > > > > > > > > Actually, there's a small window for this to race now that I look at > > > > it again > > > > > > > > -------------------------------------------------------------------------- > > > > CameraDevice CameraWorker > > > > -------------------------------------------------------------------------- > > > > > > > > worker_.queueRequest() -> > > > > > > > > worker.processRequest > > > > fence.wait() == error > > > > <- requestQueueFailed.emit() > > > > requestQueueFailed() > > > > it == descriptors_.end() > > > > abort(); > > > > > > > > descriptors_[req] = desc; > > > > -------------------------------------------------------------------------- > > > > > > > > While the intended order of execution is > > > > > > > > CameraDevice CameraWorker > > > > -------------------------------------------------------------------------- > > > > > > > > worker_.queueRequest() > > > > descriptors_[req] = desc; > > > > -> > > > > worker.processRequest > > > > fence.wait() == error > > > > <- requestQueueFailed.emit() > > > > requestQueueFailed() > > > > it == descriptors_.end() > > > > abort(); > > > > > > > > -------------------------------------------------------------------------- > > > > > > > > I'll make sure to add the descriptor to descriptors_ before calling > > > > worker_.queuRequest(). > > > > > > > > > requestQueueFailed() is invoked by worker::stop() and it is executed > > > > > > > > Why do you say it is invoked by stop() ? The signal is emitted if > > > > waiting on a fence fails as shown... > > > > > > > > > > I don't probably understand when fence waiting error happens. > > > Could you explain when it happens? > > > > I wish that doesn't happen, but sometimes things break > > > > [1:44:39.146219593] [4470] ERROR HAL camera_worker.cpp:122 Failed waiting for fence: 80: Timer expired > > > > don't know why, we have an arbitrary timeout there, that should > > probably be better characterized. But it guess protecting against > > poll() errors on a fence is not crazy. > > > > > > > > It looks like, in the next patch, descriptors_ must be empty in some places. > > > > Not in some places. after the camera has been stopped it must be > > empty, in CameraDevice::stop() and CameraDevice::close(), for 2 reasons: > > > > 1) It validates the pipeline correctly returns all buffers at > > Camera::stop() time. If that doesn't happen we'll be missing error > > notifications to the camera service, I feel it's better to validate > > that early and ASSERT() during development than chaising errors across > > the IPC boundary > > > > 2) It validates that, if 1) holds, the HAL implementation > > correctly returns all requests to the camera service. As soon as I > > added the ASSERT() I stumbled upon a crash, due to the fence wait being > > failed as shown above. That lead to this patch. > > > > > How is it guaranteed that requestQueueFailed() is executed before > > > descriptors_ is empty? > > > > CameraDevice::processCaptureRequest CameraWorker > > A) worker_.queueRequest(req); > > B) descriptors_[req] = move(desc) > > > > -> queueRequest() > > processReques() > > > > wait(fence) == error > > requestQueueFailed.emit() > > <- > > C) descriptor.erase(req) > > > > > > As said, A) and B) should probably be inverted to make sure the > > descriptor is queued before calling worker_.queueRequest() (which can > > lead to emitting requestQueueFailed if waiting on a fence fails). > > > > In this version there's a tiny window where the control could be given > > to the CameraWorker and returned to the CameraDevice after the > > requestQueueFailed slot has been executed. In that case we would have > > A, C, B that would lead to the Fatal assertion being triggered. > > > > If we guarantee that (A, B) are executed atomically, or simply do B > > before A, then when the slot is called the descriptor is guaranteed to > > be in map. > > > > I might be missing some races though. Can you point me to the part > > that doesn't work for you ? > > > > I think, when CameraDevice::stop() and CameraDevice::flush() is > called, there may be a pending request waiting on fence. > So considering from the ASSERT, Worker::stop() causes > requestQueueFailed() for the waiting request. > But if I remember/understand correctly, Worker::stop() flushes pending > requests in a worker queue, that is, requests for which > processRequest() has not been called. > Therefore descriptors_ might not be empty thanks to the request > waiting on fence. > I probably miss something here. It is my understanding that since commit: 7de2daf653bc ("android: camera_worker: Process all queued requests when stopping") all the messages pending in the queue (the async processRequest() calls) are dispatched to the Worker which processes them until its message queue its empty. This, in my understanding means that all the requests posted to the CameraWorker with queueRequest() are processed and queued to Camera (if there are no isses waiting on the fence) or deleted with the new requestQueueFailed signal if waiting on the fence fails. If Request are queued to the Camera, Camera::stop() will complete them (and their descriptors deleted), if instead requestQueueFailed is emitted, the descriptor is now deleted by the associated slot in CameraDevice. Now, that's my understanding and it's based on the assumptions that: 1) All pending messages are delivered (which should be guaranteed by 7de2daf653bc 2) The async processRequest() function is invoked for each message dispatched after the thread has stopped 3) All the processRequest() async calls either succesfully wait on the fence and queue the Request to the Camera, or signal a failure in waiting and emit the requestQueueFailed signal 4) All of this happens before worker_.stop() returns Explicitly Cc-ed Laurent to confirm or disprove my understanding. Anyway, I went probably too far with this patch as it introduces something that makes me feel more confortable (ASSERT no requests are left in-flight when stopping the camera) but it is not a strict requirement nor it fixes any immediate bug, so if it proves to be controversial I'll be happy to drop and move forward with more pressing issues. Thanks j > > -Hiro > > Thanks > > j > > > > > > > > > -Hiro > > > > > before descriptors_.clear() always. > > > > > Correct? > > > > > > > > > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org> > > > > > > > > > > */ > > > > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > > > > > > - : camera_(camera) > > > > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > > > > > > + const camera3_capture_request_t *camera3Request) > > > > > > + : camera_(camera), camera3Request_(camera3Request) > > > > > > { > > > > > > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > > > > > > } > > > > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > > > > > > { > > > > > > /* Async process the request on the worker which runs its own thread. */ > > > > > > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > > > > > > - request); > > > > > > + this, request); > > > > > > } > > > > > > > > > > > > /* > > > > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > > > > > > return -errno; > > > > > > } > > > > > > > > > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > > > > > > { > > > > > > /* Wait on all fences before queuing the Request. */ > > > > > > for (int fence : request->fences()) { > > > > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > > > > if (ret < 0) { > > > > > > LOG(HAL, Error) << "Failed waiting for fence: " > > > > > > << fence << ": " << strerror(-ret); > > > > > > + context->requestQueueFailed.emit(request); > > > > > > > > .... Here > > > > > > > > Is your question about stop() related to dispatching messages before > > > > the thread has actually been stopped ? Isn't that been fixed some time > > > > ago by overriding CameraWorker::exec() ? > > > > > > > > Thanks > > > > j > > > >
Hi Jacopo, On Tue, Sep 7, 2021 at 10:51 PM Jacopo Mondi <jacopo@jmondi.org> wrote: > > Hi Hiro, > > On Tue, Sep 07, 2021 at 05:32:29PM +0900, Hirokazu Honda wrote: > > Hi Jacopo, > > > > On Tue, Sep 7, 2021 at 2:04 AM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > Hi Hiro, > > > > > > On Tue, Sep 07, 2021 at 01:36:43AM +0900, Hirokazu Honda wrote: > > > > Hi Jacopo, > > > > > > > > On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > > > > > Hi Hiro, > > > > > > > > > > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote: > > > > > > Hi Jacopo, thank you for the patch. > > > > > > > > > > > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote: > > > > > > > > > > > > > > The CameraDevice class stores one CaptureRequestDescriptor instance > > > > > > > for each capture request queued to the Camera. Such descriptors are > > > > > > > cleared when the Camera completes the Request in the > > > > > > > CameraDevice::requestComplete() slot. > > > > > > > > > > > > > > The CameraDevice class relies on an internal worker to off-load > > > > > > > waiting on synchronization fences, and does unconditionally store > > > > > > > the descriptor in the descriptors_ class member map to handle its > > > > > > > lifetime. > > > > > > > > > > > > > > If waiting on a fence fails, the created descriptor remains in the > > > > > > > descriptors_ map until the next call to stop() or close(), as > > > > > > > requestComplete() will never be called, and the camera > > > > > > > service is never notified about the queueing failure. > > > > > > > > > > > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > > > > > > > waiting on a fence has failed, by installing a new requestQueueFailed > > > > > > > Signal. > > > > > > > > > > > > > > The Signal is emitted by the CameraWorker internal worker if waiting on > > > > > > > a fence has failed. The CameraDevice is augmented with a slot connected > > > > > > > to the Signal that removes the descriptor from the map and notifies the > > > > > > > camera framework about the failure. > > > > > > > > > > > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> > > > > > > > --- > > > > > > > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > > > > > > > src/android/camera_device.h | 1 + > > > > > > > src/android/camera_worker.cpp | 10 ++++++---- > > > > > > > src/android/camera_worker.h | 9 +++++++-- > > > > > > > 4 files changed, 35 insertions(+), 7 deletions(-) > > > > > > > > > > > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > > > > > > > index a0ea138d9499..0ce9b699bfae 100644 > > > > > > > --- a/src/android/camera_device.cpp > > > > > > > +++ b/src/android/camera_device.cpp > > > > > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > > > > > > > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > > > > > > > * lifetime to the descriptor. > > > > > > > */ > > > > > > > - request_ = std::make_unique<CaptureRequest>(camera); > > > > > > > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > > > > > > > } > > > > > > > > > > > > > > /* > > > > > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > > > > > > > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > > > > > > > facing_(CAMERA_FACING_FRONT), orientation_(0) > > > > > > > { > > > > > > > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > > > > > > > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > > > > > > > > > > > > > maker_ = "libcamera"; > > > > > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > > > > > > > return 0; > > > > > > > } > > > > > > > > > > > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > > > > > > > +{ > > > > > > > + /* > > > > > > > + * Remove the descriptor from the map if the worker has failed > > > > > > > + * to process it and notify the camera service about the failure. > > > > > > > + */ > > > > > > > + MutexLocker descriptorsLock(descriptorsMutex_); > > > > > > > + auto it = descriptors_.find(request->cookie()); > > > > > > > + if (it == descriptors_.end()) { > > > > > > > + LOG(HAL, Fatal) > > > > > > > > > > > > If I understand correctly, this must not happen because > > > > > > > > > > Do you mean the it == end() with "this" ? > > > > > > > > > > It should not happen as requestQueueFail is called by the worker if it > > > > > fails to wait on a fence and the descriptor is already part of > > > > > descriptors. > > > > > > > > > > Actually, there's a small window for this to race now that I look at > > > > > it again > > > > > > > > > > -------------------------------------------------------------------------- > > > > > CameraDevice CameraWorker > > > > > -------------------------------------------------------------------------- > > > > > > > > > > worker_.queueRequest() -> > > > > > > > > > > worker.processRequest > > > > > fence.wait() == error > > > > > <- requestQueueFailed.emit() > > > > > requestQueueFailed() > > > > > it == descriptors_.end() > > > > > abort(); > > > > > > > > > > descriptors_[req] = desc; > > > > > -------------------------------------------------------------------------- > > > > > > > > > > While the intended order of execution is > > > > > > > > > > CameraDevice CameraWorker > > > > > -------------------------------------------------------------------------- > > > > > > > > > > worker_.queueRequest() > > > > > descriptors_[req] = desc; > > > > > -> > > > > > worker.processRequest > > > > > fence.wait() == error > > > > > <- requestQueueFailed.emit() > > > > > requestQueueFailed() > > > > > it == descriptors_.end() > > > > > abort(); > > > > > > > > > > -------------------------------------------------------------------------- > > > > > > > > > > I'll make sure to add the descriptor to descriptors_ before calling > > > > > worker_.queuRequest(). > > > > > > > > > > > requestQueueFailed() is invoked by worker::stop() and it is executed > > > > > > > > > > Why do you say it is invoked by stop() ? The signal is emitted if > > > > > waiting on a fence fails as shown... > > > > > > > > > > > > > I don't probably understand when fence waiting error happens. > > > > Could you explain when it happens? > > > > > > I wish that doesn't happen, but sometimes things break > > > > > > [1:44:39.146219593] [4470] ERROR HAL camera_worker.cpp:122 Failed waiting for fence: 80: Timer expired > > > > > > don't know why, we have an arbitrary timeout there, that should > > > probably be better characterized. But it guess protecting against > > > poll() errors on a fence is not crazy. > > > > > > > > > > > It looks like, in the next patch, descriptors_ must be empty in some places. > > > > > > Not in some places. after the camera has been stopped it must be > > > empty, in CameraDevice::stop() and CameraDevice::close(), for 2 reasons: > > > > > > 1) It validates the pipeline correctly returns all buffers at > > > Camera::stop() time. If that doesn't happen we'll be missing error > > > notifications to the camera service, I feel it's better to validate > > > that early and ASSERT() during development than chaising errors across > > > the IPC boundary > > > > > > 2) It validates that, if 1) holds, the HAL implementation > > > correctly returns all requests to the camera service. As soon as I > > > added the ASSERT() I stumbled upon a crash, due to the fence wait being > > > failed as shown above. That lead to this patch. > > > > > > > How is it guaranteed that requestQueueFailed() is executed before > > > > descriptors_ is empty? > > > > > > CameraDevice::processCaptureRequest CameraWorker > > > A) worker_.queueRequest(req); > > > B) descriptors_[req] = move(desc) > > > > > > -> queueRequest() > > > processReques() > > > > > > wait(fence) == error > > > requestQueueFailed.emit() > > > <- > > > C) descriptor.erase(req) > > > > > > > > > As said, A) and B) should probably be inverted to make sure the > > > descriptor is queued before calling worker_.queueRequest() (which can > > > lead to emitting requestQueueFailed if waiting on a fence fails). > > > > > > In this version there's a tiny window where the control could be given > > > to the CameraWorker and returned to the CameraDevice after the > > > requestQueueFailed slot has been executed. In that case we would have > > > A, C, B that would lead to the Fatal assertion being triggered. > > > > > > If we guarantee that (A, B) are executed atomically, or simply do B > > > before A, then when the slot is called the descriptor is guaranteed to > > > be in map. > > > > > > I might be missing some races though. Can you point me to the part > > > that doesn't work for you ? > > > > > > > I think, when CameraDevice::stop() and CameraDevice::flush() is > > called, there may be a pending request waiting on fence. > > So considering from the ASSERT, Worker::stop() causes > > requestQueueFailed() for the waiting request. > > But if I remember/understand correctly, Worker::stop() flushes pending > > requests in a worker queue, that is, requests for which > > processRequest() has not been called. > > Therefore descriptors_ might not be empty thanks to the request > > waiting on fence. > > I probably miss something here. > > It is my understanding that since commit: 7de2daf653bc ("android: > camera_worker: Process all queued requests when stopping") all the > messages pending in the queue (the async processRequest() calls) are > dispatched to the Worker which processes them until its message queue > its empty. This, in my understanding means that all the requests > posted to the CameraWorker with queueRequest() are processed and > queued to Camera (if there are no isses waiting on the fence) or > deleted with the new requestQueueFailed signal if waiting on the fence > fails. > > If Request are queued to the Camera, Camera::stop() will complete them > (and their descriptors deleted), if instead requestQueueFailed is > emitted, the descriptor is now deleted by the associated slot in > CameraDevice. > > Now, that's my understanding and it's based on the assumptions that: > > 1) All pending messages are delivered (which should be guaranteed by 7de2daf653bc > 2) The async processRequest() function is invoked for each message > dispatched after the thread has stopped > 3) All the processRequest() async calls either succesfully wait on > the fence and queue the Request to the Camera, or signal a failure in > waiting and emit the requestQueueFailed signal > 4) All of this happens before worker_.stop() returns > > Explicitly Cc-ed Laurent to confirm or disprove my understanding. > > Anyway, I went probably too far with this patch as it introduces > something that makes me feel more confortable (ASSERT no requests are > left in-flight when stopping the camera) but it is not a strict > requirement nor it fixes any immediate bug, so if it proves to be > controversial I'll be happy to drop and move forward with more > pressing issues. > I see. The LOG(FATAL) and ASSERT() concerns me as I forget how the things go with stop(). Let's wait Lauren'ts reply. -Hiro > Thanks > j > > > > > -Hiro > > > Thanks > > > j > > > > > > > > > > > > > -Hiro > > > > > > before descriptors_.clear() always. > > > > > > Correct? > > > > > > > > > > > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org> > > > > > > > > > > > > */ > > > > > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > > > > > > > - : camera_(camera) > > > > > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > > > > > > > + const camera3_capture_request_t *camera3Request) > > > > > > > + : camera_(camera), camera3Request_(camera3Request) > > > > > > > { > > > > > > > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > > > > > > > } > > > > > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > > > > > > > { > > > > > > > /* Async process the request on the worker which runs its own thread. */ > > > > > > > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > > > > > > > - request); > > > > > > > + this, request); > > > > > > > } > > > > > > > > > > > > > > /* > > > > > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > > > > > > > return -errno; > > > > > > > } > > > > > > > > > > > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > > > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > > > > > > > { > > > > > > > /* Wait on all fences before queuing the Request. */ > > > > > > > for (int fence : request->fences()) { > > > > > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > > > > > > > if (ret < 0) { > > > > > > > LOG(HAL, Error) << "Failed waiting for fence: " > > > > > > > << fence << ": " << strerror(-ret); > > > > > > > + context->requestQueueFailed.emit(request); > > > > > > > > > > .... Here > > > > > > > > > > Is your question about stop() related to dispatching messages before > > > > > the thread has actually been stopped ? Isn't that been fixed some time > > > > > ago by overriding CameraWorker::exec() ? > > > > > > > > > > Thanks > > > > > j > > > > >
Hi Jacopo, On 9/6/21 7:31 PM, Jacopo Mondi wrote: > The CameraDevice class stores one CaptureRequestDescriptor instance > for each capture request queued to the Camera. Such descriptors are > cleared when the Camera completes the Request in the > CameraDevice::requestComplete() slot. > > The CameraDevice class relies on an internal worker to off-load > waiting on synchronization fences, and does unconditionally store > the descriptor in the descriptors_ class member map to handle its > lifetime. > > If waiting on a fence fails, the created descriptor remains in the > descriptors_ map until the next call to stop() or close(), as > requestComplete() will never be called, and the camera > service is never notified about the queueing failure. > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > waiting on a fence has failed, by installing a new requestQueueFailed > Signal. > > The Signal is emitted by the CameraWorker internal worker if waiting on > a fence has failed. The CameraDevice is augmented with a slot connected > to the Signal that removes the descriptor from the map and notifies the > camera framework about the failure. > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> Reviewed-by: Umang Jain <umang.jain@ideasonboard.com> > --- > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > src/android/camera_device.h | 1 + > src/android/camera_worker.cpp | 10 ++++++---- > src/android/camera_worker.h | 9 +++++++-- > 4 files changed, 35 insertions(+), 7 deletions(-) > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > index a0ea138d9499..0ce9b699bfae 100644 > --- a/src/android/camera_device.cpp > +++ b/src/android/camera_device.cpp > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > * lifetime to the descriptor. > */ > - request_ = std::make_unique<CaptureRequest>(camera); > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > } > > /* > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > facing_(CAMERA_FACING_FRONT), orientation_(0) > { > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > maker_ = "libcamera"; > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > return 0; > } > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > +{ > + /* > + * Remove the descriptor from the map if the worker has failed > + * to process it and notify the camera service about the failure. > + */ > + MutexLocker descriptorsLock(descriptorsMutex_); > + auto it = descriptors_.find(request->cookie()); > + if (it == descriptors_.end()) { > + LOG(HAL, Fatal) > + << "Unknown request: " << request->cookie(); > + return; > + } > + > + descriptors_.extract(it); > + > + abortRequest(request->camera3Request()); > +} > + > void CameraDevice::requestComplete(Request *request) > { > decltype(descriptors_)::node_type node; > diff --git a/src/android/camera_device.h b/src/android/camera_device.h > index 54c4cb9ab499..65456ecca7e3 100644 > --- a/src/android/camera_device.h > +++ b/src/android/camera_device.h > @@ -83,6 +83,7 @@ private: > std::vector<std::unique_ptr<libcamera::FrameBuffer>> frameBuffers_; > CameraMetadata settings_; > std::unique_ptr<CaptureRequest> request_; > + const camera3_capture_request_t *camera3Request_; > }; > > enum class State { > diff --git a/src/android/camera_worker.cpp b/src/android/camera_worker.cpp > index 98dddd9eb13b..7f39fab01638 100644 > --- a/src/android/camera_worker.cpp > +++ b/src/android/camera_worker.cpp > @@ -27,8 +27,9 @@ LOG_DECLARE_CATEGORY(HAL) > * by the CameraWorker which queues it to the libcamera::Camera after handling > * fences. > */ > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > - : camera_(camera) > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > + const camera3_capture_request_t *camera3Request) > + : camera_(camera), camera3Request_(camera3Request) > { > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > } > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > { > /* Async process the request on the worker which runs its own thread. */ > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > - request); > + this, request); > } > > /* > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > return -errno; > } > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > { > /* Wait on all fences before queuing the Request. */ > for (int fence : request->fences()) { > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > if (ret < 0) { > LOG(HAL, Error) << "Failed waiting for fence: " > << fence << ": " << strerror(-ret); > + context->requestQueueFailed.emit(request); > return; > } > } > diff --git a/src/android/camera_worker.h b/src/android/camera_worker.h > index 67ae50bd9288..86f20f741e54 100644 > --- a/src/android/camera_worker.h > +++ b/src/android/camera_worker.h > @@ -10,6 +10,7 @@ > #include <memory> > > #include <libcamera/base/object.h> > +#include <libcamera/base/signal.h> > #include <libcamera/base/thread.h> > > #include <libcamera/camera.h> > @@ -22,7 +23,8 @@ class CameraDevice; > class CaptureRequest > { > public: > - CaptureRequest(libcamera::Camera *camera); > + CaptureRequest(libcamera::Camera *camera, > + const camera3_capture_request_t *camera3Request); > > const std::vector<int> &fences() const { return acquireFences_; } > libcamera::ControlList &controls() { return request_->controls(); } > @@ -31,6 +33,7 @@ public: > return request_->metadata(); > } > unsigned long cookie() const { return request_->cookie(); } > + const camera3_capture_request_t *camera3Request() const { return camera3Request_; } > > void addBuffer(libcamera::Stream *stream, > libcamera::FrameBuffer *buffer, int fence); > @@ -40,6 +43,7 @@ private: > libcamera::Camera *camera_; > std::vector<int> acquireFences_; > std::unique_ptr<libcamera::Request> request_; > + const camera3_capture_request_t *camera3Request_; > }; > > class CameraWorker : private libcamera::Thread > @@ -51,6 +55,7 @@ public: > void stop(); > > void queueRequest(CaptureRequest *request); > + libcamera::Signal<CaptureRequest *> requestQueueFailed; > > protected: > void run() override; > @@ -59,7 +64,7 @@ private: > class Worker : public libcamera::Object > { > public: > - void processRequest(CaptureRequest *request); > + void processRequest(CameraWorker *context, CaptureRequest *request); > > private: > int waitFence(int fence);
Hi Jacopo, Thank you for the patch. On Mon, Sep 06, 2021 at 04:01:51PM +0200, Jacopo Mondi wrote: > The CameraDevice class stores one CaptureRequestDescriptor instance > for each capture request queued to the Camera. Such descriptors are > cleared when the Camera completes the Request in the > CameraDevice::requestComplete() slot. > > The CameraDevice class relies on an internal worker to off-load > waiting on synchronization fences, and does unconditionally store > the descriptor in the descriptors_ class member map to handle its > lifetime. > > If waiting on a fence fails, the created descriptor remains in the > descriptors_ map until the next call to stop() or close(), as > requestComplete() will never be called, and the camera > service is never notified about the queueing failure. > > Fix that by allowing the CameraWorker to notify to the CameraDevice if > waiting on a fence has failed, by installing a new requestQueueFailed > Signal. > > The Signal is emitted by the CameraWorker internal worker if waiting on > a fence has failed. The CameraDevice is augmented with a slot connected > to the Signal that removes the descriptor from the map and notifies the > camera framework about the failure. > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> > --- > src/android/camera_device.cpp | 22 +++++++++++++++++++++- > src/android/camera_device.h | 1 + > src/android/camera_worker.cpp | 10 ++++++---- > src/android/camera_worker.h | 9 +++++++-- > 4 files changed, 35 insertions(+), 7 deletions(-) > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp > index a0ea138d9499..0ce9b699bfae 100644 > --- a/src/android/camera_device.cpp > +++ b/src/android/camera_device.cpp > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( > * Create the CaptureRequest, stored as a unique_ptr<> to tie its > * lifetime to the descriptor. > */ > - request_ = std::make_unique<CaptureRequest>(camera); > + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); > } > > /* > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) > : id_(id), state_(State::Stopped), camera_(std::move(camera)), > facing_(CAMERA_FACING_FRONT), orientation_(0) > { > + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); > camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); > > maker_ = "libcamera"; > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques > return 0; > } > > +void CameraDevice::requestQueueFailed(CaptureRequest *request) > +{ > + /* > + * Remove the descriptor from the map if the worker has failed > + * to process it and notify the camera service about the failure. > + */ > + MutexLocker descriptorsLock(descriptorsMutex_); > + auto it = descriptors_.find(request->cookie()); > + if (it == descriptors_.end()) { > + LOG(HAL, Fatal) > + << "Unknown request: " << request->cookie(); > + return; > + } > + > + descriptors_.extract(it); > + > + abortRequest(request->camera3Request()); I'd like to avoid adding the camera3_capture_request_t pointer to CaptureRequest. Could we modify CameraDevice::abortRequest() to take a Camera3RequestDescriptor (which we have in CameraDevice::processCaptureRequest() where abortRequest() is called today), and get it here from the descriptors_ map ? I also wonder if we could somehow set the Request status to RequestCancelled and call CameraDevice::requestComplete(). The rationale is that we'll eventually move the fence handling to the libcamera core, so the callback to the HAL will be requestComplete() then. There's an additional problem, if a wait operation fails, you'll end up signaling the request completion to the camera service out-of-order. I wonder if it wouldn't be better to rebase this on Umang's ongoing work that will include a pending request queue to address this issue. > +} > + > void CameraDevice::requestComplete(Request *request) > { > decltype(descriptors_)::node_type node; > diff --git a/src/android/camera_device.h b/src/android/camera_device.h > index 54c4cb9ab499..65456ecca7e3 100644 > --- a/src/android/camera_device.h > +++ b/src/android/camera_device.h > @@ -83,6 +83,7 @@ private: > std::vector<std::unique_ptr<libcamera::FrameBuffer>> frameBuffers_; > CameraMetadata settings_; > std::unique_ptr<CaptureRequest> request_; > + const camera3_capture_request_t *camera3Request_; > }; > > enum class State { > diff --git a/src/android/camera_worker.cpp b/src/android/camera_worker.cpp > index 98dddd9eb13b..7f39fab01638 100644 > --- a/src/android/camera_worker.cpp > +++ b/src/android/camera_worker.cpp > @@ -27,8 +27,9 @@ LOG_DECLARE_CATEGORY(HAL) > * by the CameraWorker which queues it to the libcamera::Camera after handling > * fences. > */ > -CaptureRequest::CaptureRequest(libcamera::Camera *camera) > - : camera_(camera) > +CaptureRequest::CaptureRequest(libcamera::Camera *camera, > + const camera3_capture_request_t *camera3Request) > + : camera_(camera), camera3Request_(camera3Request) > { > request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); > } > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) > { > /* Async process the request on the worker which runs its own thread. */ > worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, > - request); > + this, request); We could also pass the CameraWorker pointer to the CameraWorker::Worker constructor and store it (not sure it would be better actually). Another option is to store the Signal in CameraWorker::Worker, with a CameraWorker::requestQueueFailed() accessor function that returns a reference to the signal. > } > > /* > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) > return -errno; > } > > -void CameraWorker::Worker::processRequest(CaptureRequest *request) > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) > { > /* Wait on all fences before queuing the Request. */ > for (int fence : request->fences()) { > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) > if (ret < 0) { > LOG(HAL, Error) << "Failed waiting for fence: " > << fence << ": " << strerror(-ret); > + context->requestQueueFailed.emit(request); > return; > } > } > diff --git a/src/android/camera_worker.h b/src/android/camera_worker.h > index 67ae50bd9288..86f20f741e54 100644 > --- a/src/android/camera_worker.h > +++ b/src/android/camera_worker.h > @@ -10,6 +10,7 @@ > #include <memory> > > #include <libcamera/base/object.h> > +#include <libcamera/base/signal.h> > #include <libcamera/base/thread.h> > > #include <libcamera/camera.h> > @@ -22,7 +23,8 @@ class CameraDevice; > class CaptureRequest > { > public: > - CaptureRequest(libcamera::Camera *camera); > + CaptureRequest(libcamera::Camera *camera, > + const camera3_capture_request_t *camera3Request); > > const std::vector<int> &fences() const { return acquireFences_; } > libcamera::ControlList &controls() { return request_->controls(); } > @@ -31,6 +33,7 @@ public: > return request_->metadata(); > } > unsigned long cookie() const { return request_->cookie(); } > + const camera3_capture_request_t *camera3Request() const { return camera3Request_; } > > void addBuffer(libcamera::Stream *stream, > libcamera::FrameBuffer *buffer, int fence); > @@ -40,6 +43,7 @@ private: > libcamera::Camera *camera_; > std::vector<int> acquireFences_; > std::unique_ptr<libcamera::Request> request_; > + const camera3_capture_request_t *camera3Request_; > }; > > class CameraWorker : private libcamera::Thread > @@ -51,6 +55,7 @@ public: > void stop(); > > void queueRequest(CaptureRequest *request); > + libcamera::Signal<CaptureRequest *> requestQueueFailed; > > protected: > void run() override; > @@ -59,7 +64,7 @@ private: > class Worker : public libcamera::Object > { > public: > - void processRequest(CaptureRequest *request); > + void processRequest(CameraWorker *context, CaptureRequest *request); > > private: > int waitFence(int fence);
diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp index a0ea138d9499..0ce9b699bfae 100644 --- a/src/android/camera_device.cpp +++ b/src/android/camera_device.cpp @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor( * Create the CaptureRequest, stored as a unique_ptr<> to tie its * lifetime to the descriptor. */ - request_ = std::make_unique<CaptureRequest>(camera); + request_ = std::make_unique<CaptureRequest>(camera, camera3Request); } /* @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera) : id_(id), state_(State::Stopped), camera_(std::move(camera)), facing_(CAMERA_FACING_FRONT), orientation_(0) { + worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed); camera_->requestCompleted.connect(this, &CameraDevice::requestComplete); maker_ = "libcamera"; @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques return 0; } +void CameraDevice::requestQueueFailed(CaptureRequest *request) +{ + /* + * Remove the descriptor from the map if the worker has failed + * to process it and notify the camera service about the failure. + */ + MutexLocker descriptorsLock(descriptorsMutex_); + auto it = descriptors_.find(request->cookie()); + if (it == descriptors_.end()) { + LOG(HAL, Fatal) + << "Unknown request: " << request->cookie(); + return; + } + + descriptors_.extract(it); + + abortRequest(request->camera3Request()); +} + void CameraDevice::requestComplete(Request *request) { decltype(descriptors_)::node_type node; diff --git a/src/android/camera_device.h b/src/android/camera_device.h index 54c4cb9ab499..65456ecca7e3 100644 --- a/src/android/camera_device.h +++ b/src/android/camera_device.h @@ -83,6 +83,7 @@ private: std::vector<std::unique_ptr<libcamera::FrameBuffer>> frameBuffers_; CameraMetadata settings_; std::unique_ptr<CaptureRequest> request_; + const camera3_capture_request_t *camera3Request_; }; enum class State { diff --git a/src/android/camera_worker.cpp b/src/android/camera_worker.cpp index 98dddd9eb13b..7f39fab01638 100644 --- a/src/android/camera_worker.cpp +++ b/src/android/camera_worker.cpp @@ -27,8 +27,9 @@ LOG_DECLARE_CATEGORY(HAL) * by the CameraWorker which queues it to the libcamera::Camera after handling * fences. */ -CaptureRequest::CaptureRequest(libcamera::Camera *camera) - : camera_(camera) +CaptureRequest::CaptureRequest(libcamera::Camera *camera, + const camera3_capture_request_t *camera3Request) + : camera_(camera), camera3Request_(camera3Request) { request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this)); } @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request) { /* Async process the request on the worker which runs its own thread. */ worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued, - request); + this, request); } /* @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence) return -errno; } -void CameraWorker::Worker::processRequest(CaptureRequest *request) +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request) { /* Wait on all fences before queuing the Request. */ for (int fence : request->fences()) { @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request) if (ret < 0) { LOG(HAL, Error) << "Failed waiting for fence: " << fence << ": " << strerror(-ret); + context->requestQueueFailed.emit(request); return; } } diff --git a/src/android/camera_worker.h b/src/android/camera_worker.h index 67ae50bd9288..86f20f741e54 100644 --- a/src/android/camera_worker.h +++ b/src/android/camera_worker.h @@ -10,6 +10,7 @@ #include <memory> #include <libcamera/base/object.h> +#include <libcamera/base/signal.h> #include <libcamera/base/thread.h> #include <libcamera/camera.h> @@ -22,7 +23,8 @@ class CameraDevice; class CaptureRequest { public: - CaptureRequest(libcamera::Camera *camera); + CaptureRequest(libcamera::Camera *camera, + const camera3_capture_request_t *camera3Request); const std::vector<int> &fences() const { return acquireFences_; } libcamera::ControlList &controls() { return request_->controls(); } @@ -31,6 +33,7 @@ public: return request_->metadata(); } unsigned long cookie() const { return request_->cookie(); } + const camera3_capture_request_t *camera3Request() const { return camera3Request_; } void addBuffer(libcamera::Stream *stream, libcamera::FrameBuffer *buffer, int fence); @@ -40,6 +43,7 @@ private: libcamera::Camera *camera_; std::vector<int> acquireFences_; std::unique_ptr<libcamera::Request> request_; + const camera3_capture_request_t *camera3Request_; }; class CameraWorker : private libcamera::Thread @@ -51,6 +55,7 @@ public: void stop(); void queueRequest(CaptureRequest *request); + libcamera::Signal<CaptureRequest *> requestQueueFailed; protected: void run() override; @@ -59,7 +64,7 @@ private: class Worker : public libcamera::Object { public: - void processRequest(CaptureRequest *request); + void processRequest(CameraWorker *context, CaptureRequest *request); private: int waitFence(int fence);
The CameraDevice class stores one CaptureRequestDescriptor instance for each capture request queued to the Camera. Such descriptors are cleared when the Camera completes the Request in the CameraDevice::requestComplete() slot. The CameraDevice class relies on an internal worker to off-load waiting on synchronization fences, and does unconditionally store the descriptor in the descriptors_ class member map to handle its lifetime. If waiting on a fence fails, the created descriptor remains in the descriptors_ map until the next call to stop() or close(), as requestComplete() will never be called, and the camera service is never notified about the queueing failure. Fix that by allowing the CameraWorker to notify to the CameraDevice if waiting on a fence has failed, by installing a new requestQueueFailed Signal. The Signal is emitted by the CameraWorker internal worker if waiting on a fence has failed. The CameraDevice is augmented with a slot connected to the Signal that removes the descriptor from the map and notifies the camera framework about the failure. Signed-off-by: Jacopo Mondi <jacopo@jmondi.org> --- src/android/camera_device.cpp | 22 +++++++++++++++++++++- src/android/camera_device.h | 1 + src/android/camera_worker.cpp | 10 ++++++---- src/android/camera_worker.h | 9 +++++++-- 4 files changed, 35 insertions(+), 7 deletions(-)