[{"id":19469,"web_url":"https://patchwork.libcamera.org/comment/19469/","msgid":"<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>","date":"2021-09-06T14:55:15","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":63,"url":"https://patchwork.libcamera.org/api/people/63/","name":"Hirokazu Honda","email":"hiroh@chromium.org"},"content":"Hi Jacopo, thank you for the patch.\n\nOn Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote:\n>\n> The CameraDevice class stores one CaptureRequestDescriptor instance\n> for each capture request queued to the Camera. Such descriptors are\n> cleared when the Camera completes the Request in the\n> CameraDevice::requestComplete() slot.\n>\n> The CameraDevice class relies on an internal worker to off-load\n> waiting on synchronization fences, and does unconditionally store\n> the descriptor in the descriptors_ class member map to handle its\n> lifetime.\n>\n> If waiting on a fence fails, the created descriptor remains in the\n> descriptors_ map until the next call to stop() or close(), as\n> requestComplete() will never be called, and the camera\n> service is never notified about the queueing failure.\n>\n> Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> waiting on a fence has failed, by installing a new requestQueueFailed\n> Signal.\n>\n> The Signal is emitted by the CameraWorker internal worker if waiting on\n> a fence has failed. The CameraDevice is augmented with a slot connected\n> to the Signal that removes the descriptor from the map and notifies the\n> camera framework about the failure.\n>\n> Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n> ---\n>  src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n>  src/android/camera_device.h   |  1 +\n>  src/android/camera_worker.cpp | 10 ++++++----\n>  src/android/camera_worker.h   |  9 +++++++--\n>  4 files changed, 35 insertions(+), 7 deletions(-)\n>\n> diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> index a0ea138d9499..0ce9b699bfae 100644\n> --- a/src/android/camera_device.cpp\n> +++ b/src/android/camera_device.cpp\n> @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n>          * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n>          * lifetime to the descriptor.\n>          */\n> -       request_ = std::make_unique<CaptureRequest>(camera);\n> +       request_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n>  }\n>\n>  /*\n> @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n>         : id_(id), state_(State::Stopped), camera_(std::move(camera)),\n>           facing_(CAMERA_FACING_FRONT), orientation_(0)\n>  {\n> +       worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n>         camera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n>\n>         maker_ = \"libcamera\";\n> @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n>         return 0;\n>  }\n>\n> +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> +{\n> +       /*\n> +        * Remove the descriptor from the map if the worker has failed\n> +        * to process it and notify the camera service about the failure.\n> +        */\n> +       MutexLocker descriptorsLock(descriptorsMutex_);\n> +       auto it = descriptors_.find(request->cookie());\n> +       if (it == descriptors_.end()) {\n> +               LOG(HAL, Fatal)\n\nIf I understand correctly, this must not happen because\nrequestQueueFailed() is invoked by worker::stop() and it is executed\nbefore descriptors_.clear() always.\nCorrect?\n\nReviewed-by: Hirokazu Honda <hiroh@chromium.org>\n\n-Hiro\n> +                       << \"Unknown request: \" << request->cookie();\n> +               return;\n> +       }\n> +\n> +       descriptors_.extract(it);\n> +\n> +       abortRequest(request->camera3Request());\n> +}\n> +\n>  void CameraDevice::requestComplete(Request *request)\n>  {\n>         decltype(descriptors_)::node_type node;\n> diff --git a/src/android/camera_device.h b/src/android/camera_device.h\n> index 54c4cb9ab499..65456ecca7e3 100644\n> --- a/src/android/camera_device.h\n> +++ b/src/android/camera_device.h\n> @@ -83,6 +83,7 @@ private:\n>                 std::vector<std::unique_ptr<libcamera::FrameBuffer>> frameBuffers_;\n>                 CameraMetadata settings_;\n>                 std::unique_ptr<CaptureRequest> request_;\n> +               const camera3_capture_request_t *camera3Request_;\n>         };\n>\n>         enum class State {\n> diff --git a/src/android/camera_worker.cpp b/src/android/camera_worker.cpp\n> index 98dddd9eb13b..7f39fab01638 100644\n> --- a/src/android/camera_worker.cpp\n> +++ b/src/android/camera_worker.cpp\n> @@ -27,8 +27,9 @@ LOG_DECLARE_CATEGORY(HAL)\n>   * by the CameraWorker which queues it to the libcamera::Camera after handling\n>   * fences.\n>   */\n> -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> -       : camera_(camera)\n> +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> +                              const camera3_capture_request_t *camera3Request)\n> +       : camera_(camera), camera3Request_(camera3Request)\n>  {\n>         request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n>  }\n> @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n>  {\n>         /* Async process the request on the worker which runs its own thread. */\n>         worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> -                            request);\n> +                            this, request);\n>  }\n>\n>  /*\n> @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n>         return -errno;\n>  }\n>\n> -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n>  {\n>         /* Wait on all fences before queuing the Request. */\n>         for (int fence : request->fences()) {\n> @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n>                 if (ret < 0) {\n>                         LOG(HAL, Error) << \"Failed waiting for fence: \"\n>                                         << fence << \": \" << strerror(-ret);\n> +                       context->requestQueueFailed.emit(request);\n>                         return;\n>                 }\n>         }\n> diff --git a/src/android/camera_worker.h b/src/android/camera_worker.h\n> index 67ae50bd9288..86f20f741e54 100644\n> --- a/src/android/camera_worker.h\n> +++ b/src/android/camera_worker.h\n> @@ -10,6 +10,7 @@\n>  #include <memory>\n>\n>  #include <libcamera/base/object.h>\n> +#include <libcamera/base/signal.h>\n>  #include <libcamera/base/thread.h>\n>\n>  #include <libcamera/camera.h>\n> @@ -22,7 +23,8 @@ class CameraDevice;\n>  class CaptureRequest\n>  {\n>  public:\n> -       CaptureRequest(libcamera::Camera *camera);\n> +       CaptureRequest(libcamera::Camera *camera,\n> +                      const camera3_capture_request_t *camera3Request);\n>\n>         const std::vector<int> &fences() const { return acquireFences_; }\n>         libcamera::ControlList &controls() { return request_->controls(); }\n> @@ -31,6 +33,7 @@ public:\n>                 return request_->metadata();\n>         }\n>         unsigned long cookie() const { return request_->cookie(); }\n> +       const camera3_capture_request_t *camera3Request() const { return camera3Request_; }\n>\n>         void addBuffer(libcamera::Stream *stream,\n>                        libcamera::FrameBuffer *buffer, int fence);\n> @@ -40,6 +43,7 @@ private:\n>         libcamera::Camera *camera_;\n>         std::vector<int> acquireFences_;\n>         std::unique_ptr<libcamera::Request> request_;\n> +       const camera3_capture_request_t *camera3Request_;\n>  };\n>\n>  class CameraWorker : private libcamera::Thread\n> @@ -51,6 +55,7 @@ public:\n>         void stop();\n>\n>         void queueRequest(CaptureRequest *request);\n> +       libcamera::Signal<CaptureRequest *> requestQueueFailed;\n>\n>  protected:\n>         void run() override;\n> @@ -59,7 +64,7 @@ private:\n>         class Worker : public libcamera::Object\n>         {\n>         public:\n> -               void processRequest(CaptureRequest *request);\n> +               void processRequest(CameraWorker *context, CaptureRequest *request);\n>\n>         private:\n>                 int waitFence(int fence);\n> --\n> 2.32.0\n>","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id 9894DBD87D\n\tfor <parsemail@patchwork.libcamera.org>;\n\tMon,  6 Sep 2021 14:55:29 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id D19926916A;\n\tMon,  6 Sep 2021 16:55:28 +0200 (CEST)","from mail-ed1-x534.google.com (mail-ed1-x534.google.com\n\t[IPv6:2a00:1450:4864:20::534])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id 4BBFB60137\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tMon,  6 Sep 2021 16:55:27 +0200 (CEST)","by mail-ed1-x534.google.com with SMTP id z19so9840383edi.9\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tMon, 06 Sep 2021 07:55:27 -0700 (PDT)"],"Authentication-Results":"lancelot.ideasonboard.com; dkim=pass (1024-bit key;\n\tunprotected) header.d=chromium.org header.i=@chromium.org\n\theader.b=\"kHQwBrDO\"; dkim-atps=neutral","DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org;\n\ts=google; \n\th=mime-version:references:in-reply-to:from:date:message-id:subject:to\n\t:cc; bh=g+b5GR+0r6lDB0USF6x2nYMu7x5ZH3W+yAlk5DEePx0=;\n\tb=kHQwBrDOxtXhs9lE0Uv5Drdtb5pzHaCJP96/Z4NjKuY9ZTVHWSXW8FR10w2XqkhWoB\n\tSlHS+hRbnqgDTWB1JVNQN4EjlXV6wAf9whsmJb0mhrhAhv168nWQNtEqlheLeBqg/OFI\n\tVu/XslTE1nYcKwtunyVY7NDOSNzNPqesX6fNY=","X-Google-DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/relaxed;\n\td=1e100.net; s=20161025;\n\th=x-gm-message-state:mime-version:references:in-reply-to:from:date\n\t:message-id:subject:to:cc;\n\tbh=g+b5GR+0r6lDB0USF6x2nYMu7x5ZH3W+yAlk5DEePx0=;\n\tb=rgD/IV+T/uTs4w7pbAfl59miFs4aBkjPwYcNUOwDgI0CFaTtz6GpryBseCEPNjVDhl\n\tLyQ9uSprXV7KrqZ+9pU8J8er+TH8eV1GWt6kyOC8s5Pv5qZheT75oQZUujYupoq4CMHt\n\tL7yZqW1Z6M6FJdSms3aLssABQhnMkyOBdsUarjFqMqomVp90w+n5+fOIs9EznLKeLHfp\n\tKbHYG26IWLvZ/+6RblyqVI6M3cJDp+yZntIn/MisyRhJPs0ZUMFjAd+biqaSG8l9i00H\n\tDFqZ3IZpmb7eXKMacOMFrjzlRXRsC3AqoCOXnuvLuL3JPRb0V0gagB3eyo+RKXO32F72\n\tWozw==","X-Gm-Message-State":"AOAM532QXVqDHnRzcbGZOIZTtAG3LNc1AU+r+FZlo8oJJa6v73F0d/5w\n\t6LtiFx0ut5MjlMt08oNeSW5Sfs+OWKAMFqzcdw/FW/onLQU=","X-Google-Smtp-Source":"ABdhPJyZCY/GS5wGa8HlTHluUQMy7YpK9N3MXG+SMf2qzDRSI+vD3FdfSKKSMAGjY9MYE/YYfJ2hwlPlzmdAH4WeQ38=","X-Received":"by 2002:aa7:d617:: with SMTP id\n\tc23mr14098997edr.206.1630940126801; \n\tMon, 06 Sep 2021 07:55:26 -0700 (PDT)","MIME-Version":"1.0","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>","In-Reply-To":"<20210906140152.636883-5-jacopo@jmondi.org>","From":"Hirokazu Honda <hiroh@chromium.org>","Date":"Mon, 6 Sep 2021 23:55:15 +0900","Message-ID":"<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>","To":"Jacopo Mondi <jacopo@jmondi.org>","Content-Type":"text/plain; charset=\"UTF-8\"","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Cc":"libcamera devel <libcamera-devel@lists.libcamera.org>","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}},{"id":19472,"web_url":"https://patchwork.libcamera.org/comment/19472/","msgid":"<20210906162040.v6thmls25tiocwkp@uno.localdomain>","date":"2021-09-06T16:20:40","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":3,"url":"https://patchwork.libcamera.org/api/people/3/","name":"Jacopo Mondi","email":"jacopo@jmondi.org"},"content":"Hi Hiro,\n\nOn Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote:\n> Hi Jacopo, thank you for the patch.\n>\n> On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> >\n> > The CameraDevice class stores one CaptureRequestDescriptor instance\n> > for each capture request queued to the Camera. Such descriptors are\n> > cleared when the Camera completes the Request in the\n> > CameraDevice::requestComplete() slot.\n> >\n> > The CameraDevice class relies on an internal worker to off-load\n> > waiting on synchronization fences, and does unconditionally store\n> > the descriptor in the descriptors_ class member map to handle its\n> > lifetime.\n> >\n> > If waiting on a fence fails, the created descriptor remains in the\n> > descriptors_ map until the next call to stop() or close(), as\n> > requestComplete() will never be called, and the camera\n> > service is never notified about the queueing failure.\n> >\n> > Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> > waiting on a fence has failed, by installing a new requestQueueFailed\n> > Signal.\n> >\n> > The Signal is emitted by the CameraWorker internal worker if waiting on\n> > a fence has failed. The CameraDevice is augmented with a slot connected\n> > to the Signal that removes the descriptor from the map and notifies the\n> > camera framework about the failure.\n> >\n> > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n> > ---\n> >  src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n> >  src/android/camera_device.h   |  1 +\n> >  src/android/camera_worker.cpp | 10 ++++++----\n> >  src/android/camera_worker.h   |  9 +++++++--\n> >  4 files changed, 35 insertions(+), 7 deletions(-)\n> >\n> > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> > index a0ea138d9499..0ce9b699bfae 100644\n> > --- a/src/android/camera_device.cpp\n> > +++ b/src/android/camera_device.cpp\n> > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n> >          * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n> >          * lifetime to the descriptor.\n> >          */\n> > -       request_ = std::make_unique<CaptureRequest>(camera);\n> > +       request_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n> >  }\n> >\n> >  /*\n> > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n> >         : id_(id), state_(State::Stopped), camera_(std::move(camera)),\n> >           facing_(CAMERA_FACING_FRONT), orientation_(0)\n> >  {\n> > +       worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n> >         camera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n> >\n> >         maker_ = \"libcamera\";\n> > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n> >         return 0;\n> >  }\n> >\n> > +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> > +{\n> > +       /*\n> > +        * Remove the descriptor from the map if the worker has failed\n> > +        * to process it and notify the camera service about the failure.\n> > +        */\n> > +       MutexLocker descriptorsLock(descriptorsMutex_);\n> > +       auto it = descriptors_.find(request->cookie());\n> > +       if (it == descriptors_.end()) {\n> > +               LOG(HAL, Fatal)\n>\n> If I understand correctly, this must not happen because\n\nDo you mean the it == end() with \"this\" ?\n\nIt should not happen as requestQueueFail is called by the worker if it\nfails to wait on a fence and the descriptor is already part of\ndescriptors.\n\nActually, there's a small window for this to race now that I look at\nit again\n\n--------------------------------------------------------------------------\nCameraDevice                            CameraWorker\n--------------------------------------------------------------------------\n\nworker_.queueRequest()          ->\n\n                                        worker.processRequest\n                                        fence.wait() == error\n                                <-      requestQueueFailed.emit()\nrequestQueueFailed()\n        it == descriptors_.end()\n        abort();\n\ndescriptors_[req] = desc;\n--------------------------------------------------------------------------\n\nWhile the intended order of execution is\n\nCameraDevice                            CameraWorker\n--------------------------------------------------------------------------\n\nworker_.queueRequest()\ndescriptors_[req] = desc;\n                                ->\n                                        worker.processRequest\n                                        fence.wait() == error\n                                <-      requestQueueFailed.emit()\nrequestQueueFailed()\n        it == descriptors_.end()\n        abort();\n\n--------------------------------------------------------------------------\n\nI'll make sure to add the descriptor to descriptors_ before calling\nworker_.queuRequest().\n\n> requestQueueFailed() is invoked by worker::stop() and it is executed\n\nWhy do you say it is invoked by stop() ? The signal is emitted if\nwaiting on a fence fails as shown...\n\n> before descriptors_.clear() always.\n> Correct?\n>\n> Reviewed-by: Hirokazu Honda <hiroh@chromium.org>\n\n> >   */\n> > -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> > -       : camera_(camera)\n> > +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> > +                              const camera3_capture_request_t *camera3Request)\n> > +       : camera_(camera), camera3Request_(camera3Request)\n> >  {\n> >         request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n> >  }\n> > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n> >  {\n> >         /* Async process the request on the worker which runs its own thread. */\n> >         worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> > -                            request);\n> > +                            this, request);\n> >  }\n> >\n> >  /*\n> > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n> >         return -errno;\n> >  }\n> >\n> > -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n> >  {\n> >         /* Wait on all fences before queuing the Request. */\n> >         for (int fence : request->fences()) {\n> > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> >                 if (ret < 0) {\n> >                         LOG(HAL, Error) << \"Failed waiting for fence: \"\n> >                                         << fence << \": \" << strerror(-ret);\n> > +                       context->requestQueueFailed.emit(request);\n\n.... Here\n\nIs your question about stop() related to dispatching messages before\nthe thread has actually been stopped ? Isn't that been fixed some time\nago by overriding CameraWorker::exec() ?\n\nThanks\n   j","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id 05582BD87D\n\tfor <parsemail@patchwork.libcamera.org>;\n\tMon,  6 Sep 2021 16:19:56 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id 8B9276916A;\n\tMon,  6 Sep 2021 18:19:55 +0200 (CEST)","from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net\n\t[217.70.183.194])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id E33E160137\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tMon,  6 Sep 2021 18:19:53 +0200 (CEST)","(Authenticated sender: jacopo@jmondi.org)\n\tby relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 6F9BF40003;\n\tMon,  6 Sep 2021 16:19:53 +0000 (UTC)"],"Date":"Mon, 6 Sep 2021 18:20:40 +0200","From":"Jacopo Mondi <jacopo@jmondi.org>","To":"Hirokazu Honda <hiroh@chromium.org>","Message-ID":"<20210906162040.v6thmls25tiocwkp@uno.localdomain>","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>\n\t<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>","MIME-Version":"1.0","Content-Type":"text/plain; charset=utf-8","Content-Disposition":"inline","In-Reply-To":"<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Cc":"libcamera devel <libcamera-devel@lists.libcamera.org>","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}},{"id":19474,"web_url":"https://patchwork.libcamera.org/comment/19474/","msgid":"<CAO5uPHNveN4R69zmDk1Ra_-BSuGT3Duud_2EyLOvRhB+58ACaQ@mail.gmail.com>","date":"2021-09-06T16:36:43","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":63,"url":"https://patchwork.libcamera.org/api/people/63/","name":"Hirokazu Honda","email":"hiroh@chromium.org"},"content":"Hi Jacopo,\n\nOn Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote:\n>\n> Hi Hiro,\n>\n> On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote:\n> > Hi Jacopo, thank you for the patch.\n> >\n> > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > >\n> > > The CameraDevice class stores one CaptureRequestDescriptor instance\n> > > for each capture request queued to the Camera. Such descriptors are\n> > > cleared when the Camera completes the Request in the\n> > > CameraDevice::requestComplete() slot.\n> > >\n> > > The CameraDevice class relies on an internal worker to off-load\n> > > waiting on synchronization fences, and does unconditionally store\n> > > the descriptor in the descriptors_ class member map to handle its\n> > > lifetime.\n> > >\n> > > If waiting on a fence fails, the created descriptor remains in the\n> > > descriptors_ map until the next call to stop() or close(), as\n> > > requestComplete() will never be called, and the camera\n> > > service is never notified about the queueing failure.\n> > >\n> > > Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> > > waiting on a fence has failed, by installing a new requestQueueFailed\n> > > Signal.\n> > >\n> > > The Signal is emitted by the CameraWorker internal worker if waiting on\n> > > a fence has failed. The CameraDevice is augmented with a slot connected\n> > > to the Signal that removes the descriptor from the map and notifies the\n> > > camera framework about the failure.\n> > >\n> > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n> > > ---\n> > >  src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n> > >  src/android/camera_device.h   |  1 +\n> > >  src/android/camera_worker.cpp | 10 ++++++----\n> > >  src/android/camera_worker.h   |  9 +++++++--\n> > >  4 files changed, 35 insertions(+), 7 deletions(-)\n> > >\n> > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> > > index a0ea138d9499..0ce9b699bfae 100644\n> > > --- a/src/android/camera_device.cpp\n> > > +++ b/src/android/camera_device.cpp\n> > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n> > >          * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n> > >          * lifetime to the descriptor.\n> > >          */\n> > > -       request_ = std::make_unique<CaptureRequest>(camera);\n> > > +       request_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n> > >  }\n> > >\n> > >  /*\n> > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n> > >         : id_(id), state_(State::Stopped), camera_(std::move(camera)),\n> > >           facing_(CAMERA_FACING_FRONT), orientation_(0)\n> > >  {\n> > > +       worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n> > >         camera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n> > >\n> > >         maker_ = \"libcamera\";\n> > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n> > >         return 0;\n> > >  }\n> > >\n> > > +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> > > +{\n> > > +       /*\n> > > +        * Remove the descriptor from the map if the worker has failed\n> > > +        * to process it and notify the camera service about the failure.\n> > > +        */\n> > > +       MutexLocker descriptorsLock(descriptorsMutex_);\n> > > +       auto it = descriptors_.find(request->cookie());\n> > > +       if (it == descriptors_.end()) {\n> > > +               LOG(HAL, Fatal)\n> >\n> > If I understand correctly, this must not happen because\n>\n> Do you mean the it == end() with \"this\" ?\n>\n> It should not happen as requestQueueFail is called by the worker if it\n> fails to wait on a fence and the descriptor is already part of\n> descriptors.\n>\n> Actually, there's a small window for this to race now that I look at\n> it again\n>\n> --------------------------------------------------------------------------\n> CameraDevice                            CameraWorker\n> --------------------------------------------------------------------------\n>\n> worker_.queueRequest()          ->\n>\n>                                         worker.processRequest\n>                                         fence.wait() == error\n>                                 <-      requestQueueFailed.emit()\n> requestQueueFailed()\n>         it == descriptors_.end()\n>         abort();\n>\n> descriptors_[req] = desc;\n> --------------------------------------------------------------------------\n>\n> While the intended order of execution is\n>\n> CameraDevice                            CameraWorker\n> --------------------------------------------------------------------------\n>\n> worker_.queueRequest()\n> descriptors_[req] = desc;\n>                                 ->\n>                                         worker.processRequest\n>                                         fence.wait() == error\n>                                 <-      requestQueueFailed.emit()\n> requestQueueFailed()\n>         it == descriptors_.end()\n>         abort();\n>\n> --------------------------------------------------------------------------\n>\n> I'll make sure to add the descriptor to descriptors_ before calling\n> worker_.queuRequest().\n>\n> > requestQueueFailed() is invoked by worker::stop() and it is executed\n>\n> Why do you say it is invoked by stop() ? The signal is emitted if\n> waiting on a fence fails as shown...\n>\n\nI don't probably understand when fence waiting error happens.\nCould you explain when it happens?\n\nIt looks like, in the next patch, descriptors_ must be empty in some places.\nHow is it guaranteed that requestQueueFailed() is executed before\ndescriptors_ is empty?\n\n-Hiro\n> > before descriptors_.clear() always.\n> > Correct?\n> >\n> > Reviewed-by: Hirokazu Honda <hiroh@chromium.org>\n>\n> > >   */\n> > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> > > -       : camera_(camera)\n> > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> > > +                              const camera3_capture_request_t *camera3Request)\n> > > +       : camera_(camera), camera3Request_(camera3Request)\n> > >  {\n> > >         request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n> > >  }\n> > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n> > >  {\n> > >         /* Async process the request on the worker which runs its own thread. */\n> > >         worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> > > -                            request);\n> > > +                            this, request);\n> > >  }\n> > >\n> > >  /*\n> > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n> > >         return -errno;\n> > >  }\n> > >\n> > > -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n> > >  {\n> > >         /* Wait on all fences before queuing the Request. */\n> > >         for (int fence : request->fences()) {\n> > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > >                 if (ret < 0) {\n> > >                         LOG(HAL, Error) << \"Failed waiting for fence: \"\n> > >                                         << fence << \": \" << strerror(-ret);\n> > > +                       context->requestQueueFailed.emit(request);\n>\n> .... Here\n>\n> Is your question about stop() related to dispatching messages before\n> the thread has actually been stopped ? Isn't that been fixed some time\n> ago by overriding CameraWorker::exec() ?\n>\n> Thanks\n>    j\n>","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id 1CDACBDC71\n\tfor <parsemail@patchwork.libcamera.org>;\n\tMon,  6 Sep 2021 16:36:57 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id 98D9D69167;\n\tMon,  6 Sep 2021 18:36:56 +0200 (CEST)","from mail-ej1-x634.google.com (mail-ej1-x634.google.com\n\t[IPv6:2a00:1450:4864:20::634])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id BBC4560137\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tMon,  6 Sep 2021 18:36:54 +0200 (CEST)","by mail-ej1-x634.google.com with SMTP id me10so14467734ejb.11\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tMon, 06 Sep 2021 09:36:54 -0700 (PDT)"],"Authentication-Results":"lancelot.ideasonboard.com; dkim=pass (1024-bit key;\n\tunprotected) header.d=chromium.org header.i=@chromium.org\n\theader.b=\"EtDkfgmL\"; dkim-atps=neutral","DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org;\n\ts=google; \n\th=mime-version:references:in-reply-to:from:date:message-id:subject:to\n\t:cc; bh=5TgdMF5XP+IRxAA9fz2kPEy2H3vRLbE5useH+nJLDiU=;\n\tb=EtDkfgmL0vylR6QAz/W8YBsIN53RoBFlXvhlbvzlBvp4Dv/pcazPwK0JZ/hCU0eG3n\n\txUxrwdEowlbQDXT01kMpMszMKYeJSLf80lCZEJNIEpdV9rc2+E4L2XoPxYt1nvg3cmx4\n\tcW3S68/c3hQd0sdOaR19ldKnKxyJwtUBmhnyE=","X-Google-DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/relaxed;\n\td=1e100.net; s=20161025;\n\th=x-gm-message-state:mime-version:references:in-reply-to:from:date\n\t:message-id:subject:to:cc;\n\tbh=5TgdMF5XP+IRxAA9fz2kPEy2H3vRLbE5useH+nJLDiU=;\n\tb=Vptv4kDzqWgmcWyF89CqcX+NETRCy80rNnn5Ltxv/oVqGGXVXU00jYMoIGq6ZkuNab\n\t4JSb+r+QAP2Y0ItRJv+g1TdBJx1Pl9OG0S0Jc2W4lXqY5JXSYBI/Az3oap8O0P7unFFo\n\t419biFRyiXUsVp1x00zT+9R5DgXVYsCUdYV7kMplhnu8+wKil16B9wkTEB799Xz+u/oM\n\tgSnt0OldYhUVa1Uqkb6oHaz1Y8Mk4/u0TWjEEbLYD7LTtU8VwQqMAu9n35Uyagx1MaW5\n\tt9g4rhIFZojdAb+R24+57UsxmPimInl1lwPptQEAHAgBxO7eyGsV76g3YroX8JForUB7\n\tVqhA==","X-Gm-Message-State":"AOAM532E8zkRIXATStY3QNkWTrW7temVJl/qeD8hq2Fd/CJSibMJ98R1\n\tqWYH0ybHosJPClIPbOo1ZicYgiGArwCv+NWxM7dngnfXUKo=","X-Google-Smtp-Source":"ABdhPJzzfSJE+mpBelX3DTReLiSxCquyOYTiteE2ENc96DZEI6usD0DylR8a5C27UqAmg1H1A7LujZtxMF/o+ZJoquw=","X-Received":"by 2002:a17:906:32c9:: with SMTP id\n\tk9mr14674088ejk.218.1630946214294; \n\tMon, 06 Sep 2021 09:36:54 -0700 (PDT)","MIME-Version":"1.0","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>\n\t<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>\n\t<20210906162040.v6thmls25tiocwkp@uno.localdomain>","In-Reply-To":"<20210906162040.v6thmls25tiocwkp@uno.localdomain>","From":"Hirokazu Honda <hiroh@chromium.org>","Date":"Tue, 7 Sep 2021 01:36:43 +0900","Message-ID":"<CAO5uPHNveN4R69zmDk1Ra_-BSuGT3Duud_2EyLOvRhB+58ACaQ@mail.gmail.com>","To":"Jacopo Mondi <jacopo@jmondi.org>","Content-Type":"text/plain; charset=\"UTF-8\"","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Cc":"libcamera devel <libcamera-devel@lists.libcamera.org>","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}},{"id":19476,"web_url":"https://patchwork.libcamera.org/comment/19476/","msgid":"<20210906170542.ooaptf3vz7gkwulr@uno.localdomain>","date":"2021-09-06T17:05:42","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":3,"url":"https://patchwork.libcamera.org/api/people/3/","name":"Jacopo Mondi","email":"jacopo@jmondi.org"},"content":"Hi Hiro,\n\nOn Tue, Sep 07, 2021 at 01:36:43AM +0900, Hirokazu Honda wrote:\n> Hi Jacopo,\n>\n> On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> >\n> > Hi Hiro,\n> >\n> > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote:\n> > > Hi Jacopo, thank you for the patch.\n> > >\n> > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > > >\n> > > > The CameraDevice class stores one CaptureRequestDescriptor instance\n> > > > for each capture request queued to the Camera. Such descriptors are\n> > > > cleared when the Camera completes the Request in the\n> > > > CameraDevice::requestComplete() slot.\n> > > >\n> > > > The CameraDevice class relies on an internal worker to off-load\n> > > > waiting on synchronization fences, and does unconditionally store\n> > > > the descriptor in the descriptors_ class member map to handle its\n> > > > lifetime.\n> > > >\n> > > > If waiting on a fence fails, the created descriptor remains in the\n> > > > descriptors_ map until the next call to stop() or close(), as\n> > > > requestComplete() will never be called, and the camera\n> > > > service is never notified about the queueing failure.\n> > > >\n> > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> > > > waiting on a fence has failed, by installing a new requestQueueFailed\n> > > > Signal.\n> > > >\n> > > > The Signal is emitted by the CameraWorker internal worker if waiting on\n> > > > a fence has failed. The CameraDevice is augmented with a slot connected\n> > > > to the Signal that removes the descriptor from the map and notifies the\n> > > > camera framework about the failure.\n> > > >\n> > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n> > > > ---\n> > > >  src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n> > > >  src/android/camera_device.h   |  1 +\n> > > >  src/android/camera_worker.cpp | 10 ++++++----\n> > > >  src/android/camera_worker.h   |  9 +++++++--\n> > > >  4 files changed, 35 insertions(+), 7 deletions(-)\n> > > >\n> > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> > > > index a0ea138d9499..0ce9b699bfae 100644\n> > > > --- a/src/android/camera_device.cpp\n> > > > +++ b/src/android/camera_device.cpp\n> > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n> > > >          * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n> > > >          * lifetime to the descriptor.\n> > > >          */\n> > > > -       request_ = std::make_unique<CaptureRequest>(camera);\n> > > > +       request_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n> > > >  }\n> > > >\n> > > >  /*\n> > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n> > > >         : id_(id), state_(State::Stopped), camera_(std::move(camera)),\n> > > >           facing_(CAMERA_FACING_FRONT), orientation_(0)\n> > > >  {\n> > > > +       worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n> > > >         camera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n> > > >\n> > > >         maker_ = \"libcamera\";\n> > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n> > > >         return 0;\n> > > >  }\n> > > >\n> > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> > > > +{\n> > > > +       /*\n> > > > +        * Remove the descriptor from the map if the worker has failed\n> > > > +        * to process it and notify the camera service about the failure.\n> > > > +        */\n> > > > +       MutexLocker descriptorsLock(descriptorsMutex_);\n> > > > +       auto it = descriptors_.find(request->cookie());\n> > > > +       if (it == descriptors_.end()) {\n> > > > +               LOG(HAL, Fatal)\n> > >\n> > > If I understand correctly, this must not happen because\n> >\n> > Do you mean the it == end() with \"this\" ?\n> >\n> > It should not happen as requestQueueFail is called by the worker if it\n> > fails to wait on a fence and the descriptor is already part of\n> > descriptors.\n> >\n> > Actually, there's a small window for this to race now that I look at\n> > it again\n> >\n> > --------------------------------------------------------------------------\n> > CameraDevice                            CameraWorker\n> > --------------------------------------------------------------------------\n> >\n> > worker_.queueRequest()          ->\n> >\n> >                                         worker.processRequest\n> >                                         fence.wait() == error\n> >                                 <-      requestQueueFailed.emit()\n> > requestQueueFailed()\n> >         it == descriptors_.end()\n> >         abort();\n> >\n> > descriptors_[req] = desc;\n> > --------------------------------------------------------------------------\n> >\n> > While the intended order of execution is\n> >\n> > CameraDevice                            CameraWorker\n> > --------------------------------------------------------------------------\n> >\n> > worker_.queueRequest()\n> > descriptors_[req] = desc;\n> >                                 ->\n> >                                         worker.processRequest\n> >                                         fence.wait() == error\n> >                                 <-      requestQueueFailed.emit()\n> > requestQueueFailed()\n> >         it == descriptors_.end()\n> >         abort();\n> >\n> > --------------------------------------------------------------------------\n> >\n> > I'll make sure to add the descriptor to descriptors_ before calling\n> > worker_.queuRequest().\n> >\n> > > requestQueueFailed() is invoked by worker::stop() and it is executed\n> >\n> > Why do you say it is invoked by stop() ? The signal is emitted if\n> > waiting on a fence fails as shown...\n> >\n>\n> I don't probably understand when fence waiting error happens.\n> Could you explain when it happens?\n\nI wish that doesn't happen, but sometimes things break\n\n[1:44:39.146219593] [4470] ERROR HAL camera_worker.cpp:122 Failed waiting for fence: 80: Timer expired\n\ndon't know why, we have an arbitrary timeout there, that should\nprobably be better characterized. But it guess protecting against\npoll() errors on a fence is not crazy.\n\n>\n> It looks like, in the next patch, descriptors_ must be empty in some places.\n\nNot in some places. after the camera has been stopped it must be\nempty, in CameraDevice::stop() and CameraDevice::close(), for 2 reasons:\n\n1) It validates the pipeline correctly returns all buffers at\nCamera::stop() time. If that doesn't happen we'll be missing error\nnotifications to the camera service, I feel it's better to validate\nthat early and ASSERT() during development than chaising errors across\nthe IPC boundary\n\n2) It validates that, if 1) holds, the HAL implementation\ncorrectly returns all requests to the camera service. As soon as I\nadded the ASSERT() I stumbled upon a crash, due to the fence wait being\nfailed as shown above. That lead to this patch.\n\n> How is it guaranteed that requestQueueFailed() is executed before\n> descriptors_ is empty?\n\nCameraDevice::processCaptureRequest          CameraWorker\n     A) worker_.queueRequest(req);\n     B) descriptors_[req] = move(desc)\n\n                                        ->      queueRequest()\n                                                   processReques()\n\n                                                   wait(fence) == error\n                                                   requestQueueFailed.emit()\n                                        <-\n    C)  descriptor.erase(req)\n\n\nAs said, A) and B) should probably be inverted to make sure the\ndescriptor is queued before calling worker_.queueRequest() (which can\nlead to emitting requestQueueFailed if waiting on a fence fails).\n\nIn this version there's a tiny window where the control could be given\nto the CameraWorker and returned to the CameraDevice after the\nrequestQueueFailed slot has been executed. In that case we would have\nA, C, B that would lead to the Fatal assertion being triggered.\n\nIf we guarantee that (A, B) are executed atomically, or simply do B\nbefore A, then when the slot is called the descriptor is guaranteed to\nbe in map.\n\nI might be missing some races though. Can you point me to the part\nthat doesn't work for you ?\n\nThanks\n  j\n\n\n\n> -Hiro\n> > > before descriptors_.clear() always.\n> > > Correct?\n> > >\n> > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org>\n> >\n> > > >   */\n> > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> > > > -       : camera_(camera)\n> > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> > > > +                              const camera3_capture_request_t *camera3Request)\n> > > > +       : camera_(camera), camera3Request_(camera3Request)\n> > > >  {\n> > > >         request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n> > > >  }\n> > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n> > > >  {\n> > > >         /* Async process the request on the worker which runs its own thread. */\n> > > >         worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> > > > -                            request);\n> > > > +                            this, request);\n> > > >  }\n> > > >\n> > > >  /*\n> > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n> > > >         return -errno;\n> > > >  }\n> > > >\n> > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n> > > >  {\n> > > >         /* Wait on all fences before queuing the Request. */\n> > > >         for (int fence : request->fences()) {\n> > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > >                 if (ret < 0) {\n> > > >                         LOG(HAL, Error) << \"Failed waiting for fence: \"\n> > > >                                         << fence << \": \" << strerror(-ret);\n> > > > +                       context->requestQueueFailed.emit(request);\n> >\n> > .... Here\n> >\n> > Is your question about stop() related to dispatching messages before\n> > the thread has actually been stopped ? Isn't that been fixed some time\n> > ago by overriding CameraWorker::exec() ?\n> >\n> > Thanks\n> >    j\n> >","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id 40A5BBDC71\n\tfor <parsemail@patchwork.libcamera.org>;\n\tMon,  6 Sep 2021 17:04:57 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id 9D8376916C;\n\tMon,  6 Sep 2021 19:04:56 +0200 (CEST)","from relay6-d.mail.gandi.net (relay6-d.mail.gandi.net\n\t[217.70.183.198])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id AA7AA60137\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tMon,  6 Sep 2021 19:04:55 +0200 (CEST)","(Authenticated sender: jacopo@jmondi.org)\n\tby relay6-d.mail.gandi.net (Postfix) with ESMTPSA id 2EF1BC0003;\n\tMon,  6 Sep 2021 17:04:54 +0000 (UTC)"],"Date":"Mon, 6 Sep 2021 19:05:42 +0200","From":"Jacopo Mondi <jacopo@jmondi.org>","To":"Hirokazu Honda <hiroh@chromium.org>","Message-ID":"<20210906170542.ooaptf3vz7gkwulr@uno.localdomain>","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>\n\t<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>\n\t<20210906162040.v6thmls25tiocwkp@uno.localdomain>\n\t<CAO5uPHNveN4R69zmDk1Ra_-BSuGT3Duud_2EyLOvRhB+58ACaQ@mail.gmail.com>","MIME-Version":"1.0","Content-Type":"text/plain; charset=utf-8","Content-Disposition":"inline","In-Reply-To":"<CAO5uPHNveN4R69zmDk1Ra_-BSuGT3Duud_2EyLOvRhB+58ACaQ@mail.gmail.com>","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Cc":"libcamera devel <libcamera-devel@lists.libcamera.org>","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}},{"id":19496,"web_url":"https://patchwork.libcamera.org/comment/19496/","msgid":"<CAO5uPHOx-7W5_SGXWkdEukKRZvo2hV8Zfo8NHkW1jv2aHLNtJA@mail.gmail.com>","date":"2021-09-07T08:32:29","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":63,"url":"https://patchwork.libcamera.org/api/people/63/","name":"Hirokazu Honda","email":"hiroh@chromium.org"},"content":"Hi Jacopo,\n\nOn Tue, Sep 7, 2021 at 2:04 AM Jacopo Mondi <jacopo@jmondi.org> wrote:\n>\n> Hi Hiro,\n>\n> On Tue, Sep 07, 2021 at 01:36:43AM +0900, Hirokazu Honda wrote:\n> > Hi Jacopo,\n> >\n> > On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > >\n> > > Hi Hiro,\n> > >\n> > > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote:\n> > > > Hi Jacopo, thank you for the patch.\n> > > >\n> > > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > > > >\n> > > > > The CameraDevice class stores one CaptureRequestDescriptor instance\n> > > > > for each capture request queued to the Camera. Such descriptors are\n> > > > > cleared when the Camera completes the Request in the\n> > > > > CameraDevice::requestComplete() slot.\n> > > > >\n> > > > > The CameraDevice class relies on an internal worker to off-load\n> > > > > waiting on synchronization fences, and does unconditionally store\n> > > > > the descriptor in the descriptors_ class member map to handle its\n> > > > > lifetime.\n> > > > >\n> > > > > If waiting on a fence fails, the created descriptor remains in the\n> > > > > descriptors_ map until the next call to stop() or close(), as\n> > > > > requestComplete() will never be called, and the camera\n> > > > > service is never notified about the queueing failure.\n> > > > >\n> > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> > > > > waiting on a fence has failed, by installing a new requestQueueFailed\n> > > > > Signal.\n> > > > >\n> > > > > The Signal is emitted by the CameraWorker internal worker if waiting on\n> > > > > a fence has failed. The CameraDevice is augmented with a slot connected\n> > > > > to the Signal that removes the descriptor from the map and notifies the\n> > > > > camera framework about the failure.\n> > > > >\n> > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n> > > > > ---\n> > > > >  src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n> > > > >  src/android/camera_device.h   |  1 +\n> > > > >  src/android/camera_worker.cpp | 10 ++++++----\n> > > > >  src/android/camera_worker.h   |  9 +++++++--\n> > > > >  4 files changed, 35 insertions(+), 7 deletions(-)\n> > > > >\n> > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> > > > > index a0ea138d9499..0ce9b699bfae 100644\n> > > > > --- a/src/android/camera_device.cpp\n> > > > > +++ b/src/android/camera_device.cpp\n> > > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n> > > > >          * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n> > > > >          * lifetime to the descriptor.\n> > > > >          */\n> > > > > -       request_ = std::make_unique<CaptureRequest>(camera);\n> > > > > +       request_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n> > > > >  }\n> > > > >\n> > > > >  /*\n> > > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n> > > > >         : id_(id), state_(State::Stopped), camera_(std::move(camera)),\n> > > > >           facing_(CAMERA_FACING_FRONT), orientation_(0)\n> > > > >  {\n> > > > > +       worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n> > > > >         camera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n> > > > >\n> > > > >         maker_ = \"libcamera\";\n> > > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n> > > > >         return 0;\n> > > > >  }\n> > > > >\n> > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> > > > > +{\n> > > > > +       /*\n> > > > > +        * Remove the descriptor from the map if the worker has failed\n> > > > > +        * to process it and notify the camera service about the failure.\n> > > > > +        */\n> > > > > +       MutexLocker descriptorsLock(descriptorsMutex_);\n> > > > > +       auto it = descriptors_.find(request->cookie());\n> > > > > +       if (it == descriptors_.end()) {\n> > > > > +               LOG(HAL, Fatal)\n> > > >\n> > > > If I understand correctly, this must not happen because\n> > >\n> > > Do you mean the it == end() with \"this\" ?\n> > >\n> > > It should not happen as requestQueueFail is called by the worker if it\n> > > fails to wait on a fence and the descriptor is already part of\n> > > descriptors.\n> > >\n> > > Actually, there's a small window for this to race now that I look at\n> > > it again\n> > >\n> > > --------------------------------------------------------------------------\n> > > CameraDevice                            CameraWorker\n> > > --------------------------------------------------------------------------\n> > >\n> > > worker_.queueRequest()          ->\n> > >\n> > >                                         worker.processRequest\n> > >                                         fence.wait() == error\n> > >                                 <-      requestQueueFailed.emit()\n> > > requestQueueFailed()\n> > >         it == descriptors_.end()\n> > >         abort();\n> > >\n> > > descriptors_[req] = desc;\n> > > --------------------------------------------------------------------------\n> > >\n> > > While the intended order of execution is\n> > >\n> > > CameraDevice                            CameraWorker\n> > > --------------------------------------------------------------------------\n> > >\n> > > worker_.queueRequest()\n> > > descriptors_[req] = desc;\n> > >                                 ->\n> > >                                         worker.processRequest\n> > >                                         fence.wait() == error\n> > >                                 <-      requestQueueFailed.emit()\n> > > requestQueueFailed()\n> > >         it == descriptors_.end()\n> > >         abort();\n> > >\n> > > --------------------------------------------------------------------------\n> > >\n> > > I'll make sure to add the descriptor to descriptors_ before calling\n> > > worker_.queuRequest().\n> > >\n> > > > requestQueueFailed() is invoked by worker::stop() and it is executed\n> > >\n> > > Why do you say it is invoked by stop() ? The signal is emitted if\n> > > waiting on a fence fails as shown...\n> > >\n> >\n> > I don't probably understand when fence waiting error happens.\n> > Could you explain when it happens?\n>\n> I wish that doesn't happen, but sometimes things break\n>\n> [1:44:39.146219593] [4470] ERROR HAL camera_worker.cpp:122 Failed waiting for fence: 80: Timer expired\n>\n> don't know why, we have an arbitrary timeout there, that should\n> probably be better characterized. But it guess protecting against\n> poll() errors on a fence is not crazy.\n>\n> >\n> > It looks like, in the next patch, descriptors_ must be empty in some places.\n>\n> Not in some places. after the camera has been stopped it must be\n> empty, in CameraDevice::stop() and CameraDevice::close(), for 2 reasons:\n>\n> 1) It validates the pipeline correctly returns all buffers at\n> Camera::stop() time. If that doesn't happen we'll be missing error\n> notifications to the camera service, I feel it's better to validate\n> that early and ASSERT() during development than chaising errors across\n> the IPC boundary\n>\n> 2) It validates that, if 1) holds, the HAL implementation\n> correctly returns all requests to the camera service. As soon as I\n> added the ASSERT() I stumbled upon a crash, due to the fence wait being\n> failed as shown above. That lead to this patch.\n>\n> > How is it guaranteed that requestQueueFailed() is executed before\n> > descriptors_ is empty?\n>\n> CameraDevice::processCaptureRequest          CameraWorker\n>      A) worker_.queueRequest(req);\n>      B) descriptors_[req] = move(desc)\n>\n>                                         ->      queueRequest()\n>                                                    processReques()\n>\n>                                                    wait(fence) == error\n>                                                    requestQueueFailed.emit()\n>                                         <-\n>     C)  descriptor.erase(req)\n>\n>\n> As said, A) and B) should probably be inverted to make sure the\n> descriptor is queued before calling worker_.queueRequest() (which can\n> lead to emitting requestQueueFailed if waiting on a fence fails).\n>\n> In this version there's a tiny window where the control could be given\n> to the CameraWorker and returned to the CameraDevice after the\n> requestQueueFailed slot has been executed. In that case we would have\n> A, C, B that would lead to the Fatal assertion being triggered.\n>\n> If we guarantee that (A, B) are executed atomically, or simply do B\n> before A, then when the slot is called the descriptor is guaranteed to\n> be in map.\n>\n> I might be missing some races though. Can you point me to the part\n> that doesn't work for you ?\n>\n\nI think, when CameraDevice::stop() and CameraDevice::flush() is\ncalled, there may be a pending request waiting on fence.\nSo considering from the ASSERT, Worker::stop() causes\nrequestQueueFailed() for the waiting request.\nBut if I remember/understand correctly, Worker::stop() flushes pending\nrequests in a worker queue, that is, requests for which\nprocessRequest() has not been called.\nTherefore descriptors_ might not be empty thanks to the request\nwaiting on fence.\nI probably miss something here.\n\n-Hiro\n> Thanks\n>   j\n>\n>\n>\n> > -Hiro\n> > > > before descriptors_.clear() always.\n> > > > Correct?\n> > > >\n> > > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org>\n> > >\n> > > > >   */\n> > > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> > > > > -       : camera_(camera)\n> > > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> > > > > +                              const camera3_capture_request_t *camera3Request)\n> > > > > +       : camera_(camera), camera3Request_(camera3Request)\n> > > > >  {\n> > > > >         request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n> > > > >  }\n> > > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n> > > > >  {\n> > > > >         /* Async process the request on the worker which runs its own thread. */\n> > > > >         worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> > > > > -                            request);\n> > > > > +                            this, request);\n> > > > >  }\n> > > > >\n> > > > >  /*\n> > > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n> > > > >         return -errno;\n> > > > >  }\n> > > > >\n> > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n> > > > >  {\n> > > > >         /* Wait on all fences before queuing the Request. */\n> > > > >         for (int fence : request->fences()) {\n> > > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > > >                 if (ret < 0) {\n> > > > >                         LOG(HAL, Error) << \"Failed waiting for fence: \"\n> > > > >                                         << fence << \": \" << strerror(-ret);\n> > > > > +                       context->requestQueueFailed.emit(request);\n> > >\n> > > .... Here\n> > >\n> > > Is your question about stop() related to dispatching messages before\n> > > the thread has actually been stopped ? Isn't that been fixed some time\n> > > ago by overriding CameraWorker::exec() ?\n> > >\n> > > Thanks\n> > >    j\n> > >","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id AEBC2BDC71\n\tfor <parsemail@patchwork.libcamera.org>;\n\tTue,  7 Sep 2021 08:32:43 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id 304836916C;\n\tTue,  7 Sep 2021 10:32:43 +0200 (CEST)","from mail-ej1-x635.google.com (mail-ej1-x635.google.com\n\t[IPv6:2a00:1450:4864:20::635])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id 42EB169166\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tTue,  7 Sep 2021 10:32:41 +0200 (CEST)","by mail-ej1-x635.google.com with SMTP id u14so18094487ejf.13\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tTue, 07 Sep 2021 01:32:41 -0700 (PDT)"],"Authentication-Results":"lancelot.ideasonboard.com; dkim=pass (1024-bit key;\n\tunprotected) header.d=chromium.org header.i=@chromium.org\n\theader.b=\"bZMh+KS2\"; dkim-atps=neutral","DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org;\n\ts=google; \n\th=mime-version:references:in-reply-to:from:date:message-id:subject:to\n\t:cc; bh=aB6HkXmfm+RmVIZPRWPta6og9yfQzZO9E05k75SGLKs=;\n\tb=bZMh+KS2Br21L9Nw/dtELyFOKAW9KBgl8y3xVn130sl57bmbuh+7lkynqJdISLY7iL\n\tUNe8R4SfQSMcuK6Oc6ylbbttR6BcIJGKOvzjkTA0T2WI3ncoucBIxZBjLQgmkIDMRM+s\n\t87NHXUYRKvq+sUXPyJIK0nF2pfG41MOGYXRnA=","X-Google-DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/relaxed;\n\td=1e100.net; s=20161025;\n\th=x-gm-message-state:mime-version:references:in-reply-to:from:date\n\t:message-id:subject:to:cc;\n\tbh=aB6HkXmfm+RmVIZPRWPta6og9yfQzZO9E05k75SGLKs=;\n\tb=k4+Ulb+YBOuffMY8nyVfGyAN7R8IsZOfcCSB0qmxH1B5cxUMbg9GwBmcIS0+9b0ueo\n\tGmCh2kGQjosCplu20AbwPNB8whlsC2P12FhzyTuvR++6/rwVrPN1F/al54Z7YSwgoOgv\n\tWJN+TYC+N8liccr0hr4iAfPQImUe/r6BQZWmVUDj/JEI/goCWurCxhAeDKPrOaTMweyt\n\tSqKNBn7dYVTfwkdhSUBRqsvUNFTY8cskMxeMQx+3hOtLfyoYTJXq32atY1PqHWwMay9J\n\trx5XPfWMCeQAMHQ4k+cZk0QPCex8xJpY0sEKyW/IWm/2HKD438b/IldUKFCD2fPxEjjv\n\tLiHA==","X-Gm-Message-State":"AOAM533UcXH0J+zX+WesVNYXSX47+8W3qajUUny9s/hXK93RDrXqak2P\n\tdv5g53/ZruBQYxLKFnPap0W93EzfrPH7cD6ioH91CsLppn7rCQ==","X-Google-Smtp-Source":"ABdhPJw8nbzfZ2T0ryBS9qvC0P1GitIQCUmA+Nm0pSbgW2ATIgDNMQsU6h8v9YF6g1DsYGi/2yBX6HAUm1/F83x9jKA=","X-Received":"by 2002:a17:906:3adb:: with SMTP id\n\tz27mr17591773ejd.291.1631003560744; \n\tTue, 07 Sep 2021 01:32:40 -0700 (PDT)","MIME-Version":"1.0","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>\n\t<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>\n\t<20210906162040.v6thmls25tiocwkp@uno.localdomain>\n\t<CAO5uPHNveN4R69zmDk1Ra_-BSuGT3Duud_2EyLOvRhB+58ACaQ@mail.gmail.com>\n\t<20210906170542.ooaptf3vz7gkwulr@uno.localdomain>","In-Reply-To":"<20210906170542.ooaptf3vz7gkwulr@uno.localdomain>","From":"Hirokazu Honda <hiroh@chromium.org>","Date":"Tue, 7 Sep 2021 17:32:29 +0900","Message-ID":"<CAO5uPHOx-7W5_SGXWkdEukKRZvo2hV8Zfo8NHkW1jv2aHLNtJA@mail.gmail.com>","To":"Jacopo Mondi <jacopo@jmondi.org>","Content-Type":"text/plain; charset=\"UTF-8\"","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Cc":"libcamera devel <libcamera-devel@lists.libcamera.org>","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}},{"id":19511,"web_url":"https://patchwork.libcamera.org/comment/19511/","msgid":"<20210907135244.bt6la3asp5qjrqtx@uno.localdomain>","date":"2021-09-07T13:52:44","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":3,"url":"https://patchwork.libcamera.org/api/people/3/","name":"Jacopo Mondi","email":"jacopo@jmondi.org"},"content":"Hi Hiro,\n\nOn Tue, Sep 07, 2021 at 05:32:29PM +0900, Hirokazu Honda wrote:\n> Hi Jacopo,\n>\n> On Tue, Sep 7, 2021 at 2:04 AM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> >\n> > Hi Hiro,\n> >\n> > On Tue, Sep 07, 2021 at 01:36:43AM +0900, Hirokazu Honda wrote:\n> > > Hi Jacopo,\n> > >\n> > > On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > > >\n> > > > Hi Hiro,\n> > > >\n> > > > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote:\n> > > > > Hi Jacopo, thank you for the patch.\n> > > > >\n> > > > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > > > > >\n> > > > > > The CameraDevice class stores one CaptureRequestDescriptor instance\n> > > > > > for each capture request queued to the Camera. Such descriptors are\n> > > > > > cleared when the Camera completes the Request in the\n> > > > > > CameraDevice::requestComplete() slot.\n> > > > > >\n> > > > > > The CameraDevice class relies on an internal worker to off-load\n> > > > > > waiting on synchronization fences, and does unconditionally store\n> > > > > > the descriptor in the descriptors_ class member map to handle its\n> > > > > > lifetime.\n> > > > > >\n> > > > > > If waiting on a fence fails, the created descriptor remains in the\n> > > > > > descriptors_ map until the next call to stop() or close(), as\n> > > > > > requestComplete() will never be called, and the camera\n> > > > > > service is never notified about the queueing failure.\n> > > > > >\n> > > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> > > > > > waiting on a fence has failed, by installing a new requestQueueFailed\n> > > > > > Signal.\n> > > > > >\n> > > > > > The Signal is emitted by the CameraWorker internal worker if waiting on\n> > > > > > a fence has failed. The CameraDevice is augmented with a slot connected\n> > > > > > to the Signal that removes the descriptor from the map and notifies the\n> > > > > > camera framework about the failure.\n> > > > > >\n> > > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n> > > > > > ---\n> > > > > >  src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n> > > > > >  src/android/camera_device.h   |  1 +\n> > > > > >  src/android/camera_worker.cpp | 10 ++++++----\n> > > > > >  src/android/camera_worker.h   |  9 +++++++--\n> > > > > >  4 files changed, 35 insertions(+), 7 deletions(-)\n> > > > > >\n> > > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> > > > > > index a0ea138d9499..0ce9b699bfae 100644\n> > > > > > --- a/src/android/camera_device.cpp\n> > > > > > +++ b/src/android/camera_device.cpp\n> > > > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n> > > > > >          * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n> > > > > >          * lifetime to the descriptor.\n> > > > > >          */\n> > > > > > -       request_ = std::make_unique<CaptureRequest>(camera);\n> > > > > > +       request_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n> > > > > >  }\n> > > > > >\n> > > > > >  /*\n> > > > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n> > > > > >         : id_(id), state_(State::Stopped), camera_(std::move(camera)),\n> > > > > >           facing_(CAMERA_FACING_FRONT), orientation_(0)\n> > > > > >  {\n> > > > > > +       worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n> > > > > >         camera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n> > > > > >\n> > > > > >         maker_ = \"libcamera\";\n> > > > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n> > > > > >         return 0;\n> > > > > >  }\n> > > > > >\n> > > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> > > > > > +{\n> > > > > > +       /*\n> > > > > > +        * Remove the descriptor from the map if the worker has failed\n> > > > > > +        * to process it and notify the camera service about the failure.\n> > > > > > +        */\n> > > > > > +       MutexLocker descriptorsLock(descriptorsMutex_);\n> > > > > > +       auto it = descriptors_.find(request->cookie());\n> > > > > > +       if (it == descriptors_.end()) {\n> > > > > > +               LOG(HAL, Fatal)\n> > > > >\n> > > > > If I understand correctly, this must not happen because\n> > > >\n> > > > Do you mean the it == end() with \"this\" ?\n> > > >\n> > > > It should not happen as requestQueueFail is called by the worker if it\n> > > > fails to wait on a fence and the descriptor is already part of\n> > > > descriptors.\n> > > >\n> > > > Actually, there's a small window for this to race now that I look at\n> > > > it again\n> > > >\n> > > > --------------------------------------------------------------------------\n> > > > CameraDevice                            CameraWorker\n> > > > --------------------------------------------------------------------------\n> > > >\n> > > > worker_.queueRequest()          ->\n> > > >\n> > > >                                         worker.processRequest\n> > > >                                         fence.wait() == error\n> > > >                                 <-      requestQueueFailed.emit()\n> > > > requestQueueFailed()\n> > > >         it == descriptors_.end()\n> > > >         abort();\n> > > >\n> > > > descriptors_[req] = desc;\n> > > > --------------------------------------------------------------------------\n> > > >\n> > > > While the intended order of execution is\n> > > >\n> > > > CameraDevice                            CameraWorker\n> > > > --------------------------------------------------------------------------\n> > > >\n> > > > worker_.queueRequest()\n> > > > descriptors_[req] = desc;\n> > > >                                 ->\n> > > >                                         worker.processRequest\n> > > >                                         fence.wait() == error\n> > > >                                 <-      requestQueueFailed.emit()\n> > > > requestQueueFailed()\n> > > >         it == descriptors_.end()\n> > > >         abort();\n> > > >\n> > > > --------------------------------------------------------------------------\n> > > >\n> > > > I'll make sure to add the descriptor to descriptors_ before calling\n> > > > worker_.queuRequest().\n> > > >\n> > > > > requestQueueFailed() is invoked by worker::stop() and it is executed\n> > > >\n> > > > Why do you say it is invoked by stop() ? The signal is emitted if\n> > > > waiting on a fence fails as shown...\n> > > >\n> > >\n> > > I don't probably understand when fence waiting error happens.\n> > > Could you explain when it happens?\n> >\n> > I wish that doesn't happen, but sometimes things break\n> >\n> > [1:44:39.146219593] [4470] ERROR HAL camera_worker.cpp:122 Failed waiting for fence: 80: Timer expired\n> >\n> > don't know why, we have an arbitrary timeout there, that should\n> > probably be better characterized. But it guess protecting against\n> > poll() errors on a fence is not crazy.\n> >\n> > >\n> > > It looks like, in the next patch, descriptors_ must be empty in some places.\n> >\n> > Not in some places. after the camera has been stopped it must be\n> > empty, in CameraDevice::stop() and CameraDevice::close(), for 2 reasons:\n> >\n> > 1) It validates the pipeline correctly returns all buffers at\n> > Camera::stop() time. If that doesn't happen we'll be missing error\n> > notifications to the camera service, I feel it's better to validate\n> > that early and ASSERT() during development than chaising errors across\n> > the IPC boundary\n> >\n> > 2) It validates that, if 1) holds, the HAL implementation\n> > correctly returns all requests to the camera service. As soon as I\n> > added the ASSERT() I stumbled upon a crash, due to the fence wait being\n> > failed as shown above. That lead to this patch.\n> >\n> > > How is it guaranteed that requestQueueFailed() is executed before\n> > > descriptors_ is empty?\n> >\n> > CameraDevice::processCaptureRequest          CameraWorker\n> >      A) worker_.queueRequest(req);\n> >      B) descriptors_[req] = move(desc)\n> >\n> >                                         ->      queueRequest()\n> >                                                    processReques()\n> >\n> >                                                    wait(fence) == error\n> >                                                    requestQueueFailed.emit()\n> >                                         <-\n> >     C)  descriptor.erase(req)\n> >\n> >\n> > As said, A) and B) should probably be inverted to make sure the\n> > descriptor is queued before calling worker_.queueRequest() (which can\n> > lead to emitting requestQueueFailed if waiting on a fence fails).\n> >\n> > In this version there's a tiny window where the control could be given\n> > to the CameraWorker and returned to the CameraDevice after the\n> > requestQueueFailed slot has been executed. In that case we would have\n> > A, C, B that would lead to the Fatal assertion being triggered.\n> >\n> > If we guarantee that (A, B) are executed atomically, or simply do B\n> > before A, then when the slot is called the descriptor is guaranteed to\n> > be in map.\n> >\n> > I might be missing some races though. Can you point me to the part\n> > that doesn't work for you ?\n> >\n>\n> I think, when CameraDevice::stop() and CameraDevice::flush() is\n> called, there may be a pending request waiting on fence.\n> So considering from the ASSERT, Worker::stop() causes\n> requestQueueFailed() for the waiting request.\n> But if I remember/understand correctly, Worker::stop() flushes pending\n> requests in a worker queue, that is, requests for which\n> processRequest() has not been called.\n> Therefore descriptors_ might not be empty thanks to the request\n> waiting on fence.\n> I probably miss something here.\n\nIt is my understanding that since commit: 7de2daf653bc (\"android:\ncamera_worker: Process all queued requests when stopping\") all the\nmessages pending in the queue (the async processRequest() calls) are\ndispatched to the Worker which processes them until its message queue\nits empty. This, in my understanding means that all the requests\nposted to the CameraWorker with queueRequest() are processed and\nqueued to Camera (if there are no isses waiting on the fence) or\ndeleted with the new requestQueueFailed signal if waiting on the fence\nfails.\n\nIf Request are queued to the Camera, Camera::stop() will complete them\n(and their descriptors deleted), if instead requestQueueFailed is\nemitted, the descriptor is now deleted by the associated slot in\nCameraDevice.\n\nNow, that's my understanding and it's based on the assumptions that:\n\n1) All pending messages are delivered (which should be guaranteed by 7de2daf653bc\n2) The async processRequest() function is invoked for each message\ndispatched after the thread has stopped\n3) All the processRequest() async calls either succesfully wait on\nthe fence and queue the Request to the Camera, or signal a failure in\nwaiting and emit the requestQueueFailed signal\n4) All of this happens before worker_.stop() returns\n\nExplicitly Cc-ed Laurent to confirm or disprove my understanding.\n\nAnyway, I went probably too far with this patch as it introduces\nsomething that makes me feel more confortable (ASSERT no requests are\nleft in-flight when stopping the camera) but it is not a strict\nrequirement nor it fixes any immediate bug, so if it proves to be\ncontroversial I'll be happy to drop and move forward with more\npressing issues.\n\nThanks\n   j\n\n>\n> -Hiro\n> > Thanks\n> >   j\n> >\n> >\n> >\n> > > -Hiro\n> > > > > before descriptors_.clear() always.\n> > > > > Correct?\n> > > > >\n> > > > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org>\n> > > >\n> > > > > >   */\n> > > > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> > > > > > -       : camera_(camera)\n> > > > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> > > > > > +                              const camera3_capture_request_t *camera3Request)\n> > > > > > +       : camera_(camera), camera3Request_(camera3Request)\n> > > > > >  {\n> > > > > >         request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n> > > > > >  }\n> > > > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n> > > > > >  {\n> > > > > >         /* Async process the request on the worker which runs its own thread. */\n> > > > > >         worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> > > > > > -                            request);\n> > > > > > +                            this, request);\n> > > > > >  }\n> > > > > >\n> > > > > >  /*\n> > > > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n> > > > > >         return -errno;\n> > > > > >  }\n> > > > > >\n> > > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n> > > > > >  {\n> > > > > >         /* Wait on all fences before queuing the Request. */\n> > > > > >         for (int fence : request->fences()) {\n> > > > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > > > >                 if (ret < 0) {\n> > > > > >                         LOG(HAL, Error) << \"Failed waiting for fence: \"\n> > > > > >                                         << fence << \": \" << strerror(-ret);\n> > > > > > +                       context->requestQueueFailed.emit(request);\n> > > >\n> > > > .... Here\n> > > >\n> > > > Is your question about stop() related to dispatching messages before\n> > > > the thread has actually been stopped ? Isn't that been fixed some time\n> > > > ago by overriding CameraWorker::exec() ?\n> > > >\n> > > > Thanks\n> > > >    j\n> > > >","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id B79F8BE175\n\tfor <parsemail@patchwork.libcamera.org>;\n\tTue,  7 Sep 2021 13:52:00 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id 0D7A06916C;\n\tTue,  7 Sep 2021 15:52:00 +0200 (CEST)","from relay8-d.mail.gandi.net (relay8-d.mail.gandi.net\n\t[217.70.183.201])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id 5A15460251\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tTue,  7 Sep 2021 15:51:58 +0200 (CEST)","(Authenticated sender: jacopo@jmondi.org)\n\tby relay8-d.mail.gandi.net (Postfix) with ESMTPSA id 98E8B1BF207;\n\tTue,  7 Sep 2021 13:51:57 +0000 (UTC)"],"Date":"Tue, 7 Sep 2021 15:52:44 +0200","From":"Jacopo Mondi <jacopo@jmondi.org>","To":"Hirokazu Honda <hiroh@chromium.org>,\n\tLaurent Pinchart <laurent.pinchart@ideasonboard.com>","Message-ID":"<20210907135244.bt6la3asp5qjrqtx@uno.localdomain>","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>\n\t<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>\n\t<20210906162040.v6thmls25tiocwkp@uno.localdomain>\n\t<CAO5uPHNveN4R69zmDk1Ra_-BSuGT3Duud_2EyLOvRhB+58ACaQ@mail.gmail.com>\n\t<20210906170542.ooaptf3vz7gkwulr@uno.localdomain>\n\t<CAO5uPHOx-7W5_SGXWkdEukKRZvo2hV8Zfo8NHkW1jv2aHLNtJA@mail.gmail.com>","MIME-Version":"1.0","Content-Type":"text/plain; charset=utf-8","Content-Disposition":"inline","In-Reply-To":"<CAO5uPHOx-7W5_SGXWkdEukKRZvo2hV8Zfo8NHkW1jv2aHLNtJA@mail.gmail.com>","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Cc":"libcamera devel <libcamera-devel@lists.libcamera.org>","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}},{"id":19530,"web_url":"https://patchwork.libcamera.org/comment/19530/","msgid":"<CAO5uPHOCvDZPt9shHJ9d=w+_5AGGZJEhu4Rom5rjRHBypQKF=A@mail.gmail.com>","date":"2021-09-08T07:34:48","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":63,"url":"https://patchwork.libcamera.org/api/people/63/","name":"Hirokazu Honda","email":"hiroh@chromium.org"},"content":"Hi Jacopo,\n\nOn Tue, Sep 7, 2021 at 10:51 PM Jacopo Mondi <jacopo@jmondi.org> wrote:\n>\n> Hi Hiro,\n>\n> On Tue, Sep 07, 2021 at 05:32:29PM +0900, Hirokazu Honda wrote:\n> > Hi Jacopo,\n> >\n> > On Tue, Sep 7, 2021 at 2:04 AM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > >\n> > > Hi Hiro,\n> > >\n> > > On Tue, Sep 07, 2021 at 01:36:43AM +0900, Hirokazu Honda wrote:\n> > > > Hi Jacopo,\n> > > >\n> > > > On Tue, Sep 7, 2021 at 1:19 AM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > > > >\n> > > > > Hi Hiro,\n> > > > >\n> > > > > On Mon, Sep 06, 2021 at 11:55:15PM +0900, Hirokazu Honda wrote:\n> > > > > > Hi Jacopo, thank you for the patch.\n> > > > > >\n> > > > > > On Mon, Sep 6, 2021 at 11:01 PM Jacopo Mondi <jacopo@jmondi.org> wrote:\n> > > > > > >\n> > > > > > > The CameraDevice class stores one CaptureRequestDescriptor instance\n> > > > > > > for each capture request queued to the Camera. Such descriptors are\n> > > > > > > cleared when the Camera completes the Request in the\n> > > > > > > CameraDevice::requestComplete() slot.\n> > > > > > >\n> > > > > > > The CameraDevice class relies on an internal worker to off-load\n> > > > > > > waiting on synchronization fences, and does unconditionally store\n> > > > > > > the descriptor in the descriptors_ class member map to handle its\n> > > > > > > lifetime.\n> > > > > > >\n> > > > > > > If waiting on a fence fails, the created descriptor remains in the\n> > > > > > > descriptors_ map until the next call to stop() or close(), as\n> > > > > > > requestComplete() will never be called, and the camera\n> > > > > > > service is never notified about the queueing failure.\n> > > > > > >\n> > > > > > > Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> > > > > > > waiting on a fence has failed, by installing a new requestQueueFailed\n> > > > > > > Signal.\n> > > > > > >\n> > > > > > > The Signal is emitted by the CameraWorker internal worker if waiting on\n> > > > > > > a fence has failed. The CameraDevice is augmented with a slot connected\n> > > > > > > to the Signal that removes the descriptor from the map and notifies the\n> > > > > > > camera framework about the failure.\n> > > > > > >\n> > > > > > > Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n> > > > > > > ---\n> > > > > > >  src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n> > > > > > >  src/android/camera_device.h   |  1 +\n> > > > > > >  src/android/camera_worker.cpp | 10 ++++++----\n> > > > > > >  src/android/camera_worker.h   |  9 +++++++--\n> > > > > > >  4 files changed, 35 insertions(+), 7 deletions(-)\n> > > > > > >\n> > > > > > > diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> > > > > > > index a0ea138d9499..0ce9b699bfae 100644\n> > > > > > > --- a/src/android/camera_device.cpp\n> > > > > > > +++ b/src/android/camera_device.cpp\n> > > > > > > @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n> > > > > > >          * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n> > > > > > >          * lifetime to the descriptor.\n> > > > > > >          */\n> > > > > > > -       request_ = std::make_unique<CaptureRequest>(camera);\n> > > > > > > +       request_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n> > > > > > >  }\n> > > > > > >\n> > > > > > >  /*\n> > > > > > > @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n> > > > > > >         : id_(id), state_(State::Stopped), camera_(std::move(camera)),\n> > > > > > >           facing_(CAMERA_FACING_FRONT), orientation_(0)\n> > > > > > >  {\n> > > > > > > +       worker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n> > > > > > >         camera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n> > > > > > >\n> > > > > > >         maker_ = \"libcamera\";\n> > > > > > > @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n> > > > > > >         return 0;\n> > > > > > >  }\n> > > > > > >\n> > > > > > > +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> > > > > > > +{\n> > > > > > > +       /*\n> > > > > > > +        * Remove the descriptor from the map if the worker has failed\n> > > > > > > +        * to process it and notify the camera service about the failure.\n> > > > > > > +        */\n> > > > > > > +       MutexLocker descriptorsLock(descriptorsMutex_);\n> > > > > > > +       auto it = descriptors_.find(request->cookie());\n> > > > > > > +       if (it == descriptors_.end()) {\n> > > > > > > +               LOG(HAL, Fatal)\n> > > > > >\n> > > > > > If I understand correctly, this must not happen because\n> > > > >\n> > > > > Do you mean the it == end() with \"this\" ?\n> > > > >\n> > > > > It should not happen as requestQueueFail is called by the worker if it\n> > > > > fails to wait on a fence and the descriptor is already part of\n> > > > > descriptors.\n> > > > >\n> > > > > Actually, there's a small window for this to race now that I look at\n> > > > > it again\n> > > > >\n> > > > > --------------------------------------------------------------------------\n> > > > > CameraDevice                            CameraWorker\n> > > > > --------------------------------------------------------------------------\n> > > > >\n> > > > > worker_.queueRequest()          ->\n> > > > >\n> > > > >                                         worker.processRequest\n> > > > >                                         fence.wait() == error\n> > > > >                                 <-      requestQueueFailed.emit()\n> > > > > requestQueueFailed()\n> > > > >         it == descriptors_.end()\n> > > > >         abort();\n> > > > >\n> > > > > descriptors_[req] = desc;\n> > > > > --------------------------------------------------------------------------\n> > > > >\n> > > > > While the intended order of execution is\n> > > > >\n> > > > > CameraDevice                            CameraWorker\n> > > > > --------------------------------------------------------------------------\n> > > > >\n> > > > > worker_.queueRequest()\n> > > > > descriptors_[req] = desc;\n> > > > >                                 ->\n> > > > >                                         worker.processRequest\n> > > > >                                         fence.wait() == error\n> > > > >                                 <-      requestQueueFailed.emit()\n> > > > > requestQueueFailed()\n> > > > >         it == descriptors_.end()\n> > > > >         abort();\n> > > > >\n> > > > > --------------------------------------------------------------------------\n> > > > >\n> > > > > I'll make sure to add the descriptor to descriptors_ before calling\n> > > > > worker_.queuRequest().\n> > > > >\n> > > > > > requestQueueFailed() is invoked by worker::stop() and it is executed\n> > > > >\n> > > > > Why do you say it is invoked by stop() ? The signal is emitted if\n> > > > > waiting on a fence fails as shown...\n> > > > >\n> > > >\n> > > > I don't probably understand when fence waiting error happens.\n> > > > Could you explain when it happens?\n> > >\n> > > I wish that doesn't happen, but sometimes things break\n> > >\n> > > [1:44:39.146219593] [4470] ERROR HAL camera_worker.cpp:122 Failed waiting for fence: 80: Timer expired\n> > >\n> > > don't know why, we have an arbitrary timeout there, that should\n> > > probably be better characterized. But it guess protecting against\n> > > poll() errors on a fence is not crazy.\n> > >\n> > > >\n> > > > It looks like, in the next patch, descriptors_ must be empty in some places.\n> > >\n> > > Not in some places. after the camera has been stopped it must be\n> > > empty, in CameraDevice::stop() and CameraDevice::close(), for 2 reasons:\n> > >\n> > > 1) It validates the pipeline correctly returns all buffers at\n> > > Camera::stop() time. If that doesn't happen we'll be missing error\n> > > notifications to the camera service, I feel it's better to validate\n> > > that early and ASSERT() during development than chaising errors across\n> > > the IPC boundary\n> > >\n> > > 2) It validates that, if 1) holds, the HAL implementation\n> > > correctly returns all requests to the camera service. As soon as I\n> > > added the ASSERT() I stumbled upon a crash, due to the fence wait being\n> > > failed as shown above. That lead to this patch.\n> > >\n> > > > How is it guaranteed that requestQueueFailed() is executed before\n> > > > descriptors_ is empty?\n> > >\n> > > CameraDevice::processCaptureRequest          CameraWorker\n> > >      A) worker_.queueRequest(req);\n> > >      B) descriptors_[req] = move(desc)\n> > >\n> > >                                         ->      queueRequest()\n> > >                                                    processReques()\n> > >\n> > >                                                    wait(fence) == error\n> > >                                                    requestQueueFailed.emit()\n> > >                                         <-\n> > >     C)  descriptor.erase(req)\n> > >\n> > >\n> > > As said, A) and B) should probably be inverted to make sure the\n> > > descriptor is queued before calling worker_.queueRequest() (which can\n> > > lead to emitting requestQueueFailed if waiting on a fence fails).\n> > >\n> > > In this version there's a tiny window where the control could be given\n> > > to the CameraWorker and returned to the CameraDevice after the\n> > > requestQueueFailed slot has been executed. In that case we would have\n> > > A, C, B that would lead to the Fatal assertion being triggered.\n> > >\n> > > If we guarantee that (A, B) are executed atomically, or simply do B\n> > > before A, then when the slot is called the descriptor is guaranteed to\n> > > be in map.\n> > >\n> > > I might be missing some races though. Can you point me to the part\n> > > that doesn't work for you ?\n> > >\n> >\n> > I think, when CameraDevice::stop() and CameraDevice::flush() is\n> > called, there may be a pending request waiting on fence.\n> > So considering from the ASSERT, Worker::stop() causes\n> > requestQueueFailed() for the waiting request.\n> > But if I remember/understand correctly, Worker::stop() flushes pending\n> > requests in a worker queue, that is, requests for which\n> > processRequest() has not been called.\n> > Therefore descriptors_ might not be empty thanks to the request\n> > waiting on fence.\n> > I probably miss something here.\n>\n> It is my understanding that since commit: 7de2daf653bc (\"android:\n> camera_worker: Process all queued requests when stopping\") all the\n> messages pending in the queue (the async processRequest() calls) are\n> dispatched to the Worker which processes them until its message queue\n> its empty. This, in my understanding means that all the requests\n> posted to the CameraWorker with queueRequest() are processed and\n> queued to Camera (if there are no isses waiting on the fence) or\n> deleted with the new requestQueueFailed signal if waiting on the fence\n> fails.\n>\n> If Request are queued to the Camera, Camera::stop() will complete them\n> (and their descriptors deleted), if instead requestQueueFailed is\n> emitted, the descriptor is now deleted by the associated slot in\n> CameraDevice.\n>\n> Now, that's my understanding and it's based on the assumptions that:\n>\n> 1) All pending messages are delivered (which should be guaranteed by 7de2daf653bc\n> 2) The async processRequest() function is invoked for each message\n> dispatched after the thread has stopped\n> 3) All the processRequest() async calls either succesfully wait on\n> the fence and queue the Request to the Camera, or signal a failure in\n> waiting and emit the requestQueueFailed signal\n> 4) All of this happens before worker_.stop() returns\n>\n> Explicitly Cc-ed Laurent to confirm or disprove my understanding.\n>\n> Anyway, I went probably too far with this patch as it introduces\n> something that makes me feel more confortable (ASSERT no requests are\n> left in-flight when stopping the camera) but it is not a strict\n> requirement nor it fixes any immediate bug, so if it proves to be\n> controversial I'll be happy to drop and move forward with more\n> pressing issues.\n>\n\nI see.\nThe LOG(FATAL) and ASSERT() concerns me as I forget how the things go\nwith stop().\nLet's wait Lauren'ts reply.\n\n-Hiro\n\n> Thanks\n>    j\n>\n> >\n> > -Hiro\n> > > Thanks\n> > >   j\n> > >\n> > >\n> > >\n> > > > -Hiro\n> > > > > > before descriptors_.clear() always.\n> > > > > > Correct?\n> > > > > >\n> > > > > > Reviewed-by: Hirokazu Honda <hiroh@chromium.org>\n> > > > >\n> > > > > > >   */\n> > > > > > > -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> > > > > > > -       : camera_(camera)\n> > > > > > > +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> > > > > > > +                              const camera3_capture_request_t *camera3Request)\n> > > > > > > +       : camera_(camera), camera3Request_(camera3Request)\n> > > > > > >  {\n> > > > > > >         request_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n> > > > > > >  }\n> > > > > > > @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n> > > > > > >  {\n> > > > > > >         /* Async process the request on the worker which runs its own thread. */\n> > > > > > >         worker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> > > > > > > -                            request);\n> > > > > > > +                            this, request);\n> > > > > > >  }\n> > > > > > >\n> > > > > > >  /*\n> > > > > > > @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n> > > > > > >         return -errno;\n> > > > > > >  }\n> > > > > > >\n> > > > > > > -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > > > > > +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n> > > > > > >  {\n> > > > > > >         /* Wait on all fences before queuing the Request. */\n> > > > > > >         for (int fence : request->fences()) {\n> > > > > > > @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> > > > > > >                 if (ret < 0) {\n> > > > > > >                         LOG(HAL, Error) << \"Failed waiting for fence: \"\n> > > > > > >                                         << fence << \": \" << strerror(-ret);\n> > > > > > > +                       context->requestQueueFailed.emit(request);\n> > > > >\n> > > > > .... Here\n> > > > >\n> > > > > Is your question about stop() related to dispatching messages before\n> > > > > the thread has actually been stopped ? Isn't that been fixed some time\n> > > > > ago by overriding CameraWorker::exec() ?\n> > > > >\n> > > > > Thanks\n> > > > >    j\n> > > > >","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id 9EDC9BDC71\n\tfor <parsemail@patchwork.libcamera.org>;\n\tWed,  8 Sep 2021 07:35:02 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id C50C169170;\n\tWed,  8 Sep 2021 09:35:01 +0200 (CEST)","from mail-ej1-x62a.google.com (mail-ej1-x62a.google.com\n\t[IPv6:2a00:1450:4864:20::62a])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id 35B176024D\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tWed,  8 Sep 2021 09:35:00 +0200 (CEST)","by mail-ej1-x62a.google.com with SMTP id h9so2280892ejs.4\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tWed, 08 Sep 2021 00:35:00 -0700 (PDT)"],"Authentication-Results":"lancelot.ideasonboard.com;\n\tdkim=fail reason=\"signature verification failed\" (1024-bit key;\n\tunprotected) header.d=chromium.org header.i=@chromium.org\n\theader.b=\"hPo1PGQ0\"; dkim-atps=neutral","DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org;\n\ts=google; \n\th=mime-version:references:in-reply-to:from:date:message-id:subject:to\n\t:cc; bh=HdlBQbTITcDI82mhQ+jvsx2r69cXpGk1zTHZd8lSMHs=;\n\tb=hPo1PGQ0RV1AHsIu5rIFIpBPOQ1Hp8jvJEcYzJCKyPnzqqbnQvtEZTkU5RsM2U2E44\n\t+ZgQoARkzEz2RQq1tsuneIS5Ar4acol6nP5NKktMIpgwndZJktC3C8GMXot7Hql+Mk3V\n\tseCMzABwVrz0ct9Y3b/snSVnvFVHRUaIgFORo=","X-Google-DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/relaxed;\n\td=1e100.net; s=20161025;\n\th=x-gm-message-state:mime-version:references:in-reply-to:from:date\n\t:message-id:subject:to:cc;\n\tbh=HdlBQbTITcDI82mhQ+jvsx2r69cXpGk1zTHZd8lSMHs=;\n\tb=AypD42PWMXloMNhjvAsYf4RAEMrvDXGPgl35DVcAklqO7nbQ1DLGtFHjX2Pvc1YYGv\n\tpxxyQFV5d3nnZVSpfSV74t9DLLYysnRm4deGsZCc1lqzYNAAPe6mWtuxTgjsszUamdWK\n\tPx9Koe8tju8H5/QqFX70p5/h84huzpSIxUzRRWtgV4dx9DYHt+2+AXs2iEu/veiL67vl\n\thpanDpsT9OO8ZxIoqGQxfKq0veZ2kGeti4C+49aGk1PR93gMd+y477uCvJwOixmuM6+y\n\tOirVtEO4uC9GcHbjC035ZW/s6Nuw2QB+7zTBp1tApux80w2+d/ls8eN13ZF1rJM9XEU3\n\tGeeA==","X-Gm-Message-State":"AOAM531gHGIcJXzCn/QTHIdzz2o9aUhdeUv+EXzKrOUKr0/Vp0GzHt9I\n\t2QhCvmOtK1GWyggo8WjawHFna2OleCkJwGb9FXKUAq68JPU=","X-Google-Smtp-Source":"ABdhPJwyq2tIyUyqDQe7xoKNR5uI8RdWupalGxKhCUIAlxHUxH+ql68uyVM3tlE1E2N8V7QKTjmCzTpjv8nZuTHnFLI=","X-Received":"by 2002:a17:906:3589:: with SMTP id\n\to9mr2741917ejb.150.1631086499435; \n\tWed, 08 Sep 2021 00:34:59 -0700 (PDT)","MIME-Version":"1.0","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>\n\t<CAO5uPHOBUbOxW7ZEeTxP9uiBg2pFNE1_p4biR+ettTXGBaDdAA@mail.gmail.com>\n\t<20210906162040.v6thmls25tiocwkp@uno.localdomain>\n\t<CAO5uPHNveN4R69zmDk1Ra_-BSuGT3Duud_2EyLOvRhB+58ACaQ@mail.gmail.com>\n\t<20210906170542.ooaptf3vz7gkwulr@uno.localdomain>\n\t<CAO5uPHOx-7W5_SGXWkdEukKRZvo2hV8Zfo8NHkW1jv2aHLNtJA@mail.gmail.com>\n\t<20210907135244.bt6la3asp5qjrqtx@uno.localdomain>","In-Reply-To":"<20210907135244.bt6la3asp5qjrqtx@uno.localdomain>","From":"Hirokazu Honda <hiroh@chromium.org>","Date":"Wed, 8 Sep 2021 16:34:48 +0900","Message-ID":"<CAO5uPHOCvDZPt9shHJ9d=w+_5AGGZJEhu4Rom5rjRHBypQKF=A@mail.gmail.com>","To":"Jacopo Mondi <jacopo@jmondi.org>","Content-Type":"text/plain; charset=\"UTF-8\"","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Cc":"libcamera devel <libcamera-devel@lists.libcamera.org>","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}},{"id":19631,"web_url":"https://patchwork.libcamera.org/comment/19631/","msgid":"<a2febd40-ec50-e629-4474-64b086cb24c2@ideasonboard.com>","date":"2021-09-13T04:53:21","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":86,"url":"https://patchwork.libcamera.org/api/people/86/","name":"Umang Jain","email":"umang.jain@ideasonboard.com"},"content":"Hi Jacopo,\n\nOn 9/6/21 7:31 PM, Jacopo Mondi wrote:\n> The CameraDevice class stores one CaptureRequestDescriptor instance\n> for each capture request queued to the Camera. Such descriptors are\n> cleared when the Camera completes the Request in the\n> CameraDevice::requestComplete() slot.\n>\n> The CameraDevice class relies on an internal worker to off-load\n> waiting on synchronization fences, and does unconditionally store\n> the descriptor in the descriptors_ class member map to handle its\n> lifetime.\n>\n> If waiting on a fence fails, the created descriptor remains in the\n> descriptors_ map until the next call to stop() or close(), as\n> requestComplete() will never be called, and the camera\n> service is never notified about the queueing failure.\n>\n> Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> waiting on a fence has failed, by installing a new requestQueueFailed\n> Signal.\n>\n> The Signal is emitted by the CameraWorker internal worker if waiting on\n> a fence has failed. The CameraDevice is augmented with a slot connected\n> to the Signal that removes the descriptor from the map and notifies the\n> camera framework about the failure.\n>\n> Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n\n\nReviewed-by: Umang Jain <umang.jain@ideasonboard.com>\n\n> ---\n>   src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n>   src/android/camera_device.h   |  1 +\n>   src/android/camera_worker.cpp | 10 ++++++----\n>   src/android/camera_worker.h   |  9 +++++++--\n>   4 files changed, 35 insertions(+), 7 deletions(-)\n>\n> diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> index a0ea138d9499..0ce9b699bfae 100644\n> --- a/src/android/camera_device.cpp\n> +++ b/src/android/camera_device.cpp\n> @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n>   \t * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n>   \t * lifetime to the descriptor.\n>   \t */\n> -\trequest_ = std::make_unique<CaptureRequest>(camera);\n> +\trequest_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n>   }\n>   \n>   /*\n> @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n>   \t: id_(id), state_(State::Stopped), camera_(std::move(camera)),\n>   \t  facing_(CAMERA_FACING_FRONT), orientation_(0)\n>   {\n> +\tworker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n>   \tcamera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n>   \n>   \tmaker_ = \"libcamera\";\n> @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n>   \treturn 0;\n>   }\n>   \n> +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> +{\n> +\t/*\n> +\t * Remove the descriptor from the map if the worker has failed\n> +\t * to process it and notify the camera service about the failure.\n> +\t */\n> +\tMutexLocker descriptorsLock(descriptorsMutex_);\n> +\tauto it = descriptors_.find(request->cookie());\n> +\tif (it == descriptors_.end()) {\n> +\t\tLOG(HAL, Fatal)\n> +\t\t\t<< \"Unknown request: \" << request->cookie();\n> +\t\treturn;\n> +\t}\n> +\n> +\tdescriptors_.extract(it);\n> +\n> +\tabortRequest(request->camera3Request());\n> +}\n> +\n>   void CameraDevice::requestComplete(Request *request)\n>   {\n>   \tdecltype(descriptors_)::node_type node;\n> diff --git a/src/android/camera_device.h b/src/android/camera_device.h\n> index 54c4cb9ab499..65456ecca7e3 100644\n> --- a/src/android/camera_device.h\n> +++ b/src/android/camera_device.h\n> @@ -83,6 +83,7 @@ private:\n>   \t\tstd::vector<std::unique_ptr<libcamera::FrameBuffer>> frameBuffers_;\n>   \t\tCameraMetadata settings_;\n>   \t\tstd::unique_ptr<CaptureRequest> request_;\n> +\t\tconst camera3_capture_request_t *camera3Request_;\n>   \t};\n>   \n>   \tenum class State {\n> diff --git a/src/android/camera_worker.cpp b/src/android/camera_worker.cpp\n> index 98dddd9eb13b..7f39fab01638 100644\n> --- a/src/android/camera_worker.cpp\n> +++ b/src/android/camera_worker.cpp\n> @@ -27,8 +27,9 @@ LOG_DECLARE_CATEGORY(HAL)\n>    * by the CameraWorker which queues it to the libcamera::Camera after handling\n>    * fences.\n>    */\n> -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> -\t: camera_(camera)\n> +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> +\t\t\t       const camera3_capture_request_t *camera3Request)\n> +\t: camera_(camera), camera3Request_(camera3Request)\n>   {\n>   \trequest_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n>   }\n> @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n>   {\n>   \t/* Async process the request on the worker which runs its own thread. */\n>   \tworker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> -\t\t\t     request);\n> +\t\t\t     this, request);\n>   }\n>   \n>   /*\n> @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n>   \treturn -errno;\n>   }\n>   \n> -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n>   {\n>   \t/* Wait on all fences before queuing the Request. */\n>   \tfor (int fence : request->fences()) {\n> @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n>   \t\tif (ret < 0) {\n>   \t\t\tLOG(HAL, Error) << \"Failed waiting for fence: \"\n>   \t\t\t\t\t<< fence << \": \" << strerror(-ret);\n> +\t\t\tcontext->requestQueueFailed.emit(request);\n>   \t\t\treturn;\n>   \t\t}\n>   \t}\n> diff --git a/src/android/camera_worker.h b/src/android/camera_worker.h\n> index 67ae50bd9288..86f20f741e54 100644\n> --- a/src/android/camera_worker.h\n> +++ b/src/android/camera_worker.h\n> @@ -10,6 +10,7 @@\n>   #include <memory>\n>   \n>   #include <libcamera/base/object.h>\n> +#include <libcamera/base/signal.h>\n>   #include <libcamera/base/thread.h>\n>   \n>   #include <libcamera/camera.h>\n> @@ -22,7 +23,8 @@ class CameraDevice;\n>   class CaptureRequest\n>   {\n>   public:\n> -\tCaptureRequest(libcamera::Camera *camera);\n> +\tCaptureRequest(libcamera::Camera *camera,\n> +\t\t       const camera3_capture_request_t *camera3Request);\n>   \n>   \tconst std::vector<int> &fences() const { return acquireFences_; }\n>   \tlibcamera::ControlList &controls() { return request_->controls(); }\n> @@ -31,6 +33,7 @@ public:\n>   \t\treturn request_->metadata();\n>   \t}\n>   \tunsigned long cookie() const { return request_->cookie(); }\n> +\tconst camera3_capture_request_t *camera3Request() const { return camera3Request_; }\n>   \n>   \tvoid addBuffer(libcamera::Stream *stream,\n>   \t\t       libcamera::FrameBuffer *buffer, int fence);\n> @@ -40,6 +43,7 @@ private:\n>   \tlibcamera::Camera *camera_;\n>   \tstd::vector<int> acquireFences_;\n>   \tstd::unique_ptr<libcamera::Request> request_;\n> +\tconst camera3_capture_request_t *camera3Request_;\n>   };\n>   \n>   class CameraWorker : private libcamera::Thread\n> @@ -51,6 +55,7 @@ public:\n>   \tvoid stop();\n>   \n>   \tvoid queueRequest(CaptureRequest *request);\n> +\tlibcamera::Signal<CaptureRequest *> requestQueueFailed;\n>   \n>   protected:\n>   \tvoid run() override;\n> @@ -59,7 +64,7 @@ private:\n>   \tclass Worker : public libcamera::Object\n>   \t{\n>   \tpublic:\n> -\t\tvoid processRequest(CaptureRequest *request);\n> +\t\tvoid processRequest(CameraWorker *context, CaptureRequest *request);\n>   \n>   \tprivate:\n>   \t\tint waitFence(int fence);","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id C8488BDC71\n\tfor <parsemail@patchwork.libcamera.org>;\n\tMon, 13 Sep 2021 04:53:28 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id 29E6269184;\n\tMon, 13 Sep 2021 06:53:28 +0200 (CEST)","from perceval.ideasonboard.com (perceval.ideasonboard.com\n\t[213.167.242.64])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id D81C160131\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tMon, 13 Sep 2021 06:53:26 +0200 (CEST)","from [192.168.1.104] (unknown [103.251.226.152])\n\tby perceval.ideasonboard.com (Postfix) with ESMTPSA id 99B269E;\n\tMon, 13 Sep 2021 06:53:25 +0200 (CEST)"],"Authentication-Results":"lancelot.ideasonboard.com; dkim=pass (1024-bit key;\n\tunprotected) header.d=ideasonboard.com header.i=@ideasonboard.com\n\theader.b=\"Qzsv954u\"; dkim-atps=neutral","DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/simple; d=ideasonboard.com;\n\ts=mail; t=1631508806;\n\tbh=Wc3Q1Vc3LEuNZcJ/DwBA5QRRYW1JtdPm0PEOv4Je9BI=;\n\th=Subject:To:References:From:Date:In-Reply-To:From;\n\tb=Qzsv954uLZSv6jOVDIjUljcu32Uteelzj07o9P6RSn2iCG26eqbhIMm3QnzLefuDY\n\tRchH9yk+twFmugUYY6LuxODftO24AVf3oG3otqS9VDuLFKrVMezuhhFlQfIU9F4ZNb\n\tH+H777O2EgekDNXvZLRHQL5q1fVgAcUqBq9ZD1Bw=","To":"Jacopo Mondi <jacopo@jmondi.org>, libcamera-devel@lists.libcamera.org","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>","From":"Umang Jain <umang.jain@ideasonboard.com>","Message-ID":"<a2febd40-ec50-e629-4474-64b086cb24c2@ideasonboard.com>","Date":"Mon, 13 Sep 2021 10:23:21 +0530","User-Agent":"Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101\n\tThunderbird/78.10.2","MIME-Version":"1.0","In-Reply-To":"<20210906140152.636883-5-jacopo@jmondi.org>","Content-Type":"text/plain; charset=utf-8; format=flowed","Content-Transfer-Encoding":"7bit","Content-Language":"en-US","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}},{"id":19701,"web_url":"https://patchwork.libcamera.org/comment/19701/","msgid":"<YUFrvSJps+tqlSrl@pendragon.ideasonboard.com>","date":"2021-09-15T03:42:53","subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","submitter":{"id":2,"url":"https://patchwork.libcamera.org/api/people/2/","name":"Laurent Pinchart","email":"laurent.pinchart@ideasonboard.com"},"content":"Hi Jacopo,\n\nThank you for the patch.\n\nOn Mon, Sep 06, 2021 at 04:01:51PM +0200, Jacopo Mondi wrote:\n> The CameraDevice class stores one CaptureRequestDescriptor instance\n> for each capture request queued to the Camera. Such descriptors are\n> cleared when the Camera completes the Request in the\n> CameraDevice::requestComplete() slot.\n> \n> The CameraDevice class relies on an internal worker to off-load\n> waiting on synchronization fences, and does unconditionally store\n> the descriptor in the descriptors_ class member map to handle its\n> lifetime.\n> \n> If waiting on a fence fails, the created descriptor remains in the\n> descriptors_ map until the next call to stop() or close(), as\n> requestComplete() will never be called, and the camera\n> service is never notified about the queueing failure.\n> \n> Fix that by allowing the CameraWorker to notify to the CameraDevice if\n> waiting on a fence has failed, by installing a new requestQueueFailed\n> Signal.\n> \n> The Signal is emitted by the CameraWorker internal worker if waiting on\n> a fence has failed. The CameraDevice is augmented with a slot connected\n> to the Signal that removes the descriptor from the map and notifies the\n> camera framework about the failure.\n> \n> Signed-off-by: Jacopo Mondi <jacopo@jmondi.org>\n> ---\n>  src/android/camera_device.cpp | 22 +++++++++++++++++++++-\n>  src/android/camera_device.h   |  1 +\n>  src/android/camera_worker.cpp | 10 ++++++----\n>  src/android/camera_worker.h   |  9 +++++++--\n>  4 files changed, 35 insertions(+), 7 deletions(-)\n> \n> diff --git a/src/android/camera_device.cpp b/src/android/camera_device.cpp\n> index a0ea138d9499..0ce9b699bfae 100644\n> --- a/src/android/camera_device.cpp\n> +++ b/src/android/camera_device.cpp\n> @@ -244,7 +244,7 @@ CameraDevice::Camera3RequestDescriptor::Camera3RequestDescriptor(\n>  \t * Create the CaptureRequest, stored as a unique_ptr<> to tie its\n>  \t * lifetime to the descriptor.\n>  \t */\n> -\trequest_ = std::make_unique<CaptureRequest>(camera);\n> +\trequest_ = std::make_unique<CaptureRequest>(camera, camera3Request);\n>  }\n>  \n>  /*\n> @@ -264,6 +264,7 @@ CameraDevice::CameraDevice(unsigned int id, std::shared_ptr<Camera> camera)\n>  \t: id_(id), state_(State::Stopped), camera_(std::move(camera)),\n>  \t  facing_(CAMERA_FACING_FRONT), orientation_(0)\n>  {\n> +\tworker_.requestQueueFailed.connect(this, &CameraDevice::requestQueueFailed);\n>  \tcamera_->requestCompleted.connect(this, &CameraDevice::requestComplete);\n>  \n>  \tmaker_ = \"libcamera\";\n> @@ -1060,6 +1061,25 @@ int CameraDevice::processCaptureRequest(camera3_capture_request_t *camera3Reques\n>  \treturn 0;\n>  }\n>  \n> +void CameraDevice::requestQueueFailed(CaptureRequest *request)\n> +{\n> +\t/*\n> +\t * Remove the descriptor from the map if the worker has failed\n> +\t * to process it and notify the camera service about the failure.\n> +\t */\n> +\tMutexLocker descriptorsLock(descriptorsMutex_);\n> +\tauto it = descriptors_.find(request->cookie());\n> +\tif (it == descriptors_.end()) {\n> +\t\tLOG(HAL, Fatal)\n> +\t\t\t<< \"Unknown request: \" << request->cookie();\n> +\t\treturn;\n> +\t}\n> +\n> +\tdescriptors_.extract(it);\n> +\n> +\tabortRequest(request->camera3Request());\n\nI'd like to avoid adding the camera3_capture_request_t pointer to\nCaptureRequest. Could we modify CameraDevice::abortRequest() to take a\nCamera3RequestDescriptor (which we have in\nCameraDevice::processCaptureRequest() where abortRequest() is called\ntoday), and get it here from the descriptors_ map ?\n\nI also wonder if we could somehow set the Request status to\nRequestCancelled and call CameraDevice::requestComplete(). The rationale\nis that we'll eventually move the fence handling to the libcamera core,\nso the callback to the HAL will be requestComplete() then.\n\nThere's an additional problem, if a wait operation fails, you'll end up\nsignaling the request completion to the camera service out-of-order. I\nwonder if it wouldn't be better to rebase this on Umang's ongoing work\nthat will include a pending request queue to address this issue.\n\n> +}\n> +\n>  void CameraDevice::requestComplete(Request *request)\n>  {\n>  \tdecltype(descriptors_)::node_type node;\n> diff --git a/src/android/camera_device.h b/src/android/camera_device.h\n> index 54c4cb9ab499..65456ecca7e3 100644\n> --- a/src/android/camera_device.h\n> +++ b/src/android/camera_device.h\n> @@ -83,6 +83,7 @@ private:\n>  \t\tstd::vector<std::unique_ptr<libcamera::FrameBuffer>> frameBuffers_;\n>  \t\tCameraMetadata settings_;\n>  \t\tstd::unique_ptr<CaptureRequest> request_;\n> +\t\tconst camera3_capture_request_t *camera3Request_;\n>  \t};\n>  \n>  \tenum class State {\n> diff --git a/src/android/camera_worker.cpp b/src/android/camera_worker.cpp\n> index 98dddd9eb13b..7f39fab01638 100644\n> --- a/src/android/camera_worker.cpp\n> +++ b/src/android/camera_worker.cpp\n> @@ -27,8 +27,9 @@ LOG_DECLARE_CATEGORY(HAL)\n>   * by the CameraWorker which queues it to the libcamera::Camera after handling\n>   * fences.\n>   */\n> -CaptureRequest::CaptureRequest(libcamera::Camera *camera)\n> -\t: camera_(camera)\n> +CaptureRequest::CaptureRequest(libcamera::Camera *camera,\n> +\t\t\t       const camera3_capture_request_t *camera3Request)\n> +\t: camera_(camera), camera3Request_(camera3Request)\n>  {\n>  \trequest_ = camera_->createRequest(reinterpret_cast<uint64_t>(this));\n>  }\n> @@ -77,7 +78,7 @@ void CameraWorker::queueRequest(CaptureRequest *request)\n>  {\n>  \t/* Async process the request on the worker which runs its own thread. */\n>  \tworker_.invokeMethod(&Worker::processRequest, ConnectionTypeQueued,\n> -\t\t\t     request);\n> +\t\t\t     this, request);\n\nWe could also pass the CameraWorker pointer to the CameraWorker::Worker\nconstructor and store it (not sure it would be better actually). Another\noption is to store the Signal in CameraWorker::Worker, with a\nCameraWorker::requestQueueFailed() accessor function that returns a\nreference to the signal.\n\n>  }\n>  \n>  /*\n> @@ -109,7 +110,7 @@ int CameraWorker::Worker::waitFence(int fence)\n>  \treturn -errno;\n>  }\n>  \n> -void CameraWorker::Worker::processRequest(CaptureRequest *request)\n> +void CameraWorker::Worker::processRequest(CameraWorker *context, CaptureRequest *request)\n>  {\n>  \t/* Wait on all fences before queuing the Request. */\n>  \tfor (int fence : request->fences()) {\n> @@ -121,6 +122,7 @@ void CameraWorker::Worker::processRequest(CaptureRequest *request)\n>  \t\tif (ret < 0) {\n>  \t\t\tLOG(HAL, Error) << \"Failed waiting for fence: \"\n>  \t\t\t\t\t<< fence << \": \" << strerror(-ret);\n> +\t\t\tcontext->requestQueueFailed.emit(request);\n>  \t\t\treturn;\n>  \t\t}\n>  \t}\n> diff --git a/src/android/camera_worker.h b/src/android/camera_worker.h\n> index 67ae50bd9288..86f20f741e54 100644\n> --- a/src/android/camera_worker.h\n> +++ b/src/android/camera_worker.h\n> @@ -10,6 +10,7 @@\n>  #include <memory>\n>  \n>  #include <libcamera/base/object.h>\n> +#include <libcamera/base/signal.h>\n>  #include <libcamera/base/thread.h>\n>  \n>  #include <libcamera/camera.h>\n> @@ -22,7 +23,8 @@ class CameraDevice;\n>  class CaptureRequest\n>  {\n>  public:\n> -\tCaptureRequest(libcamera::Camera *camera);\n> +\tCaptureRequest(libcamera::Camera *camera,\n> +\t\t       const camera3_capture_request_t *camera3Request);\n>  \n>  \tconst std::vector<int> &fences() const { return acquireFences_; }\n>  \tlibcamera::ControlList &controls() { return request_->controls(); }\n> @@ -31,6 +33,7 @@ public:\n>  \t\treturn request_->metadata();\n>  \t}\n>  \tunsigned long cookie() const { return request_->cookie(); }\n> +\tconst camera3_capture_request_t *camera3Request() const { return camera3Request_; }\n>  \n>  \tvoid addBuffer(libcamera::Stream *stream,\n>  \t\t       libcamera::FrameBuffer *buffer, int fence);\n> @@ -40,6 +43,7 @@ private:\n>  \tlibcamera::Camera *camera_;\n>  \tstd::vector<int> acquireFences_;\n>  \tstd::unique_ptr<libcamera::Request> request_;\n> +\tconst camera3_capture_request_t *camera3Request_;\n>  };\n>  \n>  class CameraWorker : private libcamera::Thread\n> @@ -51,6 +55,7 @@ public:\n>  \tvoid stop();\n>  \n>  \tvoid queueRequest(CaptureRequest *request);\n> +\tlibcamera::Signal<CaptureRequest *> requestQueueFailed;\n>  \n>  protected:\n>  \tvoid run() override;\n> @@ -59,7 +64,7 @@ private:\n>  \tclass Worker : public libcamera::Object\n>  \t{\n>  \tpublic:\n> -\t\tvoid processRequest(CaptureRequest *request);\n> +\t\tvoid processRequest(CameraWorker *context, CaptureRequest *request);\n>  \n>  \tprivate:\n>  \t\tint waitFence(int fence);","headers":{"Return-Path":"<libcamera-devel-bounces@lists.libcamera.org>","X-Original-To":"parsemail@patchwork.libcamera.org","Delivered-To":"parsemail@patchwork.libcamera.org","Received":["from lancelot.ideasonboard.com (lancelot.ideasonboard.com\n\t[92.243.16.209])\n\tby patchwork.libcamera.org (Postfix) with ESMTPS id 32F42BDB1D\n\tfor <parsemail@patchwork.libcamera.org>;\n\tWed, 15 Sep 2021 03:43:21 +0000 (UTC)","from lancelot.ideasonboard.com (localhost [IPv6:::1])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTP id 6788969189;\n\tWed, 15 Sep 2021 05:43:20 +0200 (CEST)","from perceval.ideasonboard.com (perceval.ideasonboard.com\n\t[213.167.242.64])\n\tby lancelot.ideasonboard.com (Postfix) with ESMTPS id C3BBE60247\n\tfor <libcamera-devel@lists.libcamera.org>;\n\tWed, 15 Sep 2021 05:43:18 +0200 (CEST)","from pendragon.ideasonboard.com (62-78-145-57.bb.dnainternet.fi\n\t[62.78.145.57])\n\tby perceval.ideasonboard.com (Postfix) with ESMTPSA id 44BD124F;\n\tWed, 15 Sep 2021 05:43:18 +0200 (CEST)"],"Authentication-Results":"lancelot.ideasonboard.com; dkim=pass (1024-bit key;\n\tunprotected) header.d=ideasonboard.com header.i=@ideasonboard.com\n\theader.b=\"sEX11GHO\"; dkim-atps=neutral","DKIM-Signature":"v=1; a=rsa-sha256; c=relaxed/simple; d=ideasonboard.com;\n\ts=mail; t=1631677398;\n\tbh=VacNNVr0oujO5R1owLPOqVXyd7NEgQni0rXzXhxQruM=;\n\th=Date:From:To:Cc:Subject:References:In-Reply-To:From;\n\tb=sEX11GHOnXi+7vp7OpBCW0zTiUMfkQaaS6VC2sMxxMb/s+JdbzhHSY6H7V8Tw2ins\n\tePXhz6XXnlCFapGmN9w+nQ7W0kjW2gXnJTITPII8M+vusrYwXcTj11/Bc+jU8lSV1K\n\t0qsJsW7efPRHDUW9WbUksNooTC9XNtGGorG1sS9w=","Date":"Wed, 15 Sep 2021 06:42:53 +0300","From":"Laurent Pinchart <laurent.pinchart@ideasonboard.com>","To":"Jacopo Mondi <jacopo@jmondi.org>","Message-ID":"<YUFrvSJps+tqlSrl@pendragon.ideasonboard.com>","References":"<20210906140152.636883-1-jacopo@jmondi.org>\n\t<20210906140152.636883-5-jacopo@jmondi.org>","MIME-Version":"1.0","Content-Type":"text/plain; charset=utf-8","Content-Disposition":"inline","In-Reply-To":"<20210906140152.636883-5-jacopo@jmondi.org>","Subject":"Re: [libcamera-devel] [PATCH 4/5] android: camera_worker: Notify\n\tfence wait failures","X-BeenThere":"libcamera-devel@lists.libcamera.org","X-Mailman-Version":"2.1.29","Precedence":"list","List-Id":"<libcamera-devel.lists.libcamera.org>","List-Unsubscribe":"<https://lists.libcamera.org/options/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=unsubscribe>","List-Archive":"<https://lists.libcamera.org/pipermail/libcamera-devel/>","List-Post":"<mailto:libcamera-devel@lists.libcamera.org>","List-Help":"<mailto:libcamera-devel-request@lists.libcamera.org?subject=help>","List-Subscribe":"<https://lists.libcamera.org/listinfo/libcamera-devel>,\n\t<mailto:libcamera-devel-request@lists.libcamera.org?subject=subscribe>","Cc":"libcamera-devel@lists.libcamera.org","Errors-To":"libcamera-devel-bounces@lists.libcamera.org","Sender":"\"libcamera-devel\" <libcamera-devel-bounces@lists.libcamera.org>"}}]