13 changes: 6 additions & 7 deletions queue_job/README.rst
@@ -1,7 +1,3 @@
.. image:: https://odoo-community.org/readme-banner-image
:target: https://odoo-community.org/get-involved?utm_source=readme
:alt: Odoo Community Association

=========
Job Queue
=========
@@ -17,7 +13,7 @@ Job Queue
.. |badge1| image:: https://img.shields.io/badge/maturity-Mature-brightgreen.png
:target: https://odoo-community.org/page/development-status
:alt: Mature
.. |badge2| image:: https://img.shields.io/badge/license-LGPL--3-blue.png
.. |badge2| image:: https://img.shields.io/badge/licence-LGPL--3-blue.png
:target: http://www.gnu.org/licenses/lgpl-3.0-standalone.html
:alt: License: LGPL-3
.. |badge3| image:: https://img.shields.io/badge/github-OCA%2Fqueue-lightgray.png?logo=github
@@ -697,10 +693,13 @@ promote its widespread use.
.. |maintainer-guewen| image:: https://github.com/guewen.png?size=40px
    :target: https://github.com/guewen
:alt: guewen
.. |maintainer-sbidoul| image:: https://github.com/sbidoul.png?size=40px
    :target: https://github.com/sbidoul
:alt: sbidoul

Current `maintainer <https://odoo-community.org/page/maintainer-role>`__:
Current `maintainers <https://odoo-community.org/page/maintainer-role>`__:

|maintainer-guewen|
|maintainer-guewen| |maintainer-sbidoul|

This module is part of the `OCA/queue <https://github.com/OCA/queue/tree/16.0/queue_job>`_ project on GitHub.

2 changes: 1 addition & 1 deletion queue_job/__manifest__.py
@@ -29,7 +29,7 @@
},
"installable": True,
"development_status": "Mature",
"maintainers": ["guewen"],
"maintainers": ["guewen", "sbidoul"],
"post_init_hook": "post_init_hook",
"post_load": "post_load",
}
122 changes: 78 additions & 44 deletions queue_job/controllers/main.py
@@ -26,15 +26,48 @@


class RunJobController(http.Controller):
def _try_perform_job(self, env, job):
"""Try to perform the job."""
@classmethod
def _acquire_job(cls, env: api.Environment, job_uuid: str) -> Job | None:
"""Acquire a job for execution.

- make sure it is in ENQUEUED state
- mark it as STARTED and commit the state change
- acquire the job lock

If successful, return the Job instance, otherwise return None. This
function may fail to acquire the job if it is not in the expected state
or if it is already locked by another worker.
"""
env.cr.execute(
"SELECT uuid FROM queue_job WHERE uuid=%s AND state=%s "
"FOR NO KEY UPDATE SKIP LOCKED",
(job_uuid, ENQUEUED),
)
if not env.cr.fetchone():
_logger.warning(
"was requested to run job %s, but it does not exist, "
"or is not in state %s, or is being handled by another worker",
job_uuid,
ENQUEUED,
)
return None
job = Job.load(env, job_uuid)
assert job and job.state == ENQUEUED
job.set_started()
job.store()
env.cr.commit()
job.lock()
if not job.lock():
_logger.warning(
"was requested to run job %s, but it could not be locked",
job_uuid,
)
return None
return job
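A minimal sketch (not part of this patch) of the SELECT ... FOR NO KEY UPDATE SKIP LOCKED behaviour that _acquire_job relies on, using two raw psycopg2 connections; the connection string, the job uuid and the presence of a matching enqueued row are made-up assumptions for illustration only:

    import psycopg2

    DSN = "dbname=odoo user=odoo"  # hypothetical connection string
    JOB_UUID = "00000000-0000-0000-0000-000000000000"  # hypothetical uuid
    ACQUIRE = (
        "SELECT uuid FROM queue_job "
        "WHERE uuid = %s AND state = 'enqueued' "
        "FOR NO KEY UPDATE SKIP LOCKED"
    )

    worker_a = psycopg2.connect(DSN)
    worker_b = psycopg2.connect(DSN)

    with worker_a.cursor() as cr_a:
        cr_a.execute(ACQUIRE, (JOB_UUID,))
        print(cr_a.fetchone())  # worker A sees the row and now holds its lock

        with worker_b.cursor() as cr_b:
            # While A's transaction is open, SKIP LOCKED hides the locked
            # row, so a concurrent worker gets nothing instead of blocking.
            cr_b.execute(ACQUIRE, (JOB_UUID,))
            print(cr_b.fetchone())  # None -> _acquire_job() would return None

Only one of two workers racing for the same job can therefore move it from ENQUEUED to STARTED; the loser gets no row back, logs a warning and returns None.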

@classmethod
def _try_perform_job(cls, env, job):
"""Try to perform the job, mark it done and commit if successful."""
_logger.debug("%s started", job)

job.perform()
# Triggers any stored computed fields before calling 'set_done'
# so that they will be part of the 'exec_time'
@@ -45,18 +78,20 @@ def _try_perform_job(self, env, job):
env.cr.commit()
_logger.debug("%s done", job)

def _enqueue_dependent_jobs(self, env, job):
@classmethod
def _enqueue_dependent_jobs(cls, env, job):
tries = 0
while True:
try:
job.enqueue_waiting()
with job.env.cr.savepoint():
job.enqueue_waiting()
except OperationalError as err:
# Automatically retry the typical transaction serialization
# errors
if err.pgcode not in PG_CONCURRENCY_ERRORS_TO_RETRY:
raise
if tries >= DEPENDS_MAX_TRIES_ON_CONCURRENCY_FAILURE:
_logger.info(
_logger.error(
"%s, maximum number of tries reached to update dependencies",
errorcodes.lookup(err.pgcode),
)
@@ -74,17 +109,8 @@ def _enqueue_dependent_jobs(self, env, job):
else:
break
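The loop above is an instance of a generic retry-on-concurrency-failure pattern. A hedged stand-alone sketch of that pattern follows; RETRYABLE_PGCODES and MAX_TRIES are stand-ins for the module's own PG_CONCURRENCY_ERRORS_TO_RETRY and DEPENDS_MAX_TRIES_ON_CONCURRENCY_FAILURE, and cr is assumed to be an Odoo cursor providing savepoint():

    from psycopg2 import OperationalError, errorcodes

    RETRYABLE_PGCODES = {  # stand-in for PG_CONCURRENCY_ERRORS_TO_RETRY
        errorcodes.SERIALIZATION_FAILURE,
        errorcodes.DEADLOCK_DETECTED,
    }
    MAX_TRIES = 5  # stand-in for DEPENDS_MAX_TRIES_ON_CONCURRENCY_FAILURE

    def run_with_concurrency_retry(cr, action):
        """Re-run `action` when PostgreSQL reports a concurrency failure.

        The savepoint confines a failed attempt so the surrounding
        transaction stays usable for the next try.
        """
        tries = 0
        while True:
            try:
                with cr.savepoint():
                    action()
            except OperationalError as err:
                if err.pgcode not in RETRYABLE_PGCODES:
                    raise
                tries += 1
                if tries > MAX_TRIES:
                    # give up; the caller decides how to report the failure
                    return False
            else:
                return True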

@http.route(
"/queue_job/runjob",
type="http",
auth="none",
save_session=False,
readonly=False,
)
def runjob(self, db, job_uuid, **kw):
http.request.session.db = db
env = http.request.env(user=SUPERUSER_ID)

@classmethod
def _runjob(cls, env: api.Environment, job: Job) -> None:
def retry_postpone(job, message, seconds=None):
job.env.clear()
with registry(job.env.cr.dbname).cursor() as new_cr:
@@ -93,26 +119,9 @@ def retry_postpone(job, message, seconds=None):
job.set_pending(reset_retry=False)
job.store()

# ensure the job to run is in the correct state and lock the record
env.cr.execute(
"SELECT state FROM queue_job WHERE uuid=%s AND state=%s FOR UPDATE",
(job_uuid, ENQUEUED),
)
if not env.cr.fetchone():
_logger.warning(
"was requested to run job %s, but it does not exist, "
"or is not in state %s",
job_uuid,
ENQUEUED,
)
return ""

job = Job.load(env, job_uuid)
assert job and job.state == ENQUEUED

try:
try:
self._try_perform_job(env, job)
cls._try_perform_job(env, job)
except OperationalError as err:
# Automatically retry the typical transaction serialization
# errors
@@ -141,7 +150,6 @@ def retry_postpone(job, message, seconds=None):
# traceback in the logs we should have the traceback when all
# retries are exhausted
env.cr.rollback()
return ""

except (FailedJobError, Exception) as orig_exception:
buff = StringIO()
@@ -151,19 +159,18 @@ def retry_postpone(job, message, seconds=None):
job.env.clear()
with registry(job.env.cr.dbname).cursor() as new_cr:
job.env = job.env(cr=new_cr)
vals = self._get_failure_values(job, traceback_txt, orig_exception)
vals = cls._get_failure_values(job, traceback_txt, orig_exception)
job.set_failed(**vals)
job.store()
buff.close()
raise

_logger.debug("%s enqueue depends started", job)
self._enqueue_dependent_jobs(env, job)
cls._enqueue_dependent_jobs(env, job)
_logger.debug("%s enqueue depends done", job)

return ""

def _get_failure_values(self, job, traceback_txt, orig_exception):
@classmethod
def _get_failure_values(cls, job, traceback_txt, orig_exception):
"""Collect relevant data from exception."""
exception_name = orig_exception.__class__.__name__
if hasattr(orig_exception, "__module__"):
@@ -177,6 +184,22 @@ def _get_failure_values(self, job, traceback_txt, orig_exception):
"exc_message": exc_message,
}

@http.route(
"/queue_job/runjob",
type="http",
auth="none",
save_session=False,
readonly=False,
)
def runjob(self, db, job_uuid, **kw):
http.request.session.db = db
env = http.request.env(user=SUPERUSER_ID)
job = self._acquire_job(env, job_uuid)
if not job:
return ""
self._runjob(env, job)
return ""

# flake8: noqa: C901
@http.route("/queue_job/create_test_job", type="http", auth="user")
def create_test_job(
@@ -187,6 +210,7 @@ def create_test_job(
description="Test job",
size=1,
failure_rate=0,
job_duration=0,
):
"""Create test jobs

@@ -207,6 +231,12 @@ def create_test_job(
except (ValueError, TypeError):
failure_rate = 0

if job_duration is not None:
try:
job_duration = float(job_duration)
except (ValueError, TypeError):
job_duration = 0

if not (0 <= failure_rate <= 1):
raise BadRequest("failure_rate must be between 0 and 1")

@@ -235,6 +265,7 @@ def create_test_job(
channel=channel,
description=description,
failure_rate=failure_rate,
job_duration=job_duration,
)

if size > 1:
@@ -245,6 +276,7 @@ def create_test_job(
channel=channel,
description=description,
failure_rate=failure_rate,
job_duration=job_duration,
)
return ""

@@ -256,6 +288,7 @@ def _create_single_test_job(
description="Test job",
size=1,
failure_rate=0,
job_duration=0,
):
delayed = (
http.request.env["queue.job"]
@@ -265,7 +298,7 @@ def _create_single_test_job(
channel=channel,
description=description,
)
._test_job(failure_rate=failure_rate)
._test_job(failure_rate=failure_rate, job_duration=job_duration)
)
return "job uuid: %s" % (delayed.db_record().uuid,)

@@ -279,6 +312,7 @@ def _create_graph_test_jobs(
channel=None,
description="Test job",
failure_rate=0,
job_duration=0,
):
model = http.request.env["queue.job"]
current_count = 0
@@ -301,7 +335,7 @@ def _create_graph_test_jobs(
max_retries=max_retries,
channel=channel,
description="%s #%d" % (description, current_count),
)._test_job(failure_rate=failure_rate)
)._test_job(failure_rate=failure_rate, job_duration=job_duration)
)

grouping = random.choice(possible_grouping_methods)
23 changes: 9 additions & 14 deletions queue_job/job.py
@@ -236,7 +236,7 @@ def load_many(cls, env, job_uuids):
recordset = cls.db_records_from_uuids(env, job_uuids)
return {cls._load_from_db_record(record) for record in recordset}

def add_lock_record(self):
def add_lock_record(self) -> None:
"""
Create row in db to be locked while the job is being performed.
"""
@@ -256,13 +256,11 @@ def add_lock_record(self):
[self.uuid],
)

def lock(self):
"""
Lock row of job that is being performed
def lock(self) -> bool:
"""Lock row of job that is being performed.

If a job cannot be locked,
it means that the job wasn't started,
a RetryableJobError is thrown.
Return False if a job cannot be locked: it means that the job is not in
STARTED state or is already locked by another worker.
"""
self.env.cr.execute(
"""
@@ -278,18 +276,15 @@ def lock(self):
queue_job
WHERE
uuid = %s
AND state='started'
AND state = %s
)
FOR UPDATE;
FOR NO KEY UPDATE SKIP LOCKED;
""",
[self.uuid],
[self.uuid, STARTED],
)

# 1 job should be locked
if 1 != len(self.env.cr.fetchall()):
raise RetryableJobError(
f"Trying to lock job that wasn't started, uuid: {self.uuid}"
)
return bool(self.env.cr.fetchall())
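With this change, lock() reports failure through its return value instead of raising RetryableJobError. A hypothetical caller (sketch only, not part of the patch) would adapt like this:

    def perform_if_locked(job):
        # `job` is assumed to be a loaded queue_job Job in STARTED state.
        if not job.lock():
            # Not STARTED, or another worker already holds the lock row.
            return False
        job.perform()
        return True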

@classmethod
def _load_from_db_record(cls, job_db_record):
43 changes: 26 additions & 17 deletions queue_job/jobrunner/runner.py
@@ -361,23 +361,26 @@ def _query_requeue_dead_jobs(self):
ELSE exc_info
END)
WHERE
id in (
SELECT
queue_job_id
FROM
queue_job_lock
WHERE
queue_job_id in (
SELECT
id
FROM
queue_job
WHERE
state IN ('enqueued','started')
AND date_enqueued <
(now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
)
FOR UPDATE SKIP LOCKED
state IN ('enqueued','started')
AND date_enqueued < (now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
AND (
id in (
SELECT
queue_job_id
FROM
queue_job_lock
WHERE
queue_job_lock.queue_job_id = queue_job.id
FOR NO KEY UPDATE SKIP LOCKED
)
OR NOT EXISTS (
SELECT
1
FROM
queue_job_lock
WHERE
queue_job_lock.queue_job_id = queue_job.id
)
)
RETURNING uuid
"""
@@ -400,6 +403,12 @@ def requeue_dead_jobs(self):
However, when the Odoo server crashes or is otherwise force-stopped,
running jobs are interrupted while the runner has no chance to know
they have been aborted.

This also handles orphaned jobs (enqueued but never started, no lock).
This edge case occurs when the runner marks a job as 'enqueued'
but the HTTP request to start the job never reaches the Odoo server
(e.g., due to server shutdown/crash between setting enqueued and
the controller receiving the request).
"""

with closing(self.conn.cursor()) as cr:
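For readability, the requeue condition implemented by the new SQL in _query_requeue_dead_jobs can be restated as a small pure-Python predicate; this is a sketch only, and the parameter names are invented for illustration:

    def should_requeue(state, seconds_since_enqueued, has_lock_row, lock_row_is_free):
        """Pure-Python restatement of the requeue condition (sketch only)."""
        stale = state in ("enqueued", "started") and seconds_since_enqueued > 10
        # Case 1: a lock row exists but nobody holds it -> the worker that
        # started the job has died.
        # Case 2: no lock row at all -> the job was enqueued but the run
        # request never reached a worker (the orphan case described above).
        worker_gone = lock_row_is_free if has_lock_row else True
        return stale and worker_gone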