13 changes: 6 additions & 7 deletions queue_job/README.rst
@@ -1,7 +1,3 @@
.. image:: https://odoo-community.org/readme-banner-image
:target: https://odoo-community.org/get-involved?utm_source=readme
:alt: Odoo Community Association

=========
Job Queue
=========
@@ -17,7 +13,7 @@ Job Queue
.. |badge1| image:: https://img.shields.io/badge/maturity-Mature-brightgreen.png
:target: https://odoo-community.org/page/development-status
:alt: Mature
.. |badge2| image:: https://img.shields.io/badge/license-LGPL--3-blue.png
.. |badge2| image:: https://img.shields.io/badge/licence-LGPL--3-blue.png
:target: http://www.gnu.org/licenses/lgpl-3.0-standalone.html
:alt: License: LGPL-3
.. |badge3| image:: https://img.shields.io/badge/github-OCA%2Fqueue-lightgray.png?logo=github
@@ -697,10 +693,13 @@ promote its widespread use.
.. |maintainer-guewen| image:: https://github.com/guewen.png?size=40px
    :target: https://github.com/guewen
:alt: guewen
.. |maintainer-sbidoul| image:: https://github.com/sbidoul.png?size=40px
    :target: https://github.com/sbidoul
:alt: sbidoul

Current `maintainer <https://odoo-community.org/page/maintainer-role>`__:
Current `maintainers <https://odoo-community.org/page/maintainer-role>`__:

|maintainer-guewen|
|maintainer-guewen| |maintainer-sbidoul|

This module is part of the `OCA/queue <https://github.com/OCA/queue/tree/16.0/queue_job>`_ project on GitHub.

2 changes: 1 addition & 1 deletion queue_job/__manifest__.py
@@ -29,7 +29,7 @@
},
"installable": True,
"development_status": "Mature",
"maintainers": ["guewen"],
"maintainers": ["guewen", "sbidoul"],
"post_init_hook": "post_init_hook",
"post_load": "post_load",
}
122 changes: 78 additions & 44 deletions queue_job/controllers/main.py
@@ -26,15 +26,48 @@


class RunJobController(http.Controller):
def _try_perform_job(self, env, job):
"""Try to perform the job."""
@classmethod
def _acquire_job(cls, env: api.Environment, job_uuid: str) -> Job | None:
"""Acquire a job for execution.

- make sure it is in ENQUEUED state
- mark it as STARTED and commit the state change
- acquire the job lock

If successful, return the Job instance, otherwise return None. This
function may fail to acquire the job if it is not in the expected state
or if it is already locked by another worker.
"""
env.cr.execute(
"SELECT uuid FROM queue_job WHERE uuid=%s AND state=%s "
"FOR NO KEY UPDATE SKIP LOCKED",
(job_uuid, ENQUEUED),
)
if not env.cr.fetchone():
_logger.warning(
"was requested to run job %s, but it does not exist, "
"or is not in state %s, or is being handled by another worker",
job_uuid,
ENQUEUED,
)
return None
job = Job.load(env, job_uuid)
assert job and job.state == ENQUEUED
job.set_started()
job.store()
env.cr.commit()
job.lock()
if not job.lock():
_logger.warning(
"was requested to run job %s, but it could not be locked",
job_uuid,
)
return None
return job
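A minimal sketch (not part of this patch) of the SELECT ... FOR NO KEY UPDATE SKIP LOCKED behaviour that _acquire_job relies on, using two raw psycopg2 connections; the connection string, the job uuid and the presence of a matching enqueued row are made-up assumptions for illustration only:

    import psycopg2

    DSN = "dbname=odoo user=odoo"  # hypothetical connection string
    JOB_UUID = "00000000-0000-0000-0000-000000000000"  # hypothetical uuid
    ACQUIRE = (
        "SELECT uuid FROM queue_job "
        "WHERE uuid = %s AND state = 'enqueued' "
        "FOR NO KEY UPDATE SKIP LOCKED"
    )

    worker_a = psycopg2.connect(DSN)
    worker_b = psycopg2.connect(DSN)

    with worker_a.cursor() as cr_a:
        cr_a.execute(ACQUIRE, (JOB_UUID,))
        print(cr_a.fetchone())  # worker A sees the row and now holds its lock

        with worker_b.cursor() as cr_b:
            # While A's transaction is open, SKIP LOCKED hides the locked
            # row, so a concurrent worker gets nothing instead of blocking.
            cr_b.execute(ACQUIRE, (JOB_UUID,))
            print(cr_b.fetchone())  # None -> _acquire_job() would return None

Only one of two workers racing for the same job can therefore move it from ENQUEUED to STARTED; the loser gets no row back, logs a warning and returns None.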

@classmethod
def _try_perform_job(cls, env, job):
"""Try to perform the job, mark it done and commit if successful."""
_logger.debug("%s started", job)

job.perform()
# Triggers any stored computed fields before calling 'set_done'
# so that they will be part of the 'exec_time'
@@ -45,18 +78,20 @@ def _try_perform_job(self, env, job):
env.cr.commit()
_logger.debug("%s done", job)

def _enqueue_dependent_jobs(self, env, job):
@classmethod
def _enqueue_dependent_jobs(cls, env, job):
tries = 0
while True:
try:
job.enqueue_waiting()
with job.env.cr.savepoint():
job.enqueue_waiting()
except OperationalError as err:
# Automatically retry the typical transaction serialization
# errors
if err.pgcode not in PG_CONCURRENCY_ERRORS_TO_RETRY:
raise
if tries >= DEPENDS_MAX_TRIES_ON_CONCURRENCY_FAILURE:
_logger.info(
_logger.error(
"%s, maximum number of tries reached to update dependencies",
errorcodes.lookup(err.pgcode),
)
@@ -74,17 +109,8 @@ def _enqueue_dependent_jobs(self, env, job):
else:
break
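The loop above is an instance of a generic retry-on-concurrency-failure pattern. A hedged stand-alone sketch of that pattern follows; RETRYABLE_PGCODES and MAX_TRIES are stand-ins for the module's own PG_CONCURRENCY_ERRORS_TO_RETRY and DEPENDS_MAX_TRIES_ON_CONCURRENCY_FAILURE, and cr is assumed to be an Odoo cursor providing savepoint():

    from psycopg2 import OperationalError, errorcodes

    RETRYABLE_PGCODES = {  # stand-in for PG_CONCURRENCY_ERRORS_TO_RETRY
        errorcodes.SERIALIZATION_FAILURE,
        errorcodes.DEADLOCK_DETECTED,
    }
    MAX_TRIES = 5  # stand-in for DEPENDS_MAX_TRIES_ON_CONCURRENCY_FAILURE

    def run_with_concurrency_retry(cr, action):
        """Re-run `action` when PostgreSQL reports a concurrency failure.

        The savepoint confines a failed attempt so the surrounding
        transaction stays usable for the next try.
        """
        tries = 0
        while True:
            try:
                with cr.savepoint():
                    action()
            except OperationalError as err:
                if err.pgcode not in RETRYABLE_PGCODES:
                    raise
                tries += 1
                if tries > MAX_TRIES:
                    # give up; the caller decides how to report the failure
                    return False
            else:
                return True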

@http.route(
"/queue_job/runjob",
type="http",
auth="none",
save_session=False,
readonly=False,
)
def runjob(self, db, job_uuid, **kw):
http.request.session.db = db
env = http.request.env(user=SUPERUSER_ID)

@classmethod
def _runjob(cls, env: api.Environment, job: Job) -> None:
def retry_postpone(job, message, seconds=None):
job.env.clear()
with registry(job.env.cr.dbname).cursor() as new_cr:
@@ -93,26 +119,9 @@ def retry_postpone(job, message, seconds=None):
job.set_pending(reset_retry=False)
job.store()

# ensure the job to run is in the correct state and lock the record
env.cr.execute(
"SELECT state FROM queue_job WHERE uuid=%s AND state=%s FOR UPDATE",
(job_uuid, ENQUEUED),
)
if not env.cr.fetchone():
_logger.warning(
"was requested to run job %s, but it does not exist, "
"or is not in state %s",
job_uuid,
ENQUEUED,
)
return ""

job = Job.load(env, job_uuid)
assert job and job.state == ENQUEUED

try:
try:
self._try_perform_job(env, job)
cls._try_perform_job(env, job)
except OperationalError as err:
# Automatically retry the typical transaction serialization
# errors
@@ -141,7 +150,6 @@ def retry_postpone(job, message, seconds=None):
# traceback in the logs we should have the traceback when all
# retries are exhausted
env.cr.rollback()
return ""

except (FailedJobError, Exception) as orig_exception:
buff = StringIO()
@@ -151,19 +159,18 @@ def retry_postpone(job, message, seconds=None):
job.env.clear()
with registry(job.env.cr.dbname).cursor() as new_cr:
job.env = job.env(cr=new_cr)
vals = self._get_failure_values(job, traceback_txt, orig_exception)
vals = cls._get_failure_values(job, traceback_txt, orig_exception)
job.set_failed(**vals)
job.store()
buff.close()
raise

_logger.debug("%s enqueue depends started", job)
self._enqueue_dependent_jobs(env, job)
cls._enqueue_dependent_jobs(env, job)
_logger.debug("%s enqueue depends done", job)

return ""

def _get_failure_values(self, job, traceback_txt, orig_exception):
@classmethod
def _get_failure_values(cls, job, traceback_txt, orig_exception):
"""Collect relevant data from exception."""
exception_name = orig_exception.__class__.__name__
if hasattr(orig_exception, "__module__"):
@@ -177,6 +184,22 @@ def _get_failure_values(self, job, traceback_txt, orig_exception):
"exc_message": exc_message,
}

@http.route(
"/queue_job/runjob",
type="http",
auth="none",
save_session=False,
readonly=False,
)
def runjob(self, db, job_uuid, **kw):
http.request.session.db = db
env = http.request.env(user=SUPERUSER_ID)
job = self._acquire_job(env, job_uuid)
if not job:
return ""
self._runjob(env, job)
return ""

# flake8: noqa: C901
@http.route("/queue_job/create_test_job", type="http", auth="user")
def create_test_job(
@@ -187,6 +210,7 @@ def create_test_job(
description="Test job",
size=1,
failure_rate=0,
job_duration=0,
):
"""Create test jobs

@@ -207,6 +231,12 @@ def create_test_job(
except (ValueError, TypeError):
failure_rate = 0

if job_duration is not None:
try:
job_duration = float(job_duration)
except (ValueError, TypeError):
job_duration = 0

if not (0 <= failure_rate <= 1):
raise BadRequest("failure_rate must be between 0 and 1")

@@ -235,6 +265,7 @@ def create_test_job(
channel=channel,
description=description,
failure_rate=failure_rate,
job_duration=job_duration,
)

if size > 1:
@@ -245,6 +276,7 @@ def create_test_job(
channel=channel,
description=description,
failure_rate=failure_rate,
job_duration=job_duration,
)
return ""

@@ -256,6 +288,7 @@ def _create_single_test_job(
description="Test job",
size=1,
failure_rate=0,
job_duration=0,
):
delayed = (
http.request.env["queue.job"]
@@ -265,7 +298,7 @@ def _create_single_test_job(
channel=channel,
description=description,
)
._test_job(failure_rate=failure_rate)
._test_job(failure_rate=failure_rate, job_duration=job_duration)
)
return "job uuid: %s" % (delayed.db_record().uuid,)

@@ -279,6 +312,7 @@ def _create_graph_test_jobs(
channel=None,
description="Test job",
failure_rate=0,
job_duration=0,
):
model = http.request.env["queue.job"]
current_count = 0
@@ -301,7 +335,7 @@ def _create_graph_test_jobs(
max_retries=max_retries,
channel=channel,
description="%s #%d" % (description, current_count),
)._test_job(failure_rate=failure_rate)
)._test_job(failure_rate=failure_rate, job_duration=job_duration)
)

grouping = random.choice(possible_grouping_methods)
23 changes: 9 additions & 14 deletions queue_job/job.py
@@ -236,7 +236,7 @@ def load_many(cls, env, job_uuids):
recordset = cls.db_records_from_uuids(env, job_uuids)
return {cls._load_from_db_record(record) for record in recordset}

def add_lock_record(self):
def add_lock_record(self) -> None:
"""
Create row in db to be locked while the job is being performed.
"""
@@ -256,13 +256,11 @@ def add_lock_record(self):
[self.uuid],
)

def lock(self):
"""
Lock row of job that is being performed
def lock(self) -> bool:
"""Lock row of job that is being performed.

If a job cannot be locked,
it means that the job wasn't started,
a RetryableJobError is thrown.
Return False if a job cannot be locked: it means that the job is not in
STARTED state or is already locked by another worker.
"""
self.env.cr.execute(
"""
@@ -278,18 +276,15 @@ def lock(self):
queue_job
WHERE
uuid = %s
AND state='started'
AND state = %s
)
FOR UPDATE;
FOR NO KEY UPDATE SKIP LOCKED;
""",
[self.uuid],
[self.uuid, STARTED],
)

# 1 job should be locked
if 1 != len(self.env.cr.fetchall()):
raise RetryableJobError(
f"Trying to lock job that wasn't started, uuid: {self.uuid}"
)
return bool(self.env.cr.fetchall())
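With this change, lock() reports failure through its return value instead of raising RetryableJobError. A hypothetical caller (sketch only, not part of the patch) would adapt like this:

    def perform_if_locked(job):
        # `job` is assumed to be a loaded queue_job Job in STARTED state.
        if not job.lock():
            # Not STARTED, or another worker already holds the lock row.
            return False
        job.perform()
        return True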

@classmethod
def _load_from_db_record(cls, job_db_record):
43 changes: 26 additions & 17 deletions queue_job/jobrunner/runner.py
@@ -361,23 +361,26 @@ def _query_requeue_dead_jobs(self):
ELSE exc_info
END)
WHERE
id in (
SELECT
queue_job_id
FROM
queue_job_lock
WHERE
queue_job_id in (
SELECT
id
FROM
queue_job
WHERE
state IN ('enqueued','started')
AND date_enqueued <
(now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
)
FOR UPDATE SKIP LOCKED
state IN ('enqueued','started')
AND date_enqueued < (now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
AND (
id in (
SELECT
queue_job_id
FROM
queue_job_lock
WHERE
queue_job_lock.queue_job_id = queue_job.id
FOR NO KEY UPDATE SKIP LOCKED
)
OR NOT EXISTS (
SELECT
1
FROM
queue_job_lock
WHERE
queue_job_lock.queue_job_id = queue_job.id
)
)
RETURNING uuid
"""
@@ -400,6 +403,12 @@ def requeue_dead_jobs(self):
However, when the Odoo server crashes or is otherwise force-stopped,
running jobs are interrupted while the runner has no chance to know
they have been aborted.

This also handles orphaned jobs (enqueued but never started, no lock).
This edge case occurs when the runner marks a job as 'enqueued'
but the HTTP request to start the job never reaches the Odoo server
(e.g., due to server shutdown/crash between setting enqueued and
the controller receiving the request).
"""

with closing(self.conn.cursor()) as cr:
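For readability, the requeue condition implemented by the new SQL in _query_requeue_dead_jobs can be restated as a small pure-Python predicate; this is a sketch only, and the parameter names are invented for illustration:

    def should_requeue(state, seconds_since_enqueued, has_lock_row, lock_row_is_free):
        """Pure-Python restatement of the requeue condition (sketch only)."""
        stale = state in ("enqueued", "started") and seconds_since_enqueued > 10
        # Case 1: a lock row exists but nobody holds it -> the worker that
        # started the job has died.
        # Case 2: no lock row at all -> the job was enqueued but the run
        # request never reached a worker (the orphan case described above).
        worker_gone = lock_row_is_free if has_lock_row else True
        return stale and worker_gone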