diff --git a/.github/workflows/run-tests-on-push.yml b/.github/workflows/run-tests-on-push.yml index 965ddfb34..ad6359cd5 100644 --- a/.github/workflows/run-tests-on-push.yml +++ b/.github/workflows/run-tests-on-push.yml @@ -7,8 +7,8 @@ env: jobs: run-tests-3_9: - runs-on: ubuntu-20.04 - name: Pytest on Python 3.9 / Ubuntu 20.04 + runs-on: ubuntu-latest + name: Pytest on Python 3.9 steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 diff --git a/alembic/versions/34026092c7f8_clinvar_variant_table.py b/alembic/versions/34026092c7f8_clinvar_variant_table.py new file mode 100644 index 000000000..17031c49c --- /dev/null +++ b/alembic/versions/34026092c7f8_clinvar_variant_table.py @@ -0,0 +1,58 @@ +"""clinvar variant table + +Revision ID: 34026092c7f8 +Revises: e8a3b5d8f885 +Create Date: 2025-01-28 21:48:42.532346 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "34026092c7f8" +down_revision = "e8a3b5d8f885" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "clinvar_variants", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("allele_id", sa.Integer(), nullable=False), + sa.Column("gene_symbol", sa.String(), nullable=False), + sa.Column("clinical_significance", sa.String(), nullable=False), + sa.Column("clinical_review_status", sa.String(), nullable=False), + sa.Column("clinvar_db_version", sa.String(), nullable=False), + sa.Column("creation_date", sa.Date(), nullable=False), + sa.Column("modification_date", sa.Date(), nullable=False), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index(op.f("ix_clinvar_variants_allele_id"), "clinvar_variants", ["allele_id"], unique=False) + op.add_column("mapped_variants", sa.Column("clinvar_variant_id", sa.Integer(), nullable=True)) + op.create_index( + op.f("ix_mapped_variants_clinvar_variant_id"), "mapped_variants", ["clinvar_variant_id"], unique=False + ) + op.create_foreign_key( + "mapped_variant_clinvar_variant_id_foreign_key_constraint", + "mapped_variants", + "clinvar_variants", + ["clinvar_variant_id"], + ["id"], + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_constraint( + "mapped_variant_clinvar_variant_id_foreign_key_constraint", "mapped_variants", type_="foreignkey" + ) + op.drop_index(op.f("ix_mapped_variants_clinvar_variant_id"), table_name="mapped_variants") + op.drop_column("mapped_variants", "clinvar_variant_id") + op.drop_index(op.f("ix_clinvar_variants_allele_id"), table_name="clinvar_variants") + op.drop_table("clinvar_variants") + # ### end Alembic commands ### diff --git a/alembic/versions/695b73abe581_genericize_clinvar_variants_table.py b/alembic/versions/695b73abe581_genericize_clinvar_variants_table.py new file mode 100644 index 000000000..29dfcbadc --- /dev/null +++ b/alembic/versions/695b73abe581_genericize_clinvar_variants_table.py @@ -0,0 +1,101 @@ +"""genericize clinvar variants table + +Revision ID: 695b73abe581 +Revises: 34026092c7f8 +Create Date: 2025-02-18 11:54:15.243078 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "695b73abe581" +down_revision = "34026092c7f8" +branch_labels = None +depends_on = None + + +def upgrade(): + op.rename_table("clinvar_variants", "clinical_controls") + op.execute("ALTER SEQUENCE clinvar_variants_id_seq RENAME TO clinical_controls_id_seq") + op.execute("ALTER INDEX clinvar_variants_pkey RENAME TO clinical_controls_pkey") + + op.alter_column("clinical_controls", "clinvar_db_version", nullable=False, new_column_name="db_version") + op.alter_column("clinical_controls", "allele_id", nullable=False, new_column_name="db_identifier") + op.add_column("clinical_controls", sa.Column("db_name", sa.String(), nullable=True)) + + op.create_index("ix_clinical_controls_gene_symbol", "clinical_controls", ["gene_symbol"]) + op.create_index("ix_clinical_controls_db_name", "clinical_controls", ["db_name"]) + op.create_index("ix_clinical_controls_db_identifier", "clinical_controls", ["db_identifier"]) + op.create_index("ix_clinical_controls_db_version", "clinical_controls", ["db_version"]) + + 
op.create_table( + "mapped_variants_clinical_controls", + sa.Column("mapped_variant_id", sa.Integer(), nullable=False), + sa.Column("clinical_control_id", sa.Integer(), nullable=False), + sa.ForeignKeyConstraint( + ["mapped_variant_id"], + ["mapped_variants.id"], + ), + sa.ForeignKeyConstraint( + ["clinical_control_id"], + ["clinical_controls.id"], + ), + sa.PrimaryKeyConstraint("mapped_variant_id", "clinical_control_id"), + ) + + # Convert any existing ClinVar variants into clinical control variants. Since + # this table is being updated from a clinvar specific table, we assume all existing + # controls are from ClinVar. + op.execute( + """ + INSERT INTO mapped_variants_clinical_controls ( + mapped_variant_id, + clinical_control_id + ) + SELECT id, clinvar_variant_id + FROM mapped_variants + WHERE clinvar_variant_id IS NOT NULL + """ + ) + + op.execute("UPDATE clinical_controls SET db_name='ClinVar'") + op.alter_column("clinical_controls", "db_name", nullable=False) + + op.drop_index("ix_mapped_variants_clinvar_variant_id", "mapped_variants") + op.drop_column("mapped_variants", "clinvar_variant_id") + + +def downgrade(): + op.rename_table("clinical_controls", "clinvar_variants") + op.execute("ALTER SEQUENCE clinical_controls_id_seq RENAME TO clinvar_variants_id_seq") + op.execute("ALTER INDEX clinical_controls_pkey RENAME TO clinvar_variants_pkey") + + op.drop_index("ix_clinical_controls_gene_symbol", "clinical_controls") + op.drop_index("ix_clinical_controls_db_name", "clinical_controls") + op.drop_index("ix_clinical_controls_db_identifier", "clinical_controls") + op.drop_index("ix_clinical_controls_db_version", "clinical_controls") + + op.alter_column("clinvar_variants", "db_version", nullable=False, new_column_name="clinvar_db_version") + op.alter_column("clinvar_variants", "db_identifier", nullable=False, new_column_name="allele_id") + op.drop_column("clinvar_variants", "db_name") + + op.add_column( + "mapped_variants", + sa.Column("clinvar_variant_id", 
sa.Integer(), sa.ForeignKey("clinvar_variants.id"), nullable=True), + ) + + # Downgrades a many-to-many relationship to a one to many. This will result in data loss. + op.execute( + """ + UPDATE mapped_variants + SET clinvar_variant_id=mapped_variants_clinical_controls.clinical_control_id + FROM mapped_variants_clinical_controls + WHERE mapped_variants_clinical_controls.mapped_variant_id=mapped_variants.id + """ + ) + + op.create_index("ix_mapped_variants_clinvar_variant_id", "mapped_variants", ["clinvar_variant_id"]) + op.drop_table("mapped_variants_clinical_controls") diff --git a/alembic/versions/d6e5a9fde3c9_move_clingen_allele_id_to_mapped_.py b/alembic/versions/d6e5a9fde3c9_move_clingen_allele_id_to_mapped_.py new file mode 100644 index 000000000..c6decaf6a --- /dev/null +++ b/alembic/versions/d6e5a9fde3c9_move_clingen_allele_id_to_mapped_.py @@ -0,0 +1,53 @@ +"""move clingen allele id to mapped variants table + +Revision ID: d6e5a9fde3c9 +Revises: 695b73abe581 +Create Date: 2025-02-19 10:51:07.319962 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "d6e5a9fde3c9" +down_revision = "695b73abe581" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index("ix_variants_clingen_allele_id", table_name="variants") + op.add_column("mapped_variants", sa.Column("clingen_allele_id", sa.String(), nullable=True)) + op.execute( + """ + UPDATE mapped_variants + SET clingen_allele_id=variants.clingen_allele_id + FROM variants + WHERE variants.id=mapped_variants.variant_id + """ + ) + op.drop_column("variants", "clingen_allele_id") + op.create_index( + op.f("ix_mapped_variants_clingen_allele_id"), "mapped_variants", ["clingen_allele_id"], unique=False + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f("ix_mapped_variants_clingen_allele_id"), table_name="mapped_variants") + op.add_column("variants", sa.Column("clingen_allele_id", sa.String(), nullable=True)) + op.execute( + """ + UPDATE variants + SET clingen_allele_id=mapped_variants.clingen_allele_id + FROM mapped_variants + WHERE variants.id=mapped_variants.variant_id + """ + ) + op.drop_column("mapped_variants", "clingen_allele_id") + op.create_index("ix_variants_clingen_allele_id", "variants", ["clingen_allele_id"], unique=False) + # ### end Alembic commands ### diff --git a/alembic/versions/e8a3b5d8f885_add_clingen_allele_ids.py b/alembic/versions/e8a3b5d8f885_add_clingen_allele_ids.py new file mode 100644 index 000000000..da58494a1 --- /dev/null +++ b/alembic/versions/e8a3b5d8f885_add_clingen_allele_ids.py @@ -0,0 +1,27 @@ +"""Add ClinGen allele IDs + +Revision ID: e8a3b5d8f885 +Revises: 4726e4dddde8 +Create Date: 2025-01-27 18:55:09.283855 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = "e8a3b5d8f885" +down_revision = "4726e4dddde8" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column("variants", sa.Column("clingen_allele_id", sa.String(), nullable=True)) + op.create_index(op.f("ix_variants_clingen_allele_id"), "variants", ["clingen_allele_id"], unique=False) + + +def downgrade(): + op.drop_index(op.f("ix_variants_clingen_allele_id"), table_name="variants") + op.drop_column("variants", "clingen_allele_id") diff --git a/src/mavedb/constants.py b/src/mavedb/constants.py new file mode 100644 index 000000000..41f9d825b --- /dev/null +++ b/src/mavedb/constants.py @@ -0,0 +1,4 @@ +import os + +MAVEDB_BASE_GIT = "https://github.com/VariantEffect/mavedb-api" +MAVEDB_FRONTEND_URL = os.getenv("MAVE_FRONTEND_URL", "https://mavedb.org") diff --git a/src/mavedb/lib/clingen/__init__.py b/src/mavedb/lib/clingen/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/mavedb/lib/clingen/constants.py b/src/mavedb/lib/clingen/constants.py new file mode 100644 index 000000000..7e21f77b4 --- /dev/null +++ b/src/mavedb/lib/clingen/constants.py @@ -0,0 +1,17 @@ +import os + +GENBOREE_ACCOUNT_NAME = os.getenv("GENBOREE_ACCOUNT_NAME") +GENBOREE_ACCOUNT_PASSWORD = os.getenv("GENBOREE_ACCOUNT_PASSWORD") + +CLIN_GEN_TENANT = os.getenv("CLIN_GEN_TENANT") +LDH_TENANT = os.getenv("LDH_TENANT") + +LDH_SUBMISSION_TYPE = "cg-ldh-ld-submission" +LDH_ENTITY_NAME = "MaveDBMapping" +LDH_ENTITY_ENDPOINT = "maveDb" # for some reason, not the same :/ + +DEFAULT_LDH_SUBMISSION_BATCH_SIZE = 100 +LDH_SUBMISSION_URL = f"https://genboree.org/mq/brdg/pulsar/{CLIN_GEN_TENANT}/ldh/submissions/{LDH_ENTITY_ENDPOINT}" +LDH_LINKED_DATA_URL = f"https://genboree.org/{LDH_TENANT}/{LDH_ENTITY_NAME}/id" + +LINKED_DATA_RETRY_THRESHOLD = 0.95 diff --git a/src/mavedb/lib/clingen/content_constructors.py b/src/mavedb/lib/clingen/content_constructors.py new file mode 100644 index 000000000..b9c35e274 --- /dev/null +++ 
b/src/mavedb/lib/clingen/content_constructors.py @@ -0,0 +1,66 @@ +from datetime import datetime +from uuid import uuid4 + +from mavedb import __version__ +from mavedb.constants import MAVEDB_BASE_GIT, MAVEDB_FRONTEND_URL +from mavedb.lib.types.clingen import LdhContentLinkedData, LdhContentSubject, LdhEvent, LdhSubmission +from mavedb.lib.clingen.constants import LDH_ENTITY_NAME, LDH_SUBMISSION_TYPE +from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.variant import Variant + + +def construct_ldh_submission_event(sbj: LdhContentSubject) -> LdhEvent: + return { + "type": LDH_SUBMISSION_TYPE, + "name": LDH_ENTITY_NAME, + "uuid": str(uuid4()), + "sbj": {"id": sbj["Variant"]["hgvs"], "type": "Variant", "format": "hgvs", "add": True}, + "triggered": { + "by": { + "host": MAVEDB_BASE_GIT, + "id": "resource_published", + "iri": f"{MAVEDB_BASE_GIT}/releases/tag/v{__version__}", + }, + "at": datetime.now().isoformat(), + }, + } + + +def construct_ldh_submission_subject(hgvs: str) -> LdhContentSubject: + return {"Variant": {"hgvs": hgvs}} + + +def construct_ldh_submission_entity(variant: Variant, mapped_variant: MappedVariant) -> LdhContentLinkedData: + return { + # TODO#372: We try to make all possible fields that are non-nullable represented that way. 
+ "MaveDBMapping": [ + { + "entContent": { + "mavedb_id": variant.urn, # type: ignore + "pre_mapped": mapped_variant.pre_mapped, # type: ignore + "post_mapped": mapped_variant.post_mapped, # type: ignore + "mapping_api_version": mapped_variant.mapping_api_version, # type: ignore + "score": variant.data["score_data"]["score"], # type: ignore + }, + "entId": variant.urn, # type: ignore + "entIri": f"{MAVEDB_FRONTEND_URL}/{variant.urn}", # type: ignore + } + ] + } + + +def construct_ldh_submission(variant_content: list[tuple[str, Variant, MappedVariant]]) -> list[LdhSubmission]: + content_submission: list[LdhSubmission] = [] + for hgvs, variant, mapped_variant in variant_content: + subject = construct_ldh_submission_subject(hgvs) + event = construct_ldh_submission_event(subject) + entity = construct_ldh_submission_entity(variant, mapped_variant) + + content_submission.append( + { + "event": event, + "content": {"sbj": subject, "ld": entity}, + } + ) + + return content_submission diff --git a/src/mavedb/lib/clingen/linked_data_hub.py b/src/mavedb/lib/clingen/linked_data_hub.py new file mode 100644 index 000000000..34718d40c --- /dev/null +++ b/src/mavedb/lib/clingen/linked_data_hub.py @@ -0,0 +1,244 @@ +import logging +import requests +import os +from datetime import datetime +from typing import Optional +from urllib import parse + + +from jose import jwt + +from mavedb.lib.logging.context import logging_context, save_to_logging_context, format_raised_exception_info_as_dict +from mavedb.lib.clingen.constants import GENBOREE_ACCOUNT_NAME, GENBOREE_ACCOUNT_PASSWORD, LDH_LINKED_DATA_URL + +from mavedb.lib.types.clingen import LdhSubmission +from mavedb.lib.utils import batched + +logger = logging.getLogger(__name__) + + +class ClinGenLdhService: + """ + A service class for interacting with the ClinGen Linked Data Hub (LDH) API. + + This class provides methods for authenticating with the Genboree services and dispatching + submissions to the ClinGen LDH API. 
+ + Attributes: + url (str): The base URL of the ClinGen LDH API. + + Methods: + __init__(url: str) -> None: + Initializes the ClinGenLdhService instance with the given API URL. + + authenticate() -> str: + Authenticates with the Genboree services and retrieves a JSON Web Token (JWT). + If a valid JWT already exists, it is reused. Otherwise, a new JWT is obtained + by authenticating with the Genboree API. + + dispatch_submissions(content_submissions: list[LdhSubmission], batch_size: Optional[int] = None) -> tuple[list, list]: + Dispatches a list of LDH submissions to the ClinGen LDH API. Supports optional + batching of submissions. + + Args: + content_submissions (list[LdhSubmission]): A list of LDH submissions to be dispatched. + batch_size (Optional[int]): The size of each batch for submission. If None, no batching is applied. + + Returns: + tuple[list, list]: A tuple containing two lists: + - A list of successful submission responses. + - A list of failed submissions. + + _existing_jwt() -> Optional[str]: + Checks for an existing and valid Genboree JWT in the environment variables. + + Returns: + Optional[str]: The existing JWT if valid, or None if no valid JWT is found. + """ + + def __init__(self, url: str) -> None: + self.url = url + + def authenticate(self) -> str: + """ + Authenticates with Genboree services and retrieves a JSON Web Token (JWT). + + This method first checks for an existing JWT using the `_existing_jwt` method. If a valid JWT is found, + it is returned immediately. Otherwise, the method attempts to authenticate with Genboree services + using the account name and password provided via environment variables. + + Raises: + ValueError: If the Genboree account name or password is not set, or if the JWT cannot be parsed + from the authentication response. + requests.exceptions.HTTPError: If the HTTP request to Genboree services fails. 
+ + Returns: + str: The JWT retrieved from Genboree services, which is also stored in the `GENBOREE_JWT` + environment variable for future use. + """ + if existing_jwt := self._existing_jwt(): + logger.debug(msg="Using existing Genboree JWT for authentication.", extra=logging_context()) + return existing_jwt + + logger.debug( + msg="No existing or valid Genboree JWT found. Authenticating via Genboree services.", + extra=logging_context(), + ) + + auth_url = f"https://genboree.org/auth/usr/gb:{GENBOREE_ACCOUNT_NAME}/auth" + auth_body = {"type": "plain", "val": GENBOREE_ACCOUNT_PASSWORD} + auth_response = requests.post(auth_url, json=auth_body) + try: + auth_response.raise_for_status() + except requests.exceptions.HTTPError as exc: + save_to_logging_context(format_raised_exception_info_as_dict(exc)) + logger.error(msg="Failed to authenticate with Genboree services.", exc_info=exc, extra=logging_context()) + raise exc + + auth_jwt = auth_response.json().get("data", {}).get("jwt") + + try: + assert auth_jwt is not None, "No JWT in response." + except AssertionError as exc: + msg = "Failed to authenticate with Genboree services. Could not parse JWT from valid response." + save_to_logging_context(format_raised_exception_info_as_dict(exc)) + logger.error(msg=msg, extra=logging_context()) + raise ValueError(msg) + + # TODO#411: We should consider using a secret manager to store persistent/setable secrets like this. + # I'd prefer not to ever set environment variables, especially externally generated content. + os.environ["GENBOREE_JWT"] = auth_jwt + logger.info(msg="Successfully authenticated with Genboree services.", extra=logging_context()) + return auth_jwt + + def dispatch_submissions( + self, content_submissions: list[LdhSubmission], batch_size: Optional[int] = None + ) -> tuple[list, list]: + """ + Dispatches a list of content submissions to a specified URL in batches, if specified. 
+ + Args: + content_submissions (list[LdhSubmission]): A list of submissions to be dispatched. + batch_size (Optional[int]): The size of each batch for dispatching submissions. + If None, submissions are dispatched without batching. + + Returns: + tuple[list, list]: A tuple containing two lists: + - The first list contains the successful submission responses. + - The second list contains the submissions that failed to dispatch. + + Raises: + requests.exceptions.RequestException: If an error occurs during the HTTP request. + """ + submission_successes = [] + submission_failures = [] + submissions = list(batched(content_submissions, batch_size)) if batch_size is not None else content_submissions + save_to_logging_context({"ldh_submission_count": len(content_submissions)}) + + if batch_size is not None: + save_to_logging_context({"ldh_submission_batch_size": batch_size}) + save_to_logging_context({"ldh_submission_batch_count": len(submissions)}) + logger.debug("Batching ldh submissions.", extra=logging_context()) + + logger.info(msg=f"Dispatching {len(submissions)} ldh submissions...", extra=logging_context()) + for idx, content in enumerate(submissions): + try: + logger.debug(msg=f"Dispatching submission {idx+1}.", extra=logging_context()) + response = requests.put( + url=self.url, + json=content, + headers={"Authorization": f"Bearer {self.authenticate()}", "Content-Type": "application/json"}, + ) + response.raise_for_status() + submission_successes.append(response.json()) + logger.info( + msg=f"Successfully dispatched ldh submission ({idx+1} / {len(submissions)}).", + extra=logging_context(), + ) + + except requests.exceptions.RequestException as exc: + save_to_logging_context(format_raised_exception_info_as_dict(exc)) + logger.error(msg="Failed to dispatch ldh submission.", exc_info=exc, extra=logging_context()) + submission_failures.append(content) + continue + + save_to_logging_context( + { + "ldh_submission_success_count": len(submission_successes), + 
"ldh_submission_failure_count": len(submission_failures), + } + ) + logger.info(msg="Done dispatching ldh submissions.", extra=logging_context()) + return submission_successes, submission_failures + + def _existing_jwt(self) -> Optional[str]: + """ + Checks for an existing Genboree JWT (JSON Web Token) in the environment variables. + + This method retrieves the JWT from the "GENBOREE_JWT" environment variable, verifies its + presence, and checks its expiration status. If the token is valid and not expired, it is returned. + Otherwise, it returns None. + + Returns: + Optional[str]: The existing and valid Genboree JWT if found, otherwise None. + """ + logger.debug(msg="Checking for existing Genboree JWT.", extra=logging_context()) + + existing_jwt = os.getenv("GENBOREE_JWT") + + if not existing_jwt: + logger.debug(msg="No existing Genboree JWT was set.", extra=logging_context()) + return None + + expiration = jwt.get_unverified_claims(existing_jwt).get("exp", datetime.now().timestamp()) + + if expiration > datetime.now().timestamp(): + logger.debug(msg="Found existing and valid Genboree JWT.", extra=logging_context()) + return existing_jwt + + logger.debug(msg="Found existing but expired Genboree JWT.", extra=logging_context()) + return None + + +def get_clingen_variation(urn: str) -> Optional[dict]: + """ + Fetches ClinGen variation data for a given URN (Uniform Resource Name) from the Linked Data Hub. + + Args: + urn (str): The URN of the variation to fetch. + + Returns: + Optional[dict]: A dictionary containing the variation data if the request is successful, + or None if the request fails. 
""" + response = requests.get( + f"{LDH_LINKED_DATA_URL}/{parse.quote_plus(urn)}", + headers={"Accept": "application/json"}, + ) + + if response.status_code == 200: + return response.json() + else: + logger.error(f"Failed to fetch data for URN {urn}: {response.status_code} - {response.text}") + return None + + +def clingen_allele_id_from_ldh_variation(variation: Optional[dict]) -> Optional[str]: + """ + Extracts the ClinGen allele ID from a given variation dictionary. + + Args: + variation (Optional[dict]): A dictionary containing variation data, otherwise None. + + Returns: + Optional[str]: The ClinGen allele ID if found, otherwise None. + """ + if not variation: + return None + + try: + return variation["data"]["ldFor"]["Variant"][0]["entId"] + except (KeyError, IndexError) as exc: + save_to_logging_context(format_raised_exception_info_as_dict(exc)) + logger.error("Failed to extract ClinGen allele ID from variation data.", extra=logging_context()) + return None diff --git a/src/mavedb/lib/clingen/py.typed b/src/mavedb/lib/clingen/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/src/mavedb/lib/exceptions.py b/src/mavedb/lib/exceptions.py index 277594389..46380e961 100644 --- a/src/mavedb/lib/exceptions.py +++ b/src/mavedb/lib/exceptions.py @@ -176,3 +176,17 @@ class NonexistentMappingReferenceError(ValueError): class MappingEnqueueError(ValueError): """Raised when a mapping job fails to be enqueued despite appearing as if it should have been""" + + pass + + +class SubmissionEnqueueError(ValueError): + """Raised when a submission job fails to be enqueued despite appearing as if it should have been""" + + pass + + +class LinkingEnqueueError(ValueError): + """Raised when a linking job fails to be enqueued despite appearing as if it should have been""" + + pass diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 1d650eb62..be1722c30 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -3,7 +3,7 
@@ import logging import re from operator import attrgetter -from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence +from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal import numpy as np import pandas as pd @@ -266,30 +266,22 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: def fetch_superseding_score_set_in_search_result( - score_sets: list[ScoreSet], - requesting_user: Optional["UserData"], - search: ScoreSetsSearch) -> list[ScoreSet]: + score_sets: list[ScoreSet], requesting_user: Optional["UserData"], search: ScoreSetsSearch +) -> list[ScoreSet]: """ Remove superseded score set from search results. Check whether all of the score set are correct versions. """ from mavedb.lib.permissions import Action + if search.published: filtered_score_sets_tail = [ - find_publish_or_private_superseded_score_set_tail( - score_set, - Action.READ, - requesting_user, - search.published - ) for score_set in score_sets + find_publish_or_private_superseded_score_set_tail(score_set, Action.READ, requesting_user, search.published) + for score_set in score_sets ] else: filtered_score_sets_tail = [ - find_superseded_score_set_tail( - score_set, - Action.READ, - requesting_user - ) for score_set in score_sets + find_superseded_score_set_tail(score_set, Action.READ, requesting_user) for score_set in score_sets ] # Remove None item. 
filtered_score_sets = [score_set for score_set in filtered_score_sets_tail if score_set is not None] @@ -345,10 +337,10 @@ def find_meta_analyses_for_experiment_sets(db: Session, urns: list[str]) -> list def find_superseded_score_set_tail( - score_set: ScoreSet, - action: Optional["Action"] = None, - user_data: Optional["UserData"] = None) -> Optional[ScoreSet]: + score_set: ScoreSet, action: Optional["Action"] = None, user_data: Optional["UserData"] = None +) -> Optional[ScoreSet]: from mavedb.lib.permissions import has_permission + while score_set.superseding_score_set is not None: next_score_set_in_chain = score_set.superseding_score_set @@ -374,75 +366,49 @@ def find_superseded_score_set_tail( def find_publish_or_private_superseded_score_set_tail( - score_set: ScoreSet, - action: Optional["Action"] = None, - user_data: Optional["UserData"] = None, - publish: bool = True) -> Optional[ScoreSet]: + score_set: ScoreSet, action: Optional["Action"] = None, user_data: Optional["UserData"] = None, publish: bool = True +) -> Optional[ScoreSet]: from mavedb.lib.permissions import has_permission + if publish: while score_set.superseding_score_set is not None: next_score_set_in_chain = score_set.superseding_score_set # Find the final published one. - if action is not None and has_permission(user_data, score_set, action).permitted \ - and next_score_set_in_chain.published_date is None: + if ( + action is not None + and has_permission(user_data, score_set, action).permitted + and next_score_set_in_chain.published_date is None + ): return score_set score_set = next_score_set_in_chain else: # Unpublished score set should not be superseded. # It should not have superseding score set, but possible have superseded score set. 
- if action is not None and score_set.published_date is None \ - and has_permission(user_data, score_set, action).permitted: + if ( + action is not None + and score_set.published_date is None + and has_permission(user_data, score_set, action).permitted + ): return score_set else: return None return score_set -def get_score_set_counts_as_csv( +def get_score_set_variants_as_csv( db: Session, score_set: ScoreSet, + data_type: Literal["scores", "counts"], start: Optional[int] = None, limit: Optional[int] = None, drop_na_columns: Optional[bool] = None, ) -> str: assert type(score_set.dataset_columns) is dict - count_columns = [str(x) for x in list(score_set.dataset_columns.get("count_columns", []))] - columns = ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"] + count_columns - type_column = "count_data" - - variants_query = ( - select(Variant) - .where(Variant.score_set_id == score_set.id) - .order_by(cast(func.split_part(Variant.urn, "#", 2), Integer)) - ) - if start: - variants_query = variants_query.offset(start) - if limit: - variants_query = variants_query.limit(limit) - variants = db.scalars(variants_query).all() - - rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column) - if drop_na_columns: - rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns) + dataset_cols = "score_columns" if data_type == "scores" else "count_columns" + type_column = "score_data" if data_type == "scores" else "count_data" - stream = io.StringIO() - writer = csv.DictWriter(stream, fieldnames=columns, quoting=csv.QUOTE_MINIMAL) - writer.writeheader() - writer.writerows(rows_data) - return stream.getvalue() - - -def get_score_set_scores_as_csv( - db: Session, - score_set: ScoreSet, - start: Optional[int] = None, - limit: Optional[int] = None, - drop_na_columns: Optional[bool] = None, -) -> str: - assert type(score_set.dataset_columns) is dict - score_columns = [str(x) for x in list(score_set.dataset_columns.get("score_columns", []))] - columns = 
["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"] + score_columns - type_column = "score_data" + count_columns = [str(x) for x in list(score_set.dataset_columns.get(dataset_cols, []))] + columns = ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"] + count_columns variants_query = ( select(Variant) @@ -455,7 +421,7 @@ def get_score_set_scores_as_csv( variants_query = variants_query.limit(limit) variants = db.scalars(variants_query).all() - rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column) + rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column) # type: ignore if drop_na_columns: rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns) @@ -467,8 +433,7 @@ def get_score_set_scores_as_csv( def drop_na_columns_from_csv_file_rows( - rows_data: Iterable[dict[str, Any]], - columns: list[str] + rows_data: Iterable[dict[str, Any]], columns: list[str] ) -> tuple[list[dict[str, Any]], list[str]]: """Process rows_data for downloadable CSV by removing empty columns.""" # Convert map to list. 
def send_slack_error(err, request=None):
    """
    Serialize an uncaught exception (and, when available, the triggering HTTP
    request) to JSON and forward it to the Slack webhook via send_slack_message.

    :param err: The exception instance being reported.
    :param request: Optional request object that triggered the exception.
    """
    text = {"type": err.__class__.__name__, "exception": str(err), "location": find_traceback_locations()}

    if request:
        # request.client may be None (e.g. certain test clients or unusual ASGI
        # servers); guard so the error reporter itself cannot raise.
        text["client"] = str(request.client.host) if request.client else "unknown"
        text["request"] = f"{request.method} {request.url}"

    send_slack_message(json.dumps(text))
from typing import TypedDict, Literal

try:
    # NotRequired lives in typing on Python >= 3.11.
    from typing import NotRequired
except ImportError:  # pragma: no cover - Python <= 3.10 fallback
    from typing_extensions import NotRequired


# Type shapes for ClinGen Linked Data Hub (LDH) content submissions.
# See: https://ldh.genome.network/docs/ldh/submit.html#content-submission-body


### Linked Data Hub Event Type


class EventSbj(TypedDict):
    """The subject of the event (i.e. the entity that the event is about)."""

    id: str
    type: str
    format: Literal["hgvs", "alleleRegistryID", "clinvarID", "geneSymbol"]
    add: bool


class EventTriggerer(TypedDict):
    """Who/what triggered the event."""

    host: str
    id: str
    iri: str


class EventTrigger(TypedDict):
    """The trigger record: who fired the event and when."""

    by: EventTriggerer
    at: str


class LdhEvent(TypedDict):
    """Top-level LDH event envelope."""

    type: str
    name: str
    uuid: str
    sbj: EventSbj
    triggered: EventTrigger


### Linked Data Hub Content Types


class LdhSubjectVariant(TypedDict):
    """The variant that is the subject of the content submission."""

    id: NotRequired[str]
    hgvs: str


class LdhContentSubject(TypedDict):
    """Wrapper naming the submission subject."""

    Variant: LdhSubjectVariant


class LdhMapping(TypedDict):
    """A MaveDB mapping payload: the entity content we submit."""

    mavedb_id: str
    pre_mapped: str
    post_mapped: str
    mapping_api_version: str
    score: float


class LdhEntity(TypedDict):
    """A single submitted entity with its content, ID and IRI."""

    entContent: LdhMapping
    entId: str
    entIri: str


class LdhContentLinkedData(TypedDict):
    """The linked-data section: all MaveDB mappings being submitted."""

    MaveDBMapping: list[LdhEntity]


### Linked Data Hub Submission Type


class LdhSubmissionContent(TypedDict):
    """Subject plus linked data for one submission."""

    sbj: LdhContentSubject
    ld: LdhContentLinkedData


class LdhSubmission(TypedDict):
    """A complete LDH submission: event envelope plus content."""

    event: LdhEvent
    content: LdhSubmissionContent
This is attempt {attempt+1}.") + try: + response = requests.request(method=method, url=url, **kwargs) + response.raise_for_status() + return response + except requests.exceptions.RequestException as exc: + logger.warning(f"Request to {url} failed.", exc_info=exc) + backoff_time = backoff_wait * (2**attempt) + attempt += 1 + logger.info(f"Retrying request to {url} in {backoff_wait} seconds.") + time.sleep(backoff_time) + + raise requests.exceptions.RequestException(f"Request to {url} failed after {backoff_limit} attempts.") + + +# TODO: When we upgrade to Python 3.12, we can replace this with the built-in `itertools.batched` method. +def batched(iterable, n): + """ + Yield successive n-sized chunks from iterable. + """ + l = len(iterable) # noqa: E741 + for i in range(0, l, n): + yield iterable[i : min((i + n, l))] diff --git a/src/mavedb/lib/variants.py b/src/mavedb/lib/variants.py new file mode 100644 index 000000000..ad4c2676e --- /dev/null +++ b/src/mavedb/lib/variants.py @@ -0,0 +1,32 @@ +from mavedb.models.mapped_variant import MappedVariant + + +def hgvs_from_vrs_allele(allele: dict) -> str: + """ + Extract the HGVS notation from the VRS allele. + """ + try: + # VRS 2.X + return allele["expressions"][0]["value"] + except KeyError: + # VRS 1.X + return allele["variation"]["expressions"][0]["value"] + + +def hgvs_from_mapped_variant(mapped_variant: MappedVariant) -> list[str]: + """ + Extract the HGVS notation from the post_mapped field of the MappedVariant object. 
+ """ + post_mapped_object: dict = mapped_variant.post_mapped # type: ignore + + if not post_mapped_object: + return [] + + if post_mapped_object["type"] == "Haplotype": # type: ignore + return [hgvs_from_vrs_allele(allele) for allele in post_mapped_object["members"]] + elif post_mapped_object["type"] == "CisPhasedBlock": # type: ignore + return [hgvs_from_vrs_allele(allele) for allele in post_mapped_object["members"]] + elif post_mapped_object["type"] == "Allele": # type: ignore + return [hgvs_from_vrs_allele(post_mapped_object)] + else: + raise ValueError(f"Unsupported post_mapped type: {post_mapped_object['type']}") diff --git a/src/mavedb/models/__init__.py b/src/mavedb/models/__init__.py index a1a2c0afd..af823015f 100644 --- a/src/mavedb/models/__init__.py +++ b/src/mavedb/models/__init__.py @@ -1,6 +1,7 @@ __all__ = [ "access_key", "collection", + "clinical_control", "controlled_keyword", "doi_identifier", "ensembl_identifier", diff --git a/src/mavedb/models/clinical_control.py b/src/mavedb/models/clinical_control.py new file mode 100644 index 000000000..a74620079 --- /dev/null +++ b/src/mavedb/models/clinical_control.py @@ -0,0 +1,35 @@ +from datetime import date +from typing import TYPE_CHECKING + +from sqlalchemy import Column, Date, Integer, String +from sqlalchemy.orm import Mapped, relationship + +from mavedb.db.base import Base +from mavedb.models.clinical_control_mapped_variant import mapped_variants_clinical_controls_association_table + +if TYPE_CHECKING: + from mavedb.models.mapped_variant import MappedVariant + + +class ClinicalControl(Base): + __tablename__ = "clinical_controls" + + id = Column(Integer, primary_key=True) + + gene_symbol = Column(String, nullable=False, index=True) + + clinical_significance = Column(String, nullable=False) + clinical_review_status = Column(String, nullable=False) + + db_name = Column(String, nullable=False, index=True) + db_identifier = Column(String, nullable=False, index=True) + db_version = Column(String, 
class ClinicalControl(Base):
    """A clinical-control variant record sourced from an external database (e.g. ClinVar)."""

    __tablename__ = "clinical_controls"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)

    # Gene symbol reported by the source database for this control variant.
    gene_symbol = Column(String, nullable=False, index=True)

    # Clinical assertion and its review status, as reported by the source database.
    clinical_significance = Column(String, nullable=False)
    clinical_review_status = Column(String, nullable=False)

    # Source database name, record identifier within it, and release version.
    db_name = Column(String, nullable=False, index=True)
    db_identifier = Column(String, nullable=False, index=True)
    db_version = Column(String, nullable=False, index=True)

    creation_date = Column(Date, nullable=False, default=date.today)
    modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today)

    # Many-to-many link to the mapped variants this control is associated with.
    mapped_variants: Mapped[list["MappedVariant"]] = relationship(
        "MappedVariant",
        secondary=mapped_variants_clinical_controls_association_table,
        back_populates="clinical_controls",
    )
nullable=False) - variant: Mapped[Variant] = relationship("Variant", backref=backref("mapped_variants", cascade="all,delete-orphan")) + variant: Mapped["Variant"] = relationship("Variant", back_populates="mapped_variants") + + clingen_allele_id = Column(String, index=True, nullable=True) + + clinical_controls: Mapped[list["ClinicalControl"]] = relationship( + "ClinicalControl", + secondary=mapped_variants_clinical_controls_association_table, + back_populates="mapped_variants", + ) diff --git a/src/mavedb/models/variant.py b/src/mavedb/models/variant.py index f3a5821ad..b038c1eab 100644 --- a/src/mavedb/models/variant.py +++ b/src/mavedb/models/variant.py @@ -1,4 +1,5 @@ from datetime import date +from typing import TYPE_CHECKING, List from sqlalchemy import Column, Date, ForeignKey, Integer, String from sqlalchemy.dialects.postgresql import JSONB @@ -6,7 +7,9 @@ from mavedb.db.base import Base -from .score_set import ScoreSet +if TYPE_CHECKING: + from .mapped_variant import MappedVariant + from .score_set import ScoreSet class Variant(Base): @@ -19,7 +22,7 @@ class Variant(Base): score_set_id = Column("scoreset_id", Integer, ForeignKey("scoresets.id"), index=True, nullable=False) # TODO examine if delete-orphan is necessary, explore cascade - score_set: Mapped[ScoreSet] = relationship(back_populates="variants") + score_set: Mapped["ScoreSet"] = relationship(back_populates="variants") hgvs_nt = Column(String, nullable=True) hgvs_pro = Column(String, nullable=True) @@ -27,3 +30,7 @@ class Variant(Base): creation_date = Column(Date, nullable=False, default=date.today) modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today) + + mapped_variants: Mapped[List["MappedVariant"]] = relationship( + back_populates="variant", cascade="all, delete-orphan" + ) diff --git a/src/mavedb/routers/score_sets.py b/src/mavedb/routers/score_sets.py index 65e83c0f2..db0a58c3d 100644 --- a/src/mavedb/routers/score_sets.py +++ 
b/src/mavedb/routers/score_sets.py @@ -1,6 +1,6 @@ import logging from datetime import date -from typing import Any, List, Optional +from typing import Any, List, Optional, Sequence, Union import pandas as pd import pydantic @@ -38,8 +38,7 @@ from mavedb.lib.score_sets import ( csv_data_to_df, find_meta_analyses_for_experiment_sets, - get_score_set_counts_as_csv, - get_score_set_scores_as_csv, + get_score_set_variants_as_csv, variants_to_csv_rows, ) from mavedb.lib.score_sets import ( @@ -53,6 +52,7 @@ generate_experiment_urn, generate_score_set_urn, ) +from mavedb.models.clinical_control import ClinicalControl from mavedb.models.contributor import Contributor from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.enums.user_role import UserRole @@ -64,7 +64,7 @@ from mavedb.models.target_gene import TargetGene from mavedb.models.target_sequence import TargetSequence from mavedb.models.variant import Variant -from mavedb.view_models import mapped_variant, score_set, calibration +from mavedb.view_models import mapped_variant, score_set, calibration, clinical_control from mavedb.view_models.search import ScoreSetsSearch logger = logging.getLogger(__name__) @@ -257,7 +257,7 @@ def get_score_set_scores_csv( assert_permission(user_data, score_set, Action.READ) - csv_str = get_score_set_scores_as_csv(db, score_set, start, limit, drop_na_columns) + csv_str = get_score_set_variants_as_csv(db, score_set, "scores", start, limit, drop_na_columns) return StreamingResponse(iter([csv_str]), media_type="text/csv") @@ -312,7 +312,7 @@ async def get_score_set_counts_csv( assert_permission(user_data, score_set, Action.READ) - csv_str = get_score_set_counts_as_csv(db, score_set, start, limit, drop_na_columns) + csv_str = get_score_set_variants_as_csv(db, score_set, "counts", start, limit, drop_na_columns) return StreamingResponse(iter([csv_str]), media_type="text/csv") @@ -1143,3 +1143,136 @@ async def publish_score_set( ) return item + + +@router.get( + 
"/score-sets/{urn}/clinical-controls", + status_code=200, + response_model=list[clinical_control.ClinicalControlWithMappedVariants], + response_model_exclude_none=True, +) +async def get_clinical_controls_for_score_set( + *, + urn: str, + # We'd prefer to reserve `db` as a query parameter. + _db: Session = Depends(deps.get_db), + user_data: UserData = Depends(get_current_user), + db: Optional[str] = None, + version: Optional[str] = None, +) -> Sequence[ClinicalControl]: + """ + Fetch relevant clinical controls for a given score set. + """ + save_to_logging_context({"requested_resource": urn, "resource_property": "clinical_controls"}) + + # Rename user facing kwargs for consistency with code base naming conventions. My-py doesn't care for us redefining db. + db_name = db + db_version = version + + item: Optional[ScoreSet] = _db.scalars(select(ScoreSet).where(ScoreSet.urn == urn)).one_or_none() + if not item: + logger.info( + msg="Failed to fetch clinical controls for score set; The requested score set does not exist.", + extra=logging_context(), + ) + raise HTTPException(status_code=404, detail=f"score set with URN '{urn}' not found") + + assert_permission(user_data, item, Action.READ) + + clinical_controls_query = ( + select(ClinicalControl) + .join(MappedVariant, ClinicalControl.mapped_variants) + .join(Variant) + .where(Variant.score_set_id == item.id) + ) + + if db_name is not None: + save_to_logging_context({"db_name": db_name}) + clinical_controls_query = clinical_controls_query.where(ClinicalControl.db_name == db_name) + + if db_version is not None: + save_to_logging_context({"db_version": db_version}) + clinical_controls_query = clinical_controls_query.where(ClinicalControl.db_version == db_version) + + clinical_controls_for_item: Sequence[ClinicalControl] = _db.scalars(clinical_controls_query).all() + clinical_controls_with_mapped_variant = [] + for control_variant in clinical_controls_for_item: + control_variant.mapped_variants = [ + mv for mv in 
control_variant.mapped_variants if mv.current and mv.variant.score_set_id == item.id + ] + + if control_variant.mapped_variants: + clinical_controls_with_mapped_variant.append(control_variant) + + if not clinical_controls_with_mapped_variant: + logger.info( + msg="No clinical control variants matching the provided filters are associated with the requested score set.", + extra=logging_context(), + ) + raise HTTPException( + status_code=404, + detail=f"No clinical control variants matching the provided filters associated with score set URN {urn} were found", + ) + + save_to_logging_context({"resource_count": len(clinical_controls_for_item)}) + + return clinical_controls_for_item + + +@router.get( + "/score-sets/{urn}/clinical-controls/options", + status_code=200, + response_model=list[clinical_control.ClinicalControlOptions], + response_model_exclude_none=True, +) +async def get_clinical_controls_options_for_score_set( + *, + urn: str, + # We'd prefer to reserve `db` as a query parameter. + db: Session = Depends(deps.get_db), + user_data: UserData = Depends(get_current_user), +) -> list[dict[str, Union[str, list[str]]]]: + """ + Fetch clinical control options for a given score set. 
+ """ + save_to_logging_context({"requested_resource": urn, "resource_property": "clinical_control_options"}) + + item: Optional[ScoreSet] = db.scalars(select(ScoreSet).where(ScoreSet.urn == urn)).one_or_none() + if not item: + logger.info( + msg="Failed to fetch clinical control options for score set; The requested score set does not exist.", + extra=logging_context(), + ) + raise HTTPException(status_code=404, detail=f"score set with URN '{urn}' not found") + + assert_permission(user_data, item, Action.READ) + + clinical_controls_query = ( + select(ClinicalControl.db_name, ClinicalControl.db_version) + .join(MappedVariant, ClinicalControl.mapped_variants) + .join(Variant) + .where(Variant.score_set_id == item.id) + ) + + clinical_controls_for_item = db.execute(clinical_controls_query).unique() + + # NOTE: We return options even for pairwise groupings which may have no associated mapped variants + # and 404 when ultimately requested together. + clinical_control_options: dict[str, list[str]] = {} + for db_name, db_version in clinical_controls_for_item: + clinical_control_options.setdefault(db_name, []).append(db_version) + + if not clinical_control_options: + logger.info( + msg="Failed to fetch clinical control options for score set; No clinical control variants are associated with this score set.", + extra=logging_context(), + ) + raise HTTPException( + status_code=404, + detail=f"no clinical control variants associated with score set URN {urn} were found", + ) + + return [ + dict(zip(("db_name", "available_versions"), (db_name, db_versions))) + for db_name, db_versions in clinical_control_options.items() + ] diff --git a/src/mavedb/routers/variants.py b/src/mavedb/routers/variants.py new file mode 100644 index 000000000..853ebedfd --- /dev/null +++ b/src/mavedb/routers/variants.py @@ -0,0 +1,79 @@ +import logging + +from fastapi import APIRouter, Depends +from fastapi.exceptions import HTTPException +from mavedb.lib.authentication import UserData, 
router = APIRouter(
    # Bug fix: tags were copy-pasted from the access-keys router; this router
    # serves variant endpoints. Affects OpenAPI grouping only.
    prefix="/api/v1", tags=["variants"], responses={404: {"description": "Not found"}}, route_class=LoggedRoute
)

logger = logging.getLogger(__name__)


@router.post("/variants/clingen-allele-id-lookups", response_model=list[list[VariantWithShortScoreSet]])
def lookup_variants(
    *,
    request: ClingenAlleleIdVariantLookupsRequest,
    db: Session = Depends(deps.get_db),
    user_data: UserData = Depends(get_current_user),
):
    """
    Look up variants by ClinGen allele ID.

    Returns one list per requested allele ID, in request order; each list holds
    the matching variants the caller is permitted to read (IDs with no readable
    variants yield an empty list).
    """
    variants = db.execute(
        select(Variant, MappedVariant.clingen_allele_id)
        .join(MappedVariant)
        .options(joinedload(Variant.score_set).joinedload(ScoreSet.experiment))
        .where(MappedVariant.clingen_allele_id.in_(request.clingen_allele_ids))
    ).all()

    variants_by_allele_id: dict[str, list[Variant]] = {allele_id: [] for allele_id in request.clingen_allele_ids}

    for variant, allele_id in variants:
        # Only surface variants from score sets the requesting user may read.
        if has_permission(user_data, variant.score_set, Action.READ).permitted:
            variants_by_allele_id[allele_id].append(variant)

    return [variants_by_allele_id[allele_id] for allele_id in request.clingen_allele_ids]
@router.get(
    "/variants/{urn}",
    status_code=200,
    response_model=VariantWithScoreSet,
    responses={404: {}, 500: {}},
    response_model_exclude_none=True,
)
def get_variant(*, urn: str, db: Session = Depends(deps.get_db), user_data: UserData = Depends(get_current_user)):
    """
    Fetch a single variant by URN.

    Returns 404 when no variant matches and 500 when the URN is ambiguous.
    The caller must have read permission on the variant's score set.
    """
    save_to_logging_context({"requested_resource": urn})
    try:
        query = db.query(Variant).filter(Variant.urn == urn)
        variant = query.one_or_none()
    except MultipleResultsFound:
        # Bug fix: this log line previously said "score set" (copy-paste).
        logger.info(
            msg="Could not fetch the requested variant; Multiple such variants exist.", extra=logging_context()
        )
        raise HTTPException(status_code=500, detail=f"multiple variants with URN '{urn}' were found")

    if not variant:
        logger.info(msg="Could not fetch the requested variant; No such variant exists.", extra=logging_context())
        raise HTTPException(status_code=404, detail=f"variant with URN '{urn}' not found")

    assert_permission(user_data, variant.score_set, Action.READ)
    return variant
def submit_urns_to_clingen(db: Session, urns: Sequence[str], debug: bool) -> list[str]:
    """
    Submit mapped-variant data for the given score set URNs to the ClinGen LDH.

    Returns the URNs of every variant that was included in a submission attempt.
    Errors while processing an individual score set are logged and skipped.
    """
    ldh_service = ClinGenLdhService(url=LDH_SUBMISSION_URL)
    ldh_service.authenticate()

    submitted_entities: list = []

    if debug:
        logger.debug("Debug mode enabled. Submitting only one request to ClinGen.")
        urns = urns[:1]

    for position, urn in enumerate(urns):
        logger.info(f"Processing URN: {urn}. (Scoreset {position + 1}/{len(urns)})")

        try:
            score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == urn)).one_or_none()
            if not score_set:
                logger.warning(f"No score set found for URN: {urn}")
                continue

            logger.info(f"Submitting mapped variants to LDH service for score set with URN: {urn}")
            # Only current mappings with post-mapped data are eligible.
            variant_rows = db.execute(
                select(Variant, MappedVariant)
                .join(MappedVariant)
                .join(ScoreSet)
                .where(ScoreSet.urn == urn)
                .where(MappedVariant.post_mapped.is_not(None))
                .where(MappedVariant.current.is_(True))
            ).all()

            if not variant_rows:
                logger.warning(f"No mapped variants found for score set with URN: {urn}")
                continue

            logger.debug(f"Preparing {len(variant_rows)} mapped variants for submission")

            # One submission entry per HGVS expression derived from each mapped variant.
            content_triples: list[tuple[str, Variant, MappedVariant]] = []
            for variant, mapped_variant in variant_rows:
                expressions = hgvs_from_mapped_variant(mapped_variant)

                if not expressions:
                    logger.warning(f"No variation found for variant {variant.urn}.")
                    continue

                for allele in expressions:
                    content_triples.append((allele, variant, mapped_variant))

            if debug:
                logger.debug("Debug mode enabled. Submitting only one request to ClinGen.")
                content_triples = content_triples[:1]

            logger.debug(f"Constructing LDH submission for {len(content_triples)} variants")
            submission_content = construct_ldh_submission(content_triples)
            submission_successes, submission_failures = ldh_service.dispatch_submissions(
                submission_content, DEFAULT_LDH_SUBMISSION_BATCH_SIZE
            )

            if submission_failures:
                logger.error(f"Failed to submit some variants for URN: {urn}")
            else:
                logger.info(f"Successfully submitted all variants for URN: {urn}")

            # NOTE(review): every prepared variant is recorded here even when some
            # submissions failed above — confirm this is the intended behavior.
            submitted_entities.extend([variant.urn for _, variant, _ in content_triples])

        except Exception as e:
            logger.error(f"Error processing URN {urn}", exc_info=e)

    # TODO#372: non-nullable urns.
    return submitted_entities  # type: ignore
Please provide at least one URN.") + return + + submitted_variant_urns = submit_urns_to_clingen(db, urns, debug) + + if not suppress_output: + print(", ".join(submitted_variant_urns)) + + +if __name__ == "__main__": + submit_clingen_urns_command() diff --git a/src/mavedb/scripts/environment.py b/src/mavedb/scripts/environment.py index f773f55ff..66bdbb78b 100644 --- a/src/mavedb/scripts/environment.py +++ b/src/mavedb/scripts/environment.py @@ -61,7 +61,7 @@ def with_database_session(command=None, *, pass_action: bool = False): The *command* callable must be a :py:class:`click.Command` instance. The decorated *command* is called with a ``db`` keyword argument to provide - a :class:`~id3c.db.session.DatabaseSession` object. The call happens + a :class:`~sqlalchemy.Session` object. The call happens within an exception handler that commits or rollsback the database transaction, possibly interactively. Three new options are added to the *command* (``--dry-run``, ``--prompt``, and ``--commit``) to control this diff --git a/src/mavedb/scripts/export_public_data.py b/src/mavedb/scripts/export_public_data.py index 4a52ee808..8e3857b5c 100644 --- a/src/mavedb/scripts/export_public_data.py +++ b/src/mavedb/scripts/export_public_data.py @@ -36,7 +36,7 @@ from sqlalchemy import select from sqlalchemy.orm import lazyload, Session -from mavedb.lib.score_sets import get_score_set_counts_as_csv, get_score_set_scores_as_csv +from mavedb.lib.score_sets import get_score_set_variants_as_csv from mavedb.models.experiment import Experiment from mavedb.models.experiment_set import ExperimentSet from mavedb.models.license import License @@ -147,12 +147,12 @@ def export_public_data(db: Session): logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}") csv_filename_base = score_set.urn.replace(":", "-") - csv_str = get_score_set_scores_as_csv(db, score_set) + csv_str = get_score_set_variants_as_csv(db, score_set, "scores") 
zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str) count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None if count_columns and len(count_columns) > 0: - csv_str = get_score_set_counts_as_csv(db, score_set) + csv_str = get_score_set_variants_as_csv(db, score_set, "counts") zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str) diff --git a/src/mavedb/scripts/link_clingen_variants.py b/src/mavedb/scripts/link_clingen_variants.py new file mode 100644 index 000000000..5f81e3087 --- /dev/null +++ b/src/mavedb/scripts/link_clingen_variants.py @@ -0,0 +1,73 @@ +import click +import logging +from typing import Sequence + +from sqlalchemy import select +from sqlalchemy.orm import Session + +from mavedb.lib.clingen.linked_data_hub import get_clingen_variation, clingen_allele_id_from_ldh_variation +from mavedb.models.score_set import ScoreSet +from mavedb.models.variant import Variant +from mavedb.models.mapped_variant import MappedVariant +from mavedb.scripts.environment import with_database_session + +logger = logging.getLogger(__name__) + + +@click.command() +@with_database_session +@click.argument("urns", nargs=-1) +@click.option("--score-sets/--variants", default=False) +@click.option("--unlinked", default=False) +def link_clingen_variants(db: Session, urns: Sequence[str], score_sets: bool, unlinked: bool) -> None: + """ + Submit data to ClinGen for mapped variant allele ID generation for the given URNs. + """ + if not urns: + logger.error("No URNs provided. Please provide at least one URN.") + return + + # Convert score set URNs to variant URNs. 
@click.command()
@with_database_session
@click.argument("urns", nargs=-1)
@click.option("--score-sets/--variants", default=False)
# Bug fix: without is_flag=True this option demanded a value and delivered a
# (always truthy, non-empty) string rather than a boolean.
@click.option("--unlinked", is_flag=True, default=False)
def link_clingen_variants(db: Session, urns: Sequence[str], score_sets: bool, unlinked: bool) -> None:
    """
    Link mapped variants to ClinGen allele IDs for the given URNs.

    URNs are variant URNs by default, or score set URNs with --score-sets.
    With --unlinked, only variants without an existing ClinGen allele ID are
    considered when expanding score sets.
    """
    if not urns:
        logger.error("No URNs provided. Please provide at least one URN.")
        return

    # Convert score set URNs to variant URNs.
    if score_sets:
        query = (
            select(Variant.urn)
            .join(MappedVariant)
            .join(ScoreSet)
            .where(MappedVariant.current.is_(True), MappedVariant.post_mapped.is_not(None))
        )

        if unlinked:
            query = query.where(MappedVariant.clingen_allele_id.is_(None))

        variants = [db.scalars(query.where(ScoreSet.urn == urn)).all() for urn in urns]
        urns = [variant for sublist in variants for variant in sublist if variant is not None]

    failed_urns = []
    for urn in urns:
        ldh_variation = get_clingen_variation(urn)
        allele_id = clingen_allele_id_from_ldh_variation(ldh_variation)

        if not allele_id:
            failed_urns.append(urn)
            continue

        # Only link the *current* mapping for this variant; previously an
        # arbitrary (possibly superseded) mapped variant could be selected,
        # inconsistent with the current-only filter used above.
        mapped_variant = db.scalar(
            select(MappedVariant).join(Variant).where(Variant.urn == urn, MappedVariant.current.is_(True))
        )

        if not mapped_variant:
            logger.warning(f"No mapped variant found for URN {urn}.")
            failed_urns.append(urn)
            continue

        mapped_variant.clingen_allele_id = allele_id
        db.add(mapped_variant)

        logger.info(f"Successfully linked URN {urn} to ClinGen variation {allele_id}.")

    if failed_urns:
        logger.warning(f"Failed to link the following {len(failed_urns)} URNs: {', '.join(failed_urns)}")

    logger.info(f"Linking process completed. Linked {len(urns) - len(failed_urns)}/{len(urns)} URNs successfully.")
def fetch_clinvar_variant_summary_tsv(month: Optional[str], year: Optional[str]) -> bytes:
    """
    Download the gzipped ClinVar variant_summary TSV.

    With neither month nor year, fetches the current release; with both,
    fetches the archived release for that year-month.

    :raises ValueError: When only one of month/year is provided (previously a
        URL containing the literal string 'None' was silently constructed).
    :raises requests.HTTPError: When the download fails.
    """
    if (month is None) != (year is None):
        raise ValueError("month and year must be provided together")

    if month is None and year is None:
        url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
    else:
        url = f"https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/archive/variant_summary_{year}-{month}.txt.gz"

    response = requests.get(url, stream=True)
    response.raise_for_status()
    return response.content
def refresh_clinvar_variants(db: Session, month: Optional[str], year: Optional[str]) -> None:
    """
    Refresh ClinVar clinical-control records for mapped variants with ClinGen allele IDs.

    Downloads the (optionally archived) ClinVar variant summary, then for each
    distinct ClinGen allele ID resolves its ClinVar allele via the ClinGen
    registry and creates or updates the matching ClinicalControl row, linking
    it to every mapped variant carrying that allele ID. Commits once per
    processed allele ID.
    """
    tsv_content = fetch_clinvar_variant_summary_tsv(month, year)
    tsv_data = parse_tsv(tsv_content)
    version = f"{month}_{year}" if month and year else f"{date.today().month}_{date.today().year}"
    logger.info(f"Fetched TSV variant data for ClinVar for {version}.")

    total_variants_with_clingen_ids = db.scalar(func.count(distinct(MappedVariant.clingen_allele_id)))
    clingen_ids = db.scalars(
        select(distinct(MappedVariant.clingen_allele_id)).where(MappedVariant.clingen_allele_id.is_not(None))
    ).all()

    logger.info(f"Fetching ClinGen data for {total_variants_with_clingen_ids} variants.")
    # Log progress roughly every 1%. Bug fix: with fewer than 100 allele IDs,
    # total // 100 is 0 and the original modulo raised ZeroDivisionError.
    progress_step = max(total_variants_with_clingen_ids // 100, 1) if total_variants_with_clingen_ids else 1
    for index, clingen_id in enumerate(clingen_ids):
        if total_variants_with_clingen_ids > 0 and index % progress_step == 0:
            logger.info(f"Progress: {index / total_variants_with_clingen_ids:.0%}")

        # clingen_id is guaranteed non-None by the query filter above.
        clingen_data = query_clingen_allele_api(clingen_id)  # type: ignore
        clinvar_allele_id = clingen_data.get("externalRecords", {}).get("ClinVarAlleles", [{}])[0].get("alleleId")

        # NOTE(review): tsv_data keys are ints (parse_tsv casts "#AlleleID");
        # confirm the registry returns alleleId as an int, not a string, or this
        # membership test will always be False.
        if not clinvar_allele_id or clinvar_allele_id not in tsv_data:
            logger.debug(
                f"No ClinVar variant data found for ClinGen allele ID {clingen_id}. ({index + 1}/{total_variants_with_clingen_ids})."
            )
            continue

        variant_data = tsv_data[clinvar_allele_id]
        # NOTE(review): the ClinicalControl columns below are non-nullable, but
        # .get() may return None if the TSV row is missing a field — confirm the
        # upstream data always provides these columns.
        clinvar_variant = db.scalars(
            select(ClinicalControl).where(
                ClinicalControl.db_identifier == clinvar_allele_id,
                ClinicalControl.db_version == version,
                ClinicalControl.db_name == "ClinVar",
            )
        ).one_or_none()
        if clinvar_variant:
            clinvar_variant.gene_symbol = variant_data.get("GeneSymbol")
            clinvar_variant.clinical_significance = variant_data.get("ClinicalSignificance")
            clinvar_variant.clinical_review_status = variant_data.get("ReviewStatus")
        else:
            clinvar_variant = ClinicalControl(
                db_identifier=clinvar_allele_id,
                gene_symbol=variant_data.get("GeneSymbol"),
                clinical_significance=variant_data.get("ClinicalSignificance"),
                clinical_review_status=variant_data.get("ReviewStatus"),
                db_version=version,
                db_name="ClinVar",
            )

        db.add(clinvar_variant)

        variants_with_clingen_allele_id = db.scalars(
            select(MappedVariant).where(MappedVariant.clingen_allele_id == clingen_id)
        ).all()
        for mapped_variant in variants_with_clingen_allele_id:
            mapped_variant.clinical_controls.append(clinvar_variant)
            db.add(mapped_variant)

        db.commit()
        logger.debug(
            f"Added ClinVar variant data ({clinvar_allele_id}) for ClinGen allele ID {clingen_id}. ({index + 1}/{total_variants_with_clingen_ids})."
        )
+ ) + + +@click.command() +@with_database_session +@click.option("--month", default=None, help="Populate mapped variants for every score set in MaveDB.") +@click.option("--year", default=None, help="Populate mapped variants for every score set in MaveDB.") +def refresh_clinvar_variants_command(db: Session, month: Optional[str], year: Optional[str]) -> None: + refresh_clinvar_variants(db, month, year) + + +if __name__ == "__main__": + refresh_clinvar_variants_command() diff --git a/src/mavedb/server_main.py b/src/mavedb/server_main.py index b0e966cf7..5f49a35c7 100644 --- a/src/mavedb/server_main.py +++ b/src/mavedb/server_main.py @@ -32,7 +32,7 @@ save_to_logging_context, ) from mavedb.lib.permissions import PermissionException -from mavedb.lib.slack import send_slack_message +from mavedb.lib.slack import send_slack_error from mavedb.models import * # noqa: F403 from mavedb.routers import ( access_keys, @@ -55,6 +55,7 @@ target_genes, taxonomies, users, + variants, ) logger = logging.getLogger(__name__) @@ -100,6 +101,7 @@ app.include_router(target_genes.router) app.include_router(taxonomies.router) app.include_router(users.router) +app.include_router(variants.router) @app.exception_handler(PermissionException) @@ -174,7 +176,7 @@ async def exception_handler(request, err): try: logger.error(msg="Uncaught exception.", extra=logging_context(), exc_info=err) - send_slack_message(err=err, request=request) + send_slack_error(err=err, request=request) finally: log_request(request, response, time.time_ns()) diff --git a/src/mavedb/view_models/clinical_control.py b/src/mavedb/view_models/clinical_control.py new file mode 100644 index 000000000..ed9f1dd23 --- /dev/null +++ b/src/mavedb/view_models/clinical_control.py @@ -0,0 +1,64 @@ +# See https://pydantic-docs.helpmanual.io/usage/postponed_annotations/#self-referencing-models +from __future__ import annotations + +from datetime import date +from typing import Optional, Sequence + +from mavedb.view_models import 
record_type_validator, set_record_type +from mavedb.view_models.base.base import BaseModel + + +class ClinicalControlBase(BaseModel): + db_identifier: str + gene_symbol: str + clinical_significance: str + clinical_review_status: str + db_version: str + db_name: str + + +class ClinicalControlUpdate(ClinicalControlBase): + mapped_variants: Optional[list[MappedVariantCreate]] = None + + +class ClinicalControlCreate(ClinicalControlUpdate): + pass + + +# Properties shared by models stored in DB +class SavedClinicalControl(ClinicalControlBase): + id: int + modification_date: date + creation_date: date + + record_type: str = None # type: ignore + _record_type_factory = record_type_validator()(set_record_type) + + class Config: + orm_mode = True + + +class SavedClinicalControlWithMappedVariants(SavedClinicalControl): + mapped_variants: Sequence[SavedMappedVariant] + + +# Properties to return to non-admin clients +class ClinicalControl(SavedClinicalControl): + pass + + +class ClinicalControlWithMappedVariants(SavedClinicalControlWithMappedVariants): + mapped_variants: Sequence[MappedVariant] + + +class ClinicalControlOptions(BaseModel): + db_name: str + available_versions: list[str] + + +# ruff: noqa: E402 +from mavedb.view_models.mapped_variant import MappedVariant, SavedMappedVariant, MappedVariantCreate + +ClinicalControlCreate.update_forward_refs() +SavedClinicalControlWithMappedVariants.update_forward_refs() +ClinicalControlWithMappedVariants.update_forward_refs() diff --git a/src/mavedb/view_models/mapped_variant.py b/src/mavedb/view_models/mapped_variant.py index 397084de9..6b3a3d97d 100644 --- a/src/mavedb/view_models/mapped_variant.py +++ b/src/mavedb/view_models/mapped_variant.py @@ -1,5 +1,8 @@ +# See https://pydantic-docs.helpmanual.io/usage/postponed_annotations/#self-referencing-models +from __future__ import annotations + from datetime import date -from typing import Any, Optional +from typing import Any, Optional, Sequence from mavedb.view_models import 
record_type_validator, set_record_type from mavedb.view_models.base.base import BaseModel @@ -8,7 +11,7 @@ class MappedVariantBase(BaseModel): pre_mapped: Optional[Any] post_mapped: Optional[Any] - variant_id: int + variant_urn: str vrs_version: Optional[str] error_message: Optional[str] modification_date: date @@ -16,26 +19,48 @@ class MappedVariantBase(BaseModel): mapping_api_version: str current: bool - -class MappedVariantCreate(MappedVariantBase): - pass + @classmethod + def from_orm(cls, obj: Any): + obj.variant_urn = obj.variant.urn + return super().from_orm(obj) class MappedVariantUpdate(MappedVariantBase): + clinical_controls: Sequence[ClinicalControlBase] + + +class MappedVariantCreate(MappedVariantUpdate): pass # Properties shared by models stored in DB class SavedMappedVariant(MappedVariantBase): id: int - record_type: str = None # type: ignore + clingen_allele_id: Optional[str] + record_type: str = None # type: ignore _record_type_factory = record_type_validator()(set_record_type) class Config: orm_mode = True +class SavedMappedVariantWithControls(SavedMappedVariant): + clinical_controls: Sequence[SavedClinicalControl] + + # Properties to return to non-admin clients class MappedVariant(SavedMappedVariant): pass + + +class MappedVariantWithControls(SavedMappedVariantWithControls): + clinical_controls: Sequence[ClinicalControl] + + +# ruff: noqa: E402 +from mavedb.view_models.clinical_control import ClinicalControlBase, ClinicalControl, SavedClinicalControl + +MappedVariantCreate.update_forward_refs() +SavedMappedVariantWithControls.update_forward_refs() +MappedVariantWithControls.update_forward_refs() diff --git a/src/mavedb/view_models/score_set.py b/src/mavedb/view_models/score_set.py index 8bc19c2d5..30b93d286 100644 --- a/src/mavedb/view_models/score_set.py +++ b/src/mavedb/view_models/score_set.py @@ -34,7 +34,6 @@ TargetGeneCreate, ) from mavedb.view_models.user import SavedUser, User -from mavedb.view_models.variant import VariantInDbBase class 
ExternalLink(BaseModel): @@ -441,7 +440,7 @@ class ScoreSetWithVariants(ScoreSet): are requested. """ - variants: list[VariantInDbBase] + variants: list[SavedVariant] class AdminScoreSet(ScoreSet): @@ -469,6 +468,8 @@ class ScoreSetPublicDump(SavedScoreSet): # ruff: noqa: E402 from mavedb.view_models.experiment import Experiment +from mavedb.view_models.variant import SavedVariant ShortScoreSet.update_forward_refs() ScoreSet.update_forward_refs() +ScoreSetWithVariants.update_forward_refs() diff --git a/src/mavedb/view_models/variant.py b/src/mavedb/view_models/variant.py index 830bdd5c2..f7c713497 100644 --- a/src/mavedb/view_models/variant.py +++ b/src/mavedb/view_models/variant.py @@ -1,6 +1,7 @@ from datetime import date from typing import Any +from mavedb.view_models.mapped_variant import MappedVariant, SavedMappedVariant from pydantic.types import Optional from mavedb.view_models import record_type_validator, set_record_type @@ -8,6 +9,8 @@ class VariantBase(BaseModel): + """Properties shared by most variant view models""" + urn: Optional[str] data: Any score_set_id: int @@ -19,15 +22,20 @@ class VariantBase(BaseModel): class VariantCreate(VariantBase): + """Input view model for creating variants""" + pass class VariantUpdate(VariantBase): + """Input view model for updating variants""" + pass -# Properties shared by models stored in DB -class VariantInDbBase(VariantBase): +class SavedVariant(VariantBase): + """Base class for variant view models handling saved variants""" + id: int record_type: str = None # type: ignore @@ -37,11 +45,50 @@ class Config: orm_mode = True -# Properties to return to client -class Variant(VariantInDbBase): - pass +class SavedVariantWithMappedVariant(SavedVariant): + """Class for saved variant with any associated mapped variants""" + mapped_variant: Optional[SavedMappedVariant] + + @classmethod + def from_orm(cls, obj: Any): + try: + obj.mapped_variant = next( + mapped_variant for mapped_variant in obj.mapped_variants if 
mapped_variant.current + ) + except (AttributeError, StopIteration): + obj.mapped_variant = None + return super().from_orm(obj) + + +class Variant(SavedVariant): + """Variant view model returned to most clients""" -# Properties stored in DB -class VariantInDb(VariantInDbBase): pass + + +class VariantWithScoreSet(SavedVariant): + """Variant view model with mapped variants and score set""" + + score_set: "ScoreSet" + mapped_variants: list[MappedVariant] + + +class VariantWithShortScoreSet(SavedVariant): + """Variant view model with mapped variants and a limited set of score set details""" + + score_set: "ShortScoreSet" + mapped_variants: list[MappedVariant] + + +class ClingenAlleleIdVariantLookupsRequest(BaseModel): + """A request to search for variants matching a list of ClinGen allele IDs""" + + clingen_allele_ids: list[str] + + +# ruff: noqa: E402 +from mavedb.view_models.score_set import ScoreSet, ShortScoreSet + +VariantWithScoreSet.update_forward_refs() +VariantWithShortScoreSet.update_forward_refs() diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 2219a496c..d30064ff7 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -15,18 +15,36 @@ from mavedb.data_providers.services import vrs_mapper from mavedb.db.view import refresh_all_mat_views -from mavedb.lib.exceptions import MappingEnqueueError, NonexistentMappingReferenceError, NonexistentMappingResultsError +from mavedb.lib.clingen.constants import ( + DEFAULT_LDH_SUBMISSION_BATCH_SIZE, + LDH_SUBMISSION_URL, + LINKED_DATA_RETRY_THRESHOLD, +) +from mavedb.lib.clingen.content_constructors import construct_ldh_submission +from mavedb.lib.clingen.linked_data_hub import ( + ClinGenLdhService, + get_clingen_variation, + clingen_allele_id_from_ldh_variation, +) +from mavedb.lib.exceptions import ( + MappingEnqueueError, + SubmissionEnqueueError, + LinkingEnqueueError, + NonexistentMappingReferenceError, + NonexistentMappingResultsError, +) from mavedb.lib.logging.context 
import format_raised_exception_info_as_dict from mavedb.lib.score_sets import ( columns_for_dataset, create_variants, create_variants_data, ) -from mavedb.lib.slack import send_slack_message +from mavedb.lib.slack import send_slack_error, send_slack_message from mavedb.lib.validation.dataframe import ( validate_and_standardize_dataframe_pair, ) from mavedb.lib.validation.exceptions import ValidationError +from mavedb.lib.variants import hgvs_from_mapped_variant from mavedb.models.enums.mapping_state import MappingState from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.mapped_variant import MappedVariant @@ -40,16 +58,13 @@ MAPPING_QUEUE_NAME = "vrs_mapping_queue" MAPPING_CURRENT_ID_NAME = "vrs_mapping_current_job_id" BACKOFF_LIMIT = 5 -BACKOFF_IN_SECONDS = 15 +MAPPING_BACKOFF_IN_SECONDS = 15 +LINKING_BACKOFF_IN_SECONDS = 15 * 60 -@asynccontextmanager -async def mapping_in_execution(redis: ArqRedis, job_id: str): - await redis.set(MAPPING_CURRENT_ID_NAME, job_id) - try: - yield - finally: - await redis.set(MAPPING_CURRENT_ID_NAME, "") +#################################################################################################### +# Job utilities +#################################################################################################### def setup_job_state( @@ -65,14 +80,13 @@ def setup_job_state( async def enqueue_job_with_backoff( - redis: ArqRedis, job_name: str, attempt: int, *args + redis: ArqRedis, job_name: str, attempt: int, backoff: int, *args ) -> tuple[Optional[str], bool, Any]: new_job_id = None - backoff = None limit_reached = attempt > BACKOFF_LIMIT if not limit_reached: limit_reached = True - backoff = BACKOFF_IN_SECONDS * (2**attempt) + backoff = backoff * (2**attempt) attempt = attempt + 1 # NOTE: for jobs supporting backoff, `attempt` should be the final argument. 
@@ -89,6 +103,11 @@ async def enqueue_job_with_backoff( return (new_job_id, not limit_reached, backoff) +#################################################################################################### +# Creating variants +#################################################################################################### + + async def create_variants_for_score_set( ctx, correlation_id: str, score_set_id: int, updater_id: int, scores: pd.DataFrame, counts: pd.DataFrame ): @@ -181,7 +200,7 @@ async def create_variants_for_score_set( logging_context["created_variants"] = 0 logger.warning(msg="Encountered an internal exception while processing variants.", extra=logging_context) - send_slack_message(err=e) + send_slack_error(err=e) return {"success": False} # Catch all other exceptions. The exceptions caught here were intented to be system exiting. @@ -223,6 +242,20 @@ async def create_variants_for_score_set( return {"success": True} +#################################################################################################### +# Mapping variants +#################################################################################################### + + +@asynccontextmanager +async def mapping_in_execution(redis: ArqRedis, job_id: str): + await redis.set(MAPPING_CURRENT_ID_NAME, job_id) + try: + yield + finally: + await redis.set(MAPPING_CURRENT_ID_NAME, "") + + async def map_variants_for_score_set( ctx: dict, correlation_id: str, score_set_id: int, updater_id: int, attempt: int = 1 ) -> dict: @@ -256,7 +289,7 @@ async def map_variants_for_score_set( loop = asyncio.get_running_loop() except Exception as e: - send_slack_message(e) + send_slack_error(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logger.error( msg="Variant mapper encountered an unexpected error during setup. 
This job will not be retried.", @@ -285,7 +318,7 @@ async def map_variants_for_score_set( db.add(score_set) db.commit() - send_slack_message(e) + send_slack_error(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logger.warning( msg="Variant mapper encountered an unexpected error while mapping variants. This job will be retried.", @@ -297,7 +330,7 @@ async def map_variants_for_score_set( try: await redis.lpush(MAPPING_QUEUE_NAME, score_set.id) # type: ignore new_job_id, max_retries_exceeded, backoff_time = await enqueue_job_with_backoff( - redis, "variant_mapper_manager", attempt, correlation_id, updater_id + redis, "variant_mapper_manager", attempt, MAPPING_BACKOFF_IN_SECONDS, correlation_id, updater_id ) # If we fail to enqueue a mapping manager for this score set, evict it from the queue. if new_job_id is None: @@ -312,7 +345,7 @@ async def map_variants_for_score_set( score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} db.add(score_set) db.commit() - send_slack_message(backoff_e) + send_slack_error(backoff_e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} logger.critical( msg="While attempting to re-enqueue a mapping job that exited in error, another exception was encountered. This score set will not be mapped.", @@ -468,7 +501,7 @@ async def map_variants_for_score_set( db.add(score_set) db.commit() - send_slack_message(e) + send_slack_error(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logger.warning( msg="An unexpected error occurred during variant mapping. 
This job will be attempted again.", @@ -480,7 +513,7 @@ async def map_variants_for_score_set( try: await redis.lpush(MAPPING_QUEUE_NAME, score_set.id) # type: ignore new_job_id, max_retries_exceeded, backoff_time = await enqueue_job_with_backoff( - redis, "variant_mapper_manager", attempt, correlation_id, updater_id + redis, "variant_mapper_manager", attempt, MAPPING_BACKOFF_IN_SECONDS, correlation_id, updater_id ) # If we fail to enqueue a mapping manager for this score set, evict it from the queue. if new_job_id is None: @@ -493,7 +526,7 @@ async def map_variants_for_score_set( except Exception as backoff_e: score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} - send_slack_message(backoff_e) + send_slack_error(backoff_e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} logger.critical( msg="While attempting to re-enqueue a mapping job that exited in error, another exception was encountered. This score set will not be mapped.", @@ -525,8 +558,38 @@ async def map_variants_for_score_set( db.commit() return {"success": False, "retried": (not max_retries_exceeded and new_job_id is not None)} + new_job_id = None + try: + new_job = await redis.enqueue_job( + "submit_score_set_mappings_to_ldh", + correlation_id, + score_set.id, + ) + + if new_job: + new_job_id = new_job.job_id + + logging_context["submit_clingen_variants_job_id"] = new_job_id + logger.info(msg="Queued a new ClinGen submission job.", extra=logging_context) + + else: + raise SubmissionEnqueueError() + + except Exception as e: + send_slack_error(e) + send_slack_message( + f"Could not submit mappings to LDH for score set {score_set.urn}. Mappings for this score set should be submitted manually." 
+ ) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="Mapped variant ClinGen submission encountered an unexpected error while attempting to enqueue a submission job. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": new_job_id} + ctx["state"][ctx["job_id"]] = logging_context.copy() - return {"success": True, "retried": False} + return {"success": True, "retried": False, "enqueued_job": new_job_id} async def variant_mapper_manager(ctx: dict, correlation_id: str, updater_id: int, attempt: int = 1) -> dict: @@ -569,7 +632,7 @@ async def variant_mapper_manager(ctx: dict, correlation_id: str, updater_id: int logging_context["existing_mapping_job_id"] = mapping_job_id except Exception as e: - send_slack_message(e) + send_slack_error(e) # Attempt to remove this item from the mapping queue. try: @@ -629,7 +692,7 @@ async def variant_mapper_manager(ctx: dict, correlation_id: str, updater_id: int raise MappingEnqueueError() except Exception as e: - send_slack_message(e) + send_slack_error(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logger.error( msg="Variant mapper manager encountered an unexpected error while enqueing a mapping job. This job will not be retried.", @@ -659,6 +722,11 @@ async def variant_mapper_manager(ctx: dict, correlation_id: str, updater_id: int return {"success": False, "enqueued_job": new_job_id} +#################################################################################################### +# Materialized Views +#################################################################################################### + + # TODO#405: Refresh materialized views within an executor. 
async def refresh_materialized_views(ctx: dict): logging_context = setup_job_state(ctx, None, None, None) @@ -674,3 +742,401 @@ async def refresh_published_variants_view(ctx: dict, correlation_id: str): PublishedVariantsMV.refresh(ctx["db"]) logger.debug(msg="Done refreshing of published variants materialized view.", extra=logging_context) return {"success": True} + + +#################################################################################################### +# ClinGen resource creation / linkage +#################################################################################################### + + +async def submit_score_set_mappings_to_ldh(ctx: dict, correlation_id: str, score_set_id: int): + logging_context = {} + score_set = None + text = ( + "Could not submit mappings to LDH for score set %s. Mappings for this score set should be submitted manually." + ) + try: + db: Session = ctx["db"] + redis: ArqRedis = ctx["redis"] + score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one() + + logging_context = setup_job_state(ctx, None, score_set.urn, correlation_id) + logger.info(msg="Started LDH mapped resource submission", extra=logging_context) + + submission_urn = score_set.urn + assert submission_urn, "A valid URN is needed to submit LDH objects for this score set." + + logging_context["current_ldh_submission_resource"] = submission_urn + logger.debug(msg="Fetched score set metadata for ldh mapped resource submission.", extra=logging_context) + + except Exception as e: + send_slack_error(e) + if score_set: + send_slack_message(text=text % score_set.urn) + else: + send_slack_message(text=text % score_set_id) + + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource submission encountered an unexpected error during setup. 
This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + try: + ldh_service = ClinGenLdhService(url=LDH_SUBMISSION_URL) + ldh_service.authenticate() + except Exception as e: + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource submission encountered an unexpected error while attempting to authenticate to the LDH. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + try: + variant_objects = db.execute( + select(Variant, MappedVariant) + .join(MappedVariant) + .join(ScoreSet) + .where(ScoreSet.urn == score_set.urn) + .where(MappedVariant.post_mapped.is_not(None)) + .where(MappedVariant.current.is_(True)) + ).all() + + if not variant_objects: + logger.warning( + msg="No current mapped variants with post mapped metadata were found for this score set. Skipping LDH submission.", + extra=logging_context, + ) + return {"success": True, "retried": False, "enqueued_job": None} + + variant_content = [] + for variant, mapped_variant in variant_objects: + for variation in hgvs_from_mapped_variant(mapped_variant): + variant_content.append((variation, variant, mapped_variant)) + + submission_content = construct_ldh_submission(variant_content) + + except Exception as e: + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource submission encountered an unexpected error while attempting to construct submission objects. 
This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + try: + blocking = functools.partial( + ldh_service.dispatch_submissions, submission_content, DEFAULT_LDH_SUBMISSION_BATCH_SIZE + ) + loop = asyncio.get_running_loop() + submission_successes, submission_failures = await loop.run_in_executor(ctx["pool"], blocking) + + except Exception as e: + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource submission encountered an unexpected error while dispatching submissions. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + try: + assert not submission_failures, f"{len(submission_failures)} submissions failed to be dispatched to the LDH." + logger.info(msg="Dispatched all variant mapping submissions to the LDH.", extra=logging_context) + except AssertionError as e: + send_slack_error(e) + send_slack_message( + text=f"{len(submission_failures)} submissions failed to be dispatched to the LDH for score set {score_set.urn}." + ) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource submission failed to submit all mapping resources. 
This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + new_job_id = None + try: + new_job = await redis.enqueue_job( + "link_clingen_variants", + correlation_id, + score_set.id, + 1, + _defer_by=timedelta(seconds=LINKING_BACKOFF_IN_SECONDS), + ) + + if new_job: + new_job_id = new_job.job_id + + logging_context["link_clingen_variants_job_id"] = new_job_id + logger.info(msg="Queued a new ClinGen linking job.", extra=logging_context) + + else: + raise LinkingEnqueueError() + + except Exception as e: + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource submission encountered an unexpected error while attempting to enqueue a linking job. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": new_job_id} + + return {"success": True, "retried": False, "enqueued_job": new_job_id} + + +def do_clingen_fetch(variant_urns): + return [(variant_urn, get_clingen_variation(variant_urn)) for variant_urn in variant_urns] + + +async def link_clingen_variants(ctx: dict, correlation_id: str, score_set_id: int, attempt: int) -> dict: + logging_context = {} + score_set = None + text = "Could not link mappings to LDH for score set %s. Mappings for this score set should be linked manually." 
+ try: + db: Session = ctx["db"] + score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one() + + logging_context = setup_job_state(ctx, None, score_set.urn, correlation_id) + logging_context["linkage_retry_threshold"] = LINKED_DATA_RETRY_THRESHOLD + logging_context["attempt"] = attempt + logging_context["max_attempts"] = BACKOFF_LIMIT + logger.info(msg="Started LDH mapped resource linkage", extra=logging_context) + + submission_urn = score_set.urn + assert submission_urn, "A valid URN is needed to link LDH objects for this score set." + + logging_context["current_ldh_linking_resource"] = submission_urn + logger.debug(msg="Fetched score set metadata for ldh mapped resource linkage.", extra=logging_context) + + except Exception as e: + send_slack_error(e) + if score_set: + send_slack_message(text=text % score_set.urn) + else: + send_slack_message(text=text % score_set_id) + + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource linkage encountered an unexpected error during setup. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + try: + variant_urns = db.scalars( + select(Variant.urn) + .join(MappedVariant) + .join(ScoreSet) + .where( + ScoreSet.urn == score_set.urn, MappedVariant.current.is_(True), MappedVariant.post_mapped.is_not(None) + ) + ).all() + num_variant_urns = len(variant_urns) + + logging_context["variants_to_link_ldh"] = submission_urn + + if not variant_urns: + logger.warning( + msg="No current mapped variants with post mapped metadata were found for this score set. Skipping LDH linkage (nothing to do).", + extra=logging_context, + ) + + return {"success": True, "retried": False, "enqueued_job": None} + + logger.info( + msg="Found current mapped variants with post mapped metadata for this score set. 
Attempting to link them to LDH submissions.", + extra=logging_context, + ) + + except Exception as e: + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource linkage encountered an unexpected error while attempting to build linkage urn list. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + try: + logger.info(msg="Attempting to link mapped variants to LDH submissions.", extra=logging_context) + + # TODO#372: Non-nullable variant urns. + blocking = functools.partial( + do_clingen_fetch, + variant_urns, # type: ignore + ) + loop = asyncio.get_running_loop() + linked_data = await loop.run_in_executor(ctx["pool"], blocking) + + except Exception as e: + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource linkage encountered an unexpected error while attempting to link LDH submissions. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + try: + linked_allele_ids = [ + (variant_urn, clingen_allele_id_from_ldh_variation(clingen_variation)) + for variant_urn, clingen_variation in linked_data + ] + + linkage_failures = [] + for variant_urn, ldh_variation in linked_allele_ids: + # XXX: Should we unlink variation if it is not found? Does this constitute a failure? + if not ldh_variation: + logger.warning( + msg=f"Failed to link mapped variant {variant_urn} to LDH submission. 
No LDH variation found.", + extra=logging_context, + ) + linkage_failures.append(variant_urn) + continue + + mapped_variant = db.scalars( + select(MappedVariant).join(Variant).where(Variant.urn == variant_urn, MappedVariant.current.is_(True)) + ).one_or_none() + + if not mapped_variant: + logger.warning( + msg=f"Failed to link mapped variant {variant_urn} to LDH submission. No mapped variant found.", + extra=logging_context, + ) + linkage_failures.append(variant_urn) + continue + + mapped_variant.clingen_allele_id = ldh_variation + db.add(mapped_variant) + + db.commit() + + except Exception as e: + db.rollback() + + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource linkage encountered an unexpected error while attempting to link LDH submissions. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + try: + num_linkage_failures = len(linkage_failures) + ratio_failed_linking = round(num_linkage_failures / num_variant_urns, 3) + logging_context["linkage_failure_rate"] = ratio_failed_linking + logging_context["linkage_failures"] = num_linkage_failures + logging_context["linkage_successes"] = num_variant_urns - num_linkage_failures + + assert ( + len(linked_allele_ids) == num_variant_urns + ), f"{num_variant_urns - len(linked_allele_ids)} appear to not have been attempted to be linked." 
+ + if not linkage_failures: + logger.info( + msg="Successfully linked all mapped variants to LDH submissions.", + extra=logging_context, + ) + return {"success": True, "retried": False, "enqueued_job": None} + + if ratio_failed_linking < LINKED_DATA_RETRY_THRESHOLD: + logger.warning( + msg="Linkage failures exist, but did not exceed the retry threshold.", + extra=logging_context, + ) + send_slack_message( + text=f"Failed to link {len(linkage_failures)} mapped variants to LDH submissions for score set {score_set.urn}." + f"The retry threshold was not exceeded and this job will not be retried. URNs failed to link: {', '.join(linkage_failures)}." + ) + return {"success": True, "retried": False, "enqueued_job": None} + + except Exception as e: + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="LDH mapped resource linkage encountered an unexpected error while attempting to finalize linkage. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False, "enqueued_job": None} + + # If we reach this point, we should consider the job failed (there were failures which exceeded our retry threshold). 
+ new_job_id = None + max_retries_exceeded = None + try: + new_job_id, max_retries_exceeded, backoff_time = await enqueue_job_with_backoff( + ctx["redis"], "variant_mapper_manager", attempt, LINKING_BACKOFF_IN_SECONDS, correlation_id + ) + + logging_context["backoff_limit_exceeded"] = max_retries_exceeded + logging_context["backoff_deferred_in_seconds"] = backoff_time + logging_context["backoff_job_id"] = new_job_id + + except Exception as e: + send_slack_error(e) + send_slack_message(text=text % score_set.urn) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.critical( + msg="LDH mapped resource linkage encountered an unexpected error while attempting to retry a failed linkage job. This job will not be retried.", + extra=logging_context, + ) + else: + if new_job_id and not max_retries_exceeded: + logger.info( + msg="After a failure condition while linking mapped variants to LDH submissions, another linkage job was queued.", + extra=logging_context, + ) + send_slack_message( + text=f"Failed to link {len(linkage_failures)} ({ratio_failed_linking*100}% of total mapped variants for {score_set.urn})." + f"This job was successfully retried. This was attempt {attempt}. Retry will occur in {backoff_time} seconds. URNs failed to link: {', '.join(linkage_failures)}." + ) + elif new_job_id is None and not max_retries_exceeded: + logger.error( + msg="After a failure condition while linking mapped variants to LDH submissions, another linkage job was unable to be queued.", + extra=logging_context, + ) + send_slack_message( + text=f"Failed to link {len(linkage_failures)} ({ratio_failed_linking} of total mapped variants for {score_set.urn})." + f"This job could not be retried due to an unexpected issue while attempting to enqueue another linkage job. This was attempt {attempt}. URNs failed to link: {', '.join(linkage_failures)}." 
+ ) + else: + logger.error( + msg="After a failure condition while linking mapped variants to LDH submissions, the maximum retries for this job were exceeded. The reamining linkage failures will not be retried.", + extra=logging_context, + ) + send_slack_message( + text=f"Failed to link {len(linkage_failures)} ({ratio_failed_linking} of total mapped variants for {score_set.urn})." + f"The retry threshold was exceeded and this job will not be retried. URNs failed to link: {', '.join(linkage_failures)}." + ) + + finally: + return { + "success": False, + "retried": (not max_retries_exceeded and new_job_id is not None), + "enqueued_job": new_job_id, + } diff --git a/src/mavedb/worker/settings.py b/src/mavedb/worker/settings.py index 76bc4a326..754c3f44e 100644 --- a/src/mavedb/worker/settings.py +++ b/src/mavedb/worker/settings.py @@ -15,6 +15,8 @@ variant_mapper_manager, refresh_materialized_views, refresh_published_variants_view, + submit_score_set_mappings_to_ldh, + link_clingen_variants, ) # ARQ requires at least one task on startup. @@ -23,6 +25,8 @@ variant_mapper_manager, map_variants_for_score_set, refresh_published_variants_view, + submit_score_set_mappings_to_ldh, + link_clingen_variants, ] # In UTC time. Depending on daylight savings time, this will bounce around by an hour but should always be very early in the morning # for all of the USA. 
diff --git a/tests/conftest.py b/tests/conftest.py index e5d55a325..c16ef6104 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,7 +28,13 @@ from mavedb.lib.authorization import require_current_user from mavedb.models.user import User from mavedb.server_main import app -from mavedb.worker.jobs import create_variants_for_score_set, map_variants_for_score_set, variant_mapper_manager +from mavedb.worker.jobs import ( + create_variants_for_score_set, + map_variants_for_score_set, + variant_mapper_manager, + submit_score_set_mappings_to_ldh, + link_clingen_variants, +) sys.path.append(".") @@ -160,7 +166,13 @@ async def on_job(ctx): ctx["pool"] = futures.ProcessPoolExecutor() worker_ = Worker( - functions=[create_variants_for_score_set, map_variants_for_score_set, variant_mapper_manager], + functions=[ + create_variants_for_score_set, + map_variants_for_score_set, + variant_mapper_manager, + submit_score_set_mappings_to_ldh, + link_clingen_variants, + ], redis_pool=arq_redis, burst=True, poll_delay=0, diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index 03abc856e..a134a4684 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -4,6 +4,12 @@ from mavedb.models.enums.processing_state import ProcessingState + +VALID_EXPERIMENT_SET_URN = "urn:mavedb:01234567" +VALID_EXPERIMENT_URN = f"{VALID_EXPERIMENT_SET_URN}-abcd" +VALID_SCORE_SET_URN = f"{VALID_EXPERIMENT_URN}-0123" +VALID_VARIANT_URN = f"{VALID_SCORE_SET_URN}#1" + TEST_PUBMED_IDENTIFIER = "20711194" TEST_PUBMED_URL_IDENTIFIER = "https://pubmed.ncbi.nlm.nih.gov/37162834/" TEST_BIORXIV_IDENTIFIER = "2021.06.21.212592" @@ -11,9 +17,23 @@ TEST_CROSSREF_IDENTIFIER = "10.1371/2021.06.22.21259265" TEST_ORCID_ID = "1111-1111-1111-1111" +TEST_GA4GH_IDENTIFIER = "ga4gh:SQ.test" +# ^[0-9A-Za-z_\-]{32}$ +TEST_GA4GH_DIGEST = "ga4ghtest_ga4ghtest_ga4ghtest_dg" +# ^SQ.[0-9A-Za-z_\-]{32}$ +TEST_REFGET_ACCESSION = "SQ.ga4ghtest_ga4ghtest_ga4ghtest_rg" 
+TEST_SEQUENCE_LOCATION_ACCESSION = "ga4gh:SL.test" + +TEST_REFSEQ_IDENTIFIER = "NM_003345" +TEST_HGVS_IDENTIFIER = f"{TEST_REFSEQ_IDENTIFIER}:p.Asp5Phe" + VALID_ACCESSION = "NM_001637.3" VALID_GENE = "BRCA1" +VALID_CLINGEN_PA_ID = "PA2579908752" +VALID_CLINGEN_CA_ID = "CA341478553" +VALID_CLINGEN_LDH_ID = "2786738861" + SAVED_PUBMED_PUBLICATION = { "recordType": "PublicationIdentifier", "identifier": "20711194", @@ -36,6 +56,114 @@ "id": 1, } +# VRS 1.X +TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS1_X = { + "id": TEST_GA4GH_IDENTIFIER, + "type": "Allele", + "variation": { + "state": {"type": "LiteralSequenceExpression", "sequence": "V"}, + "digest": TEST_GA4GH_DIGEST, + "location": { + "id": TEST_SEQUENCE_LOCATION_ACCESSION, + "end": 2, + "type": "SequenceLocation", + "start": 1, + "digest": TEST_GA4GH_DIGEST, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": TEST_REFGET_ACCESSION, + }, + }, + "extensions": [{"name": "vrs_ref_allele_seq", "type": "Extension", "value": "W"}], + }, +} + +TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X = { + "id": TEST_GA4GH_IDENTIFIER, + "type": "Allele", + "variation": { + "state": {"type": "LiteralSequenceExpression", "sequence": "F"}, + "digest": TEST_GA4GH_DIGEST, + "location": { + "id": TEST_SEQUENCE_LOCATION_ACCESSION, + "end": 6, + "type": "SequenceLocation", + "start": 5, + "digest": TEST_GA4GH_DIGEST, + "sequenceReference": { + "type": "SequenceReference", + "label": TEST_REFSEQ_IDENTIFIER, + "refgetAccession": TEST_REFGET_ACCESSION, + }, + }, + "extensions": [{"name": "vrs_ref_allele_seq", "type": "Extension", "value": "D"}], + "expressions": [{"value": TEST_HGVS_IDENTIFIER, "syntax": "hgvs.p"}], + }, +} + +# VRS 2.X +TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X = { + "id": TEST_GA4GH_IDENTIFIER, + "type": "Allele", + "state": {"type": "LiteralSequenceExpression", "sequence": "V"}, + "digest": TEST_GA4GH_DIGEST, + "location": { + "id": TEST_SEQUENCE_LOCATION_ACCESSION, + "end": 2, + "type": "SequenceLocation", + 
"start": 1, + "digest": TEST_GA4GH_DIGEST, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": TEST_REFGET_ACCESSION, + }, + }, + "extensions": [{"name": "vrs_ref_allele_seq", "type": "Extension", "value": "W"}], +} + +TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X = { + "id": TEST_GA4GH_IDENTIFIER, + "type": "Allele", + "state": {"type": "LiteralSequenceExpression", "sequence": "F"}, + "digest": TEST_GA4GH_DIGEST, + "location": { + "id": TEST_SEQUENCE_LOCATION_ACCESSION, + "end": 6, + "type": "SequenceLocation", + "start": 5, + "digest": TEST_GA4GH_DIGEST, + "sequenceReference": { + "type": "SequenceReference", + "label": TEST_REFSEQ_IDENTIFIER, + "refgetAccession": TEST_REFGET_ACCESSION, + }, + }, + "extensions": [{"name": "vrs_ref_allele_seq", "type": "Extension", "value": "D"}], + "expressions": [{"value": TEST_HGVS_IDENTIFIER, "syntax": "hgvs.p"}], +} + +# VRS 1.X +TEST_VALID_PRE_MAPPED_VRS_HAPLOTYPE = { + "type": "Haplotype", + "members": [TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X], +} + +TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE = { + "type": "Haplotype", + "members": [TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X], +} + +# VRS 2.X +TEST_VALID_PRE_MAPPED_VRS_CIS_PHASED_BLOCK = { + "type": "Haplotype", + "members": [TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X], +} + +TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK = { + "type": "Haplotype", + "members": [TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X], +} + TEST_USER = { "username": "0000-1111-2222-3333", "first_name": "First", @@ -681,7 +809,7 @@ } -TEST_SCORESET_RANGE = { +TEST_SCORE_SET_RANGE = { "wt_score": 1.0, "ranges": [ {"label": "test1", "classification": "normal", "range": (0, 2.0)}, @@ -690,7 +818,7 @@ } -TEST_SAVED_SCORESET_RANGE = { +TEST_SAVED_SCORE_SET_RANGE = { "wtScore": 1.0, "ranges": [ {"label": "test1", "classification": 
"normal", "range": [0.0, 2.0]}, @@ -764,3 +892,127 @@ "editors": [], "viewers": [], } + +TEST_CLINVAR_CONTROL = { + "db_identifier": "183058", + "gene_symbol": "PTEN", + "clinical_significance": "Likely benign", + "clinical_review_status": "criteria provided, multiple submitters, no conflicts", + "db_name": "ClinVar", + "db_version": "11_2024", +} + + +TEST_SAVED_CLINVAR_CONTROL = { + "recordType": "ClinicalControlWithMappedVariants", + "dbIdentifier": "183058", + "geneSymbol": "PTEN", + "clinicalSignificance": "Likely benign", + "clinicalReviewStatus": "criteria provided, multiple submitters, no conflicts", + "dbName": "ClinVar", + "dbVersion": "11_2024", + "mappedVariants": [], +} + + +TEST_GENERIC_CLINICAL_CONTROL = { + "db_identifier": "ABC123", + "gene_symbol": "BRCA1", + "clinical_significance": "benign", + "clinical_review_status": "lots of convincing evidence", + "db_name": "GenDB", + "db_version": "2024", +} + + +TEST_SAVED_GENERIC_CLINICAL_CONTROL = { + "recordType": "ClinicalControlWithMappedVariants", + "dbIdentifier": "ABC123", + "geneSymbol": "BRCA1", + "clinicalSignificance": "benign", + "clinicalReviewStatus": "lots of convincing evidence", + "dbName": "GenDB", + "dbVersion": "2024", + "mappedVariants": [], +} + + +TEST_CLINGEN_SUBMISSION_RESPONSE = { + "data": {"msg": "Data sent successfully", "msgIds": ["(148894,0,-1,0)"]}, + "metadata": {"rendered": {"by": "https://genboree.org/mq/brdg/srvc", "when": datetime.now().isoformat()}}, + "status": {"code": 200, "name": "OK"}, +} + + +TEST_CLINGEN_SUBMISSION_UNAUTHORIZED_RESPONSE = { + "metadata": {"rendered": {"when": datetime.now().isoformat()}}, + "status": {"code": 403, "msg": "Bad Auth Info - jwt malformed", "name": "Forbidden"}, +} + +TEST_CLINGEN_SUBMISSION_BAD_RESQUEST_RESPONSE = { + "metadata": {"rendered": {"when": datetime.now().isoformat()}}, + "status": { + "code": 400, + "msg": "Put Failed - Error! Submission was an empty object. 
Submission must consist of valid, non-Empty JSON objects", + "name": "Bad Request", + }, +} + + +TEST_CLINGEN_LDH_LINKING_RESPONSE = { + "data": { + "created": datetime.now().isoformat(), + "creator": "brl_clingen", + "entContent": { + "mapping_api_version": "pytest.mapping.1.0", + "mavedb_id": VALID_VARIANT_URN, + "post_mapped": TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, + "pre_mapped": TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, + "score": 1.0, + }, + "entId": VALID_VARIANT_URN, + "entIri": f"https://staging.mavedb.org/score-sets/{VALID_VARIANT_URN}", + "entType": "MaveDBMapping", + "ldFor": { + "Variant": [ + { + "created": datetime.now().isoformat(), + "creator": "brl_clingen", + "entId": VALID_CLINGEN_PA_ID, + "entIri": f"http://reg.genome.network/allele/{VALID_CLINGEN_PA_ID}", + "entType": "Variant", + "ldhId": VALID_CLINGEN_LDH_ID, + "ldhIri": f"https://10.15.55.128/ldh-stg/Variant/id/{VALID_CLINGEN_LDH_ID}", + "modified": datetime.now().isoformat(), + "modifier": "brl_clingen", + "rev": "_hLpznbC-A-", + } + ] + }, + "ldhId": VALID_CLINGEN_LDH_ID, + "ldhIri": f"https://10.15.55.128/ldh-stg/MaveDBMapping/id/{VALID_CLINGEN_LDH_ID}", + "modified": datetime.now().isoformat(), + "modifier": "brl_clingen", + "rev": "_jj3a99K---", + }, + "metadata": {"rendered": {"by": "https://10.15.55.128/ldh-stg/srvc", "when": datetime.now().isoformat()}}, + "status": {"code": 200, "name": "OK"}, +} + + +TEST_CLINGEN_LDH_LINKING_RESPONSE_NOT_FOUND = { + "metadata": {"rendered": {"by": "https://10.15.55.128/ldh-stg/srvc", "when": datetime.now().isoformat()}}, + "status": { + "code": 404, + "msg": f"Bad Entity - No 'MaveDBMapping' entity found with identifier {VALID_VARIANT_URN}", + "name": "Not Found", + }, +} + + +TEST_CLINGEN_LDH_LINKING_RESPONSE_BAD_REQUEST = { + "errCode": 400, + "errMsg": "INVALID URL - Your request is invalid. 
Specifically, the URL path you provided ('/ldh-stg/MaveDBMapping/i/urn%3Amavedb%3A00000050-a-1%231') is not valid for HTTP 'GET' requests to the CG-LDH API service.", + "errName": "Bad Request", + "errCat": "INVALID URL", +} diff --git a/tests/helpers/util.py b/tests/helpers/util.py index 6a005e3a3..6519e4d01 100644 --- a/tests/helpers/util.py +++ b/tests/helpers/util.py @@ -1,4 +1,5 @@ from copy import deepcopy +from datetime import date from unittest.mock import patch import cdot.hgvs.dataproviders @@ -9,6 +10,7 @@ from mavedb.lib.score_sets import columns_for_dataset, create_variants, create_variants_data, csv_data_to_df from mavedb.lib.validation.dataframe import validate_and_standardize_dataframe_pair +from mavedb.models.clinical_control import ClinicalControl as ClinicalControlDbModel from mavedb.models.contributor import Contributor from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.enums.mapping_state import MappingState @@ -19,9 +21,13 @@ from mavedb.models.user import User from mavedb.models.variant import Variant from mavedb.view_models.collection import Collection +from mavedb.models.mapped_variant import MappedVariant as MappedVariantDbModel +from mavedb.models.variant import Variant as VariantDbModel from mavedb.view_models.experiment import Experiment, ExperimentCreate from mavedb.view_models.score_set import ScoreSet, ScoreSetCreate from tests.helpers.constants import ( + TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, + TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, EXTRA_USER, TEST_CDOT_TRANSCRIPT, TEST_COLLECTION, @@ -31,6 +37,8 @@ TEST_MINIMAL_POST_MAPPED_METADATA, TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_MAPPED_VARIANT, + TEST_VALID_PRE_MAPPED_VRS_CIS_PHASED_BLOCK, + TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK, ) @@ -211,6 +219,35 @@ def create_mapped_variants_for_score_set(db, score_set_urn): return +def mock_worker_vrs_mapping(client, db, score_set, alleles=True): + # The mapping job is tested elsewhere, so insert mapped 
variants manually. + variants = db.scalars( + select(VariantDbModel).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set["urn"]) + ).all() + + # It's un-important what the contents of each mapped VRS object are, so use the same constant for each variant. + for variant in variants: + mapped_variant = MappedVariantDbModel( + pre_mapped=TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X + if alleles + else TEST_VALID_PRE_MAPPED_VRS_CIS_PHASED_BLOCK, + post_mapped=TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X + if alleles + else TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK, + variant=variant, + vrs_version="2.0", + modification_date=date.today(), + mapped_date=date.today(), + mapping_api_version="pytest.0.0", + current=True, + ) + db.add(mapped_variant) + + db.commit() + + return client.get(f"/api/v1/score-sets/{score_set['urn']}").json() + + def create_seq_score_set_with_variants( client, db, data_provider, experiment_urn, scores_csv_path, update=None, counts_csv_path=None ): @@ -288,3 +325,48 @@ def update_expected_response_for_created_resources(expected_response, created_ex ) return expected_response + + +def create_seq_score_set_with_mapped_variants( + client, db, data_provider, experiment_urn, scores_csv_path, update=None, counts_csv_path=None +): + score_set = create_seq_score_set_with_variants( + client, db, data_provider, experiment_urn, scores_csv_path, update, counts_csv_path + ) + score_set = mock_worker_vrs_mapping(client, db, score_set) + + jsonschema.validate(instance=score_set, schema=ScoreSet.schema()) + return score_set + + +def create_acc_score_set_with_mapped_variants( + client, db, data_provider, experiment_urn, scores_csv_path, update=None, counts_csv_path=None +): + score_set = create_acc_score_set_with_variants( + client, db, data_provider, experiment_urn, scores_csv_path, update, counts_csv_path + ) + score_set = mock_worker_vrs_mapping(client, db, score_set) + + jsonschema.validate(instance=score_set, schema=ScoreSet.schema()) + return score_set + + 
+def link_clinical_controls_to_mapped_variants(db, score_set): + mapped_variants = db.scalars( + select(MappedVariantDbModel) + .join(VariantDbModel) + .join(ScoreSetDbModel) + .where(ScoreSetDbModel.urn == score_set["urn"]) + ).all() + + # The first mapped variant gets the clinvar control, the second gets the generic control. + mapped_variants[0].clinical_controls.append( + db.scalar(select(ClinicalControlDbModel).where(ClinicalControlDbModel.id == 1)) + ) + mapped_variants[1].clinical_controls.append( + db.scalar(select(ClinicalControlDbModel).where(ClinicalControlDbModel.id == 2)) + ) + + db.add(mapped_variants[0]) + db.add(mapped_variants[1]) + db.commit() diff --git a/tests/lib/clingen/__init__.py b/tests/lib/clingen/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/lib/clingen/test_content_constructors.py b/tests/lib/clingen/test_content_constructors.py new file mode 100644 index 000000000..35f73adc2 --- /dev/null +++ b/tests/lib/clingen/test_content_constructors.py @@ -0,0 +1,105 @@ +from unittest.mock import patch +from uuid import UUID + +from mavedb.constants import MAVEDB_BASE_GIT, MAVEDB_FRONTEND_URL +from mavedb.lib.clingen.content_constructors import ( + construct_ldh_submission_event, + construct_ldh_submission_subject, + construct_ldh_submission, + construct_ldh_submission_entity, +) +from mavedb.lib.clingen.constants import LDH_ENTITY_NAME, LDH_SUBMISSION_TYPE +from mavedb import __version__ + +from tests.helpers.constants import ( + TEST_HGVS_IDENTIFIER, + VALID_VARIANT_URN, + TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, + TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, +) + + +def test_construct_ldh_submission_subject(): + result = construct_ldh_submission_subject(TEST_HGVS_IDENTIFIER) + assert result == {"Variant": {"hgvs": TEST_HGVS_IDENTIFIER}} + + +def test_construct_ldh_submission_event(): + sbj = construct_ldh_submission_subject(TEST_HGVS_IDENTIFIER) + + with ( + patch("mavedb.lib.clingen.content_constructors.uuid4") as 
mock_uuid4, + ): + mock_uuid4.return_value = UUID("12345678-1234-5678-1234-567812345678") + + result = construct_ldh_submission_event(sbj) + + assert result["type"] == LDH_SUBMISSION_TYPE + assert result["name"] == LDH_ENTITY_NAME + assert result["uuid"] == "12345678-1234-5678-1234-567812345678" + assert result["sbj"] == { + "id": TEST_HGVS_IDENTIFIER, + "type": "Variant", + "format": "hgvs", + "add": True, + } + assert result["triggered"]["by"] == { + "host": MAVEDB_BASE_GIT, + "id": "resource_published", + "iri": f"{MAVEDB_BASE_GIT}/releases/tag/v{__version__}", + } + + +def test_construct_ldh_submission_entity(mock_variant, mock_mapped_variant): + result = construct_ldh_submission_entity(mock_variant, mock_mapped_variant) + + assert "MaveDBMapping" in result + assert len(result["MaveDBMapping"]) == 1 + mapping = result["MaveDBMapping"][0] + + assert mapping["entContent"]["mavedb_id"] == VALID_VARIANT_URN + assert mapping["entContent"]["pre_mapped"] == TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X + assert mapping["entContent"]["post_mapped"] == TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X + assert mapping["entContent"]["mapping_api_version"] == "pytest.mapping.1.0" + assert mapping["entContent"]["score"] == 1.0 + + assert mapping["entId"] == VALID_VARIANT_URN + assert mapping["entIri"] == f"{MAVEDB_FRONTEND_URL}/{VALID_VARIANT_URN}" + + +def test_construct_ldh_submission(mock_variant, mock_mapped_variant): + variant_content = [ + (TEST_HGVS_IDENTIFIER, mock_variant, mock_mapped_variant), + (TEST_HGVS_IDENTIFIER, mock_variant, mock_mapped_variant), + ] + + uuid_1 = UUID("12345678-1234-5678-1234-567812345678") + uuid_2 = UUID("87654321-4321-8765-4321-876543218765") + + with ( + patch("mavedb.lib.clingen.content_constructors.uuid4") as mock_uuid4, + ): + mock_uuid4.side_effect = [ + uuid_1, + uuid_2, + ] + + result = construct_ldh_submission(variant_content) + + assert len(result) == 2 + + # Validate the first submission + submission1 = result[0] + assert 
submission1["event"]["uuid"] == str(uuid_1) + assert submission1["event"]["sbj"]["id"] == TEST_HGVS_IDENTIFIER + assert submission1["content"]["sbj"] == {"Variant": {"hgvs": TEST_HGVS_IDENTIFIER}} + assert submission1["content"]["ld"]["MaveDBMapping"][0]["entContent"]["mavedb_id"] == VALID_VARIANT_URN + assert submission1["content"]["ld"]["MaveDBMapping"][0]["entContent"]["score"] == 1.0 + + # Validate the second submission + submission2 = result[1] + assert submission2["event"]["uuid"] == str(uuid_2) + assert submission2["event"]["sbj"]["id"] == TEST_HGVS_IDENTIFIER + assert submission2["content"]["sbj"] == {"Variant": {"hgvs": TEST_HGVS_IDENTIFIER}} + assert submission2["content"]["ld"]["MaveDBMapping"][0]["entContent"]["mavedb_id"] == VALID_VARIANT_URN + assert submission2["content"]["ld"]["MaveDBMapping"][0]["entContent"]["score"] == 1.0 diff --git a/tests/lib/clingen/test_linked_data_hub.py b/tests/lib/clingen/test_linked_data_hub.py new file mode 100644 index 000000000..43dd80fdd --- /dev/null +++ b/tests/lib/clingen/test_linked_data_hub.py @@ -0,0 +1,266 @@ +import os +from urllib import parse +import pytest +import requests +from datetime import datetime +from unittest.mock import patch, MagicMock + +from mavedb.lib.clingen.constants import LDH_LINKED_DATA_URL, GENBOREE_ACCOUNT_NAME, GENBOREE_ACCOUNT_PASSWORD +from mavedb.lib.utils import batched +from mavedb.lib.clingen.linked_data_hub import ( + ClinGenLdhService, + get_clingen_variation, + clingen_allele_id_from_ldh_variation, +) + +from tests.helpers.constants import VALID_CLINGEN_CA_ID + + +TEST_CLINGEN_URL = "https://pytest.clingen.com" + + +@pytest.fixture +def clingen_service(): + yield ClinGenLdhService(url=TEST_CLINGEN_URL) + + +class TestClinGenLdhService: + def test_init(self, clingen_service): + assert clingen_service.url == TEST_CLINGEN_URL + + ### Test the authenticate method + + def test_authenticate_with_existing_jwt(self, clingen_service: ClinGenLdhService): + with 
patch.object(ClinGenLdhService, "_existing_jwt", return_value="existing_jwt_token") as mock_existing_jwt: + jwt = clingen_service.authenticate() + + assert jwt == "existing_jwt_token" + mock_existing_jwt.assert_called_once() + + @patch("mavedb.lib.clingen.linked_data_hub.requests.post") + @patch("mavedb.lib.clingen.linked_data_hub.ClinGenLdhService._existing_jwt") + def test_authenticate_with_new_jwt(self, mock_existing_jwt, mock_post, clingen_service): + mock_existing_jwt.return_value = None + + mock_response = MagicMock() + mock_response.json.return_value = {"data": {"jwt": "new_jwt_token"}} + mock_response.raise_for_status = MagicMock() + mock_post.return_value = mock_response + + jwt = clingen_service.authenticate() + assert jwt == "new_jwt_token" + assert os.environ["GENBOREE_JWT"] == "new_jwt_token" + mock_post.assert_called_once_with( + f"https://genboree.org/auth/usr/gb:{GENBOREE_ACCOUNT_NAME}/auth", + json={"type": "plain", "val": GENBOREE_ACCOUNT_PASSWORD}, + ) + + @patch("mavedb.lib.clingen.linked_data_hub.requests.post") + @patch("mavedb.lib.clingen.linked_data_hub.ClinGenLdhService._existing_jwt") + def test_authenticate_http_error(self, mock_existing_jwt, mock_post, clingen_service): + mock_existing_jwt.return_value = None + + mock_response = MagicMock() + mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("HTTP Error") + mock_post.return_value = mock_response + + with pytest.raises(requests.exceptions.HTTPError, match="HTTP Error"): + clingen_service.authenticate() + + mock_post.assert_called_once() + + @patch("mavedb.lib.clingen.linked_data_hub.requests.post") + @patch("mavedb.lib.clingen.linked_data_hub.ClinGenLdhService._existing_jwt") + def test_authenticate_missing_jwt_in_response(self, mock_existing_jwt, mock_post, clingen_service): + mock_existing_jwt.return_value = None + + mock_response = MagicMock() + mock_response.json.return_value = {"data": {}} + mock_response.raise_for_status = MagicMock() + 
mock_post.return_value = mock_response + + with pytest.raises(ValueError, match="Could not parse JWT from valid response"): + clingen_service.authenticate() + + mock_post.assert_called_once() + + ### Test the _existing_jwt method + + @patch("mavedb.lib.clingen.linked_data_hub.os.getenv") + @patch("mavedb.lib.clingen.linked_data_hub.jwt.get_unverified_claims") + def test_existing_jwt_valid(self, mock_get_unverified_claims, mock_getenv, clingen_service): + mock_getenv.return_value = "valid_jwt_token" + mock_get_unverified_claims.return_value = {"exp": (datetime.now().timestamp() + 3600)} + + jwt = clingen_service._existing_jwt() + + assert jwt == "valid_jwt_token" + mock_getenv.assert_called_once_with("GENBOREE_JWT") + mock_get_unverified_claims.assert_called_once_with("valid_jwt_token") + + @patch("mavedb.lib.clingen.linked_data_hub.os.getenv") + @patch("mavedb.lib.clingen.linked_data_hub.jwt.get_unverified_claims") + def test_existing_jwt_expired(self, mock_get_unverified_claims, mock_getenv, clingen_service): + mock_getenv.return_value = "expired_jwt_token" + mock_get_unverified_claims.return_value = {"exp": (datetime.now().timestamp() - 3600)} + + jwt = clingen_service._existing_jwt() + + assert jwt is None + mock_getenv.assert_called_once_with("GENBOREE_JWT") + mock_get_unverified_claims.assert_called_once_with("expired_jwt_token") + + @patch("mavedb.lib.clingen.linked_data_hub.os.getenv") + def test_existing_jwt_not_set(self, mock_getenv, clingen_service): + mock_getenv.return_value = None + + jwt = clingen_service._existing_jwt() + + assert jwt is None + mock_getenv.assert_called_once_with("GENBOREE_JWT") + + ### Test the dispatch_submissions method + + @patch("mavedb.lib.clingen.linked_data_hub.requests.put") + @patch("mavedb.lib.clingen.linked_data_hub.ClinGenLdhService.authenticate") + @patch("mavedb.lib.clingen.linked_data_hub.batched") + def test_dispatch_submissions_success(self, mock_batched, mock_authenticate, mock_request, clingen_service): + 
mock_authenticate.return_value = "test_jwt_token" + mock_request.return_value.json.return_value = {"success": True} + + content_submissions = [{"id": 1}, {"id": 2}, {"id": 3}] + mock_batched.return_value = [[{"id": 1}, {"id": 2}], [{"id": 3}]] # Simulate batching + + batch_size = 2 + successes, failures = clingen_service.dispatch_submissions(content_submissions, batch_size=batch_size) + + assert len(successes) == 2 # 2 batches + assert len(failures) == 0 + mock_batched.assert_called_once_with(content_submissions, 2) + for submission in batched(content_submissions, batch_size): + mock_request.assert_any_call( + url=clingen_service.url, + json=submission, + headers={"Authorization": "Bearer test_jwt_token", "Content-Type": "application/json"}, + ) + + @patch("mavedb.lib.clingen.linked_data_hub.requests.put") + @patch("mavedb.lib.clingen.linked_data_hub.ClinGenLdhService.authenticate") + def test_dispatch_submissions_failure(self, mock_authenticate, mock_request, clingen_service): + mock_authenticate.return_value = "test_jwt_token" + mock_request.side_effect = requests.exceptions.RequestException("Request failed") + + content_submissions = [{"id": 1}, {"id": 2}, {"id": 3}] + + successes, failures = clingen_service.dispatch_submissions(content_submissions) + + assert len(successes) == 0 + assert len(failures) == 3 + for submission in content_submissions: + mock_request.assert_any_call( + url=clingen_service.url, + json=submission, + headers={"Authorization": "Bearer test_jwt_token", "Content-Type": "application/json"}, + ) + + @patch("mavedb.lib.clingen.linked_data_hub.requests.put") + @patch("mavedb.lib.clingen.linked_data_hub.ClinGenLdhService.authenticate") + def test_dispatch_submissions_partial_success(self, mock_authenticate, mock_request, clingen_service): + mock_authenticate.return_value = "test_jwt_token" + + def mock_request_side_effect(*args, **kwargs): + if kwargs["json"]["id"] == 2: + raise requests.exceptions.RequestException("Request failed") + return 
MagicMock(json=MagicMock(return_value={"success": True})) + + mock_request.side_effect = mock_request_side_effect + + content_submissions = [{"id": 1}, {"id": 2}, {"id": 3}] + + successes, failures = clingen_service.dispatch_submissions(content_submissions) + + assert len(successes) == 2 + assert len(failures) == 1 + assert failures[0]["id"] == 2 + + @patch("mavedb.lib.clingen.linked_data_hub.requests.put") + @patch("mavedb.lib.clingen.linked_data_hub.ClinGenLdhService.authenticate") + @patch("mavedb.lib.clingen.linked_data_hub.batched") + def test_dispatch_submissions_no_batching(self, mock_batched, mock_authenticate, mock_request, clingen_service): + mock_authenticate.return_value = "test_jwt_token" + mock_request.return_value.json.return_value = {"success": True} + + content_submissions = [{"id": 1}, {"id": 2}, {"id": 3}] + mock_batched.return_value = content_submissions # No batching + + successes, failures = clingen_service.dispatch_submissions(content_submissions) + + assert len(successes) == 3 + assert len(failures) == 0 + mock_batched.assert_not_called() + for submission in content_submissions: + mock_request.assert_any_call( + url=clingen_service.url, + json=submission, + headers={"Authorization": "Bearer test_jwt_token", "Content-Type": "application/json"}, + ) + + +@patch("mavedb.lib.clingen.linked_data_hub.requests.get") +def test_get_clingen_variation_success(mock_get): + mocked_response_json = {"data": {"ldFor": {"Variant": [{"id": "variant_1", "name": "Test Variant"}]}}} + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = mocked_response_json + mock_get.return_value = mock_response + + urn = "urn:example:variant" + result = get_clingen_variation(urn) + + assert result == mocked_response_json + mock_get.assert_called_once_with( + f"{LDH_LINKED_DATA_URL}/{parse.quote_plus(urn)}", + headers={"Accept": "application/json"}, + ) + + +@patch("mavedb.lib.clingen.linked_data_hub.requests.get") +def 
test_get_clingen_variation_failure(mock_get): + mock_response = MagicMock() + mock_response.status_code = 404 + mock_response.text = "Not Found" + mock_get.return_value = mock_response + + urn = "urn:example:nonexistent_variant" + result = get_clingen_variation(urn) + + assert result is None + mock_get.assert_called_once_with( + f"{LDH_LINKED_DATA_URL}/{parse.quote_plus(urn)}", + headers={"Accept": "application/json"}, + ) + + +def test_clingen_allele_id_from_ldh_variation_success(): + variation = {"data": {"ldFor": {"Variant": [{"entId": VALID_CLINGEN_CA_ID}]}}} + result = clingen_allele_id_from_ldh_variation(variation) + assert result == VALID_CLINGEN_CA_ID + + +def test_clingen_allele_id_from_ldh_variation_missing_key(): + variation = {"data": {"ldFor": {"Variant": []}}} + + result = clingen_allele_id_from_ldh_variation(variation) + assert result is None + + +def test_clingen_allele_id_from_ldh_variation_no_variation(): + result = clingen_allele_id_from_ldh_variation(None) + assert result is None + + +def test_clingen_allele_id_from_ldh_variation_key_error(): + variation = {"data": {}} + + result = clingen_allele_id_from_ldh_variation(variation) + assert result is None diff --git a/tests/lib/conftest.py b/tests/lib/conftest.py index 076dac4b3..a3629ca91 100644 --- a/tests/lib/conftest.py +++ b/tests/lib/conftest.py @@ -1,10 +1,19 @@ import pytest +from unittest import mock +from datetime import datetime from mavedb.models.enums.user_role import UserRole +from mavedb.models.experiment_set import ExperimentSet +from mavedb.models.experiment import Experiment from mavedb.models.license import License +from mavedb.models.publication_identifier import PublicationIdentifier +from mavedb.models.score_set_publication_identifier import ScoreSetPublicationIdentifierAssociation from mavedb.models.role import Role from mavedb.models.taxonomy import Taxonomy +from mavedb.models.score_set import ScoreSet from mavedb.models.user import User +from mavedb.models.variant import 
Variant +from mavedb.models.mapped_variant import MappedVariant from tests.helpers.constants import ( ADMIN_USER, EXTRA_USER, @@ -12,6 +21,15 @@ TEST_INACTIVE_LICENSE, TEST_TAXONOMY, TEST_USER, + VALID_VARIANT_URN, + VALID_SCORE_SET_URN, + VALID_EXPERIMENT_URN, + VALID_EXPERIMENT_SET_URN, + TEST_PUBMED_IDENTIFIER, + TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, + TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, + TEST_SCORE_SET_RANGE, + TEST_SCORE_CALIBRATION, ) @@ -29,3 +47,87 @@ def setup_lib_db(session): db.add(License(**TEST_LICENSE)) db.add(License(**TEST_INACTIVE_LICENSE)) db.commit() + + +@pytest.fixture +def mock_user(): + mv = mock.Mock(spec=User) + mv.username = TEST_USER["username"] + return mv + + +@pytest.fixture +def mock_publication(): + mv = mock.Mock(spec=PublicationIdentifier) + mv.identifier = TEST_PUBMED_IDENTIFIER + mv.url = f"http://www.ncbi.nlm.nih.gov/pubmed/{TEST_PUBMED_IDENTIFIER}" + return mv + + +@pytest.fixture +def mock_publication_associations(mock_publication): + mv = mock.Mock(spec=ScoreSetPublicationIdentifierAssociation) + mv.publication = mock_publication + mv.primary = True + return [mv] + + +@pytest.fixture +def mock_experiment_set(): + resource = mock.Mock(spec=ExperimentSet) + resource.urn = VALID_EXPERIMENT_SET_URN + resource.creation_date = datetime(2023, 1, 1) + resource.modification_date = datetime(2023, 1, 2) + return resource + + +@pytest.fixture +def mock_experiment(): + experiment = mock.Mock(spec=Experiment) + experiment.title = "Test Experiment" + experiment.urn = VALID_EXPERIMENT_URN + experiment.creation_date = datetime(2023, 1, 1) + experiment.modification_date = datetime(2023, 1, 2) + return experiment + + +@pytest.fixture +def mock_score_set(mock_user, mock_experiment, mock_publication_associations): + score_set = mock.Mock(spec=ScoreSet) + score_set.urn = VALID_SCORE_SET_URN + score_set.score_ranges = TEST_SCORE_SET_RANGE + score_set.score_calibrations = {"pillar_project": TEST_SCORE_CALIBRATION} + 
score_set.license.short_name = "MIT" + score_set.created_by = mock_user + score_set.modified_by = mock_user + score_set.published_date = datetime(2023, 1, 1) + score_set.title = "Mock score set" + score_set.creation_date = datetime(2023, 1, 2) + score_set.modification_date = datetime(2023, 1, 3) + score_set.experiment = mock_experiment + score_set.publication_identifier_associations = mock_publication_associations + return score_set + + +@pytest.fixture +def mock_variant(mock_score_set): + variant = mock.Mock(spec=Variant) + variant.urn = VALID_VARIANT_URN + variant.score_set = mock_score_set + variant.data = {"score_data": {"score": 1.0}} + variant.creation_date = datetime(2023, 1, 2) + variant.modification_date = datetime(2023, 1, 3) + return variant + + +@pytest.fixture +def mock_mapped_variant(mock_variant): + mv = mock.Mock(spec=MappedVariant) + mv.mapping_api_version = "pytest.mapping.1.0" + mv.mapped_date = datetime(2023, 1, 1) + mv.variant = mock_variant + mv.pre_mapped = TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X + mv.post_mapped = TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X + mv.mapped_date = datetime(2023, 1, 2) + mv.modification_date = datetime(2023, 1, 3) + return mv diff --git a/tests/lib/test_score_set.py b/tests/lib/test_score_set.py index d95ad6f16..3179b9218 100644 --- a/tests/lib/test_score_set.py +++ b/tests/lib/test_score_set.py @@ -21,7 +21,7 @@ ) from mavedb.models.score_set import ScoreSet from mavedb.models.variant import Variant -from tests.helpers.constants import TEST_SAVED_SCORESET_RANGE +from tests.helpers.constants import TEST_SAVED_SCORE_SET_RANGE from tests.helpers.util import create_acc_score_set, create_experiment, create_seq_score_set @@ -320,7 +320,7 @@ def test_create_null_score_range(setup_lib_db, client, session): def test_update_null_score_range(setup_lib_db, client, session): experiment = create_experiment(client) - score_set = create_seq_score_set(client, experiment["urn"], update={"scoreRanges": TEST_SAVED_SCORESET_RANGE}) + 
score_set = create_seq_score_set(client, experiment["urn"], update={"scoreRanges": TEST_SAVED_SCORE_SET_RANGE}) db_score_set = session.scalar(select(ScoreSet).where(ScoreSet.score_ranges.is_(None))) assert db_score_set is None diff --git a/tests/lib/test_variants.py b/tests/lib/test_variants.py new file mode 100644 index 000000000..92c5791f7 --- /dev/null +++ b/tests/lib/test_variants.py @@ -0,0 +1,68 @@ +import pytest +from unittest.mock import MagicMock + +from mavedb.lib.variants import hgvs_from_vrs_allele +from mavedb.lib.variants import hgvs_from_mapped_variant + +from tests.helpers.constants import ( + TEST_HGVS_IDENTIFIER, + TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X, + TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, + TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE, + TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK, +) + + +@pytest.mark.parametrize("allele", [TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X]) +def test_hgvs_from_vrs_allele(allele): + result = hgvs_from_vrs_allele(allele) + assert result == TEST_HGVS_IDENTIFIER + + +def test_hgvs_from_vrs_allele_invalid(): + allele = {"invalid_key": "invalid_value"} + with pytest.raises(KeyError): + hgvs_from_vrs_allele(allele) + + +def test_hgvs_from_mapped_variant_haplotype(): + mapped_variant = MagicMock() + mapped_variant.post_mapped = TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE + result = hgvs_from_mapped_variant(mapped_variant) + assert result == [TEST_HGVS_IDENTIFIER, TEST_HGVS_IDENTIFIER] + + +def test_hgvs_from_mapped_variant_cis_phased_block(): + mapped_variant = MagicMock() + mapped_variant.post_mapped = TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK + result = hgvs_from_mapped_variant(mapped_variant) + assert result == [TEST_HGVS_IDENTIFIER, TEST_HGVS_IDENTIFIER] + + +@pytest.mark.parametrize("allele", [TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X]) +def test_hgvs_from_mapped_variant_single_allele(allele): + mapped_variant = MagicMock() + 
mapped_variant.post_mapped = allele + result = hgvs_from_mapped_variant(mapped_variant) + assert result == [TEST_HGVS_IDENTIFIER] + + +def test_hgvs_from_mapped_variant_empty_post_mapped(): + mapped_variant = MagicMock() + mapped_variant.post_mapped = None + result = hgvs_from_mapped_variant(mapped_variant) + assert result == [] + + +def test_hgvs_from_mapped_variant_invalid_type(): + mapped_variant = MagicMock() + mapped_variant.post_mapped = {"type": "InvalidType"} + with pytest.raises(ValueError): + hgvs_from_mapped_variant(mapped_variant) + + +def test_hgvs_from_mapped_variant_invalid_structure(): + mapped_variant = MagicMock() + mapped_variant.post_mapped = {"invalid_key": "InvalidType"} + with pytest.raises(KeyError): + hgvs_from_mapped_variant(mapped_variant) diff --git a/tests/routers/conftest.py b/tests/routers/conftest.py index 591c4e3ec..e634f6148 100644 --- a/tests/routers/conftest.py +++ b/tests/routers/conftest.py @@ -5,6 +5,7 @@ import cdot.hgvs.dataproviders import pytest +from mavedb.models.clinical_control import ClinicalControl from mavedb.models.controlled_keyword import ControlledKeyword from mavedb.models.contributor import Contributor from mavedb.models.enums.user_role import UserRole @@ -15,6 +16,8 @@ from mavedb.models.user import User from tests.helpers.constants import ( ADMIN_USER, + TEST_CLINVAR_CONTROL, + TEST_GENERIC_CLINICAL_CONTROL, EXTRA_USER, EXTRA_CONTRIBUTOR, TEST_CDOT_TRANSCRIPT, @@ -50,6 +53,8 @@ def setup_router_db(session): db.add(License(**TEST_INACTIVE_LICENSE)) db.add(License(**EXTRA_LICENSE)) db.add(Contributor(**EXTRA_CONTRIBUTOR)) + db.add(ClinicalControl(**TEST_CLINVAR_CONTROL)) + db.add(ClinicalControl(**TEST_GENERIC_CLINICAL_CONTROL)) db.bulk_save_objects([ControlledKeyword(**keyword_obj) for keyword_obj in TEST_DB_KEYWORDS]) db.commit() diff --git a/tests/routers/test_score_set.py b/tests/routers/test_score_set.py index 67c26b274..1ce59e1d8 100644 --- a/tests/routers/test_score_set.py +++ 
b/tests/routers/test_score_set.py @@ -7,10 +7,11 @@ import pytest from arq import ArqRedis from humps import camelize -from sqlalchemy import select +from sqlalchemy import select, delete from mavedb.lib.validation.urn_re import MAVEDB_TMP_URN_RE, MAVEDB_SCORE_SET_URN_RE, MAVEDB_EXPERIMENT_URN_RE from mavedb.models.enums.processing_state import ProcessingState +from mavedb.models.clinical_control import ClinicalControl from mavedb.models.experiment import Experiment as ExperimentDbModel from mavedb.models.score_set import ScoreSet as ScoreSetDbModel from mavedb.models.variant import Variant as VariantDbModel @@ -25,8 +26,8 @@ TEST_MINIMAL_SEQ_SCORESET_RESPONSE, TEST_PUBMED_IDENTIFIER, TEST_ORCID_ID, - TEST_SCORESET_RANGE, - TEST_SAVED_SCORESET_RANGE, + TEST_SCORE_SET_RANGE, + TEST_SAVED_SCORE_SET_RANGE, TEST_MINIMAL_ACC_SCORESET_RESPONSE, TEST_USER, TEST_INACTIVE_LICENSE, @@ -36,6 +37,8 @@ SAVED_SHORT_EXTRA_LICENSE, TEST_SCORE_CALIBRATION, TEST_SAVED_SCORE_CALIBRATION, + TEST_SAVED_CLINVAR_CONTROL, + TEST_SAVED_GENERIC_CLINICAL_CONTROL, ) from tests.helpers.dependency_overrider import DependencyOverrider from tests.helpers.util import ( @@ -46,6 +49,8 @@ create_seq_score_set, create_seq_score_set_with_variants, update_expected_response_for_created_resources, + create_seq_score_set_with_mapped_variants, + link_clinical_controls_to_mapped_variants, ) @@ -133,7 +138,7 @@ def test_create_score_set_with_score_range(client, setup_router_db): experiment = create_experiment(client) score_set = deepcopy(TEST_MINIMAL_SEQ_SCORESET) score_set["experimentUrn"] = experiment["urn"] - score_set.update({"score_ranges": TEST_SCORESET_RANGE}) + score_set.update({"score_ranges": TEST_SCORE_SET_RANGE}) response = client.post("/api/v1/score-sets/", json=score_set) assert response.status_code == 200 @@ -145,7 +150,7 @@ def test_create_score_set_with_score_range(client, setup_router_db): expected_response = update_expected_response_for_created_resources( 
deepcopy(TEST_MINIMAL_SEQ_SCORESET_RESPONSE), experiment, response_data ) - expected_response["scoreRanges"] = TEST_SAVED_SCORESET_RANGE + expected_response["scoreRanges"] = TEST_SAVED_SCORE_SET_RANGE assert sorted(expected_response.keys()) == sorted(response_data.keys()) for key in expected_response: @@ -159,7 +164,7 @@ def test_remove_score_range_from_score_set(client, setup_router_db): experiment = create_experiment(client) score_set = deepcopy(TEST_MINIMAL_SEQ_SCORESET) score_set["experimentUrn"] = experiment["urn"] - score_set.update({"score_ranges": TEST_SCORESET_RANGE}) + score_set.update({"score_ranges": TEST_SCORE_SET_RANGE}) response = client.post("/api/v1/score-sets/", json=score_set) assert response.status_code == 200 @@ -171,7 +176,7 @@ def test_remove_score_range_from_score_set(client, setup_router_db): expected_response = update_expected_response_for_created_resources( deepcopy(TEST_MINIMAL_SEQ_SCORESET_RESPONSE), experiment, response_data ) - expected_response["scoreRanges"] = TEST_SAVED_SCORESET_RANGE + expected_response["scoreRanges"] = TEST_SAVED_SCORE_SET_RANGE assert sorted(expected_response.keys()) == sorted(response_data.keys()) for key in expected_response: @@ -230,7 +235,7 @@ def test_cannot_create_score_set_with_invalid_target_gene_category(client, setup ("doi_identifiers", [{"identifier": TEST_CROSSREF_IDENTIFIER}], [SAVED_DOI_IDENTIFIER]), ("license_id", EXTRA_LICENSE["id"], SAVED_SHORT_EXTRA_LICENSE), ("target_genes", TEST_MINIMAL_ACC_SCORESET["targetGenes"], TEST_MINIMAL_ACC_SCORESET_RESPONSE["targetGenes"]), - ("score_ranges", TEST_SCORESET_RANGE, TEST_SAVED_SCORESET_RANGE), + ("score_ranges", TEST_SCORE_SET_RANGE, TEST_SAVED_SCORE_SET_RANGE), ], ) @pytest.mark.parametrize( @@ -360,7 +365,7 @@ def test_can_update_score_set_supporting_data_after_publication( ("target_genes", TEST_MINIMAL_ACC_SCORESET["targetGenes"], TEST_MINIMAL_SEQ_SCORESET_RESPONSE["targetGenes"]), ( "score_ranges", - TEST_SCORESET_RANGE, + TEST_SCORE_SET_RANGE, 
None, ), ], @@ -2415,3 +2420,159 @@ def test_download_counts_file(session, data_provider, client, setup_router_db, d assert "hgvs_nt" in columns assert "hgvs_pro" in columns assert "hgvs_splice" not in columns + + +######################################################################################################################## +# Fetching clinical controls and control options for a score set +######################################################################################################################## + + +def test_can_fetch_current_clinical_controls_for_score_set(client, setup_router_db, session, data_provider, data_files): + experiment = create_experiment(client) + score_set = create_seq_score_set_with_mapped_variants( + client, session, data_provider, experiment["urn"], data_files / "scores.csv" + ) + link_clinical_controls_to_mapped_variants(session, score_set) + + response = client.get(f"/api/v1/score-sets/{score_set['urn']}/clinical-controls") + assert response.status_code == 200 + + response_data = response.json() + assert len(response_data) == 2 + for control in response_data: + mapped_variants = control.pop("mappedVariants") + assert len(mapped_variants) == 1 + assert all( + control[k] in (TEST_SAVED_CLINVAR_CONTROL[k], TEST_SAVED_GENERIC_CLINICAL_CONTROL[k]) + for k in TEST_SAVED_CLINVAR_CONTROL.keys() + if k != "mappedVariants" + ) + + +@pytest.mark.parametrize("clinical_control", [TEST_SAVED_CLINVAR_CONTROL, TEST_SAVED_GENERIC_CLINICAL_CONTROL]) +@pytest.mark.parametrize( + "parameters", [[("db", "dbName")], [("version", "dbVersion")], [("db", "dbName"), ("version", "dbVersion")]] +) +def test_can_fetch_current_clinical_controls_for_score_set_with_parameters( + client, setup_router_db, session, data_provider, data_files, clinical_control, parameters +): + experiment = create_experiment(client) + score_set = create_seq_score_set_with_mapped_variants( + client, session, data_provider, experiment["urn"], data_files / "scores.csv" + ) + 
link_clinical_controls_to_mapped_variants(session, score_set) + + query_string = "?" + for param, accessor in parameters: + query_string += f"{param}={clinical_control[accessor]}&" + + # Remove the last '&' from the query string + query_string = query_string.strip("&") + + response = client.get(f"/api/v1/score-sets/{score_set['urn']}/clinical-controls{query_string}") + assert response.status_code == 200 + + response_data = response.json() + assert len(response_data) + for param, accessor in parameters: + assert all(control[accessor] == clinical_control[accessor] for control in response_data) + + +def test_cannot_fetch_clinical_controls_for_nonexistent_score_set( + client, setup_router_db, session, data_provider, data_files +): + experiment = create_experiment(client) + score_set = create_seq_score_set_with_mapped_variants( + client, session, data_provider, experiment["urn"], data_files / "scores.csv" + ) + link_clinical_controls_to_mapped_variants(session, score_set) + + response = client.get(f"/api/v1/score-sets/{score_set['urn']+'xxx'}/clinical-controls") + + assert response.status_code == 404 + response_data = response.json() + assert f"score set with URN '{score_set['urn']+'xxx'}' not found" in response_data["detail"] + + +def test_cannot_fetch_clinical_controls_for_score_set_when_none_exist( + client, setup_router_db, session, data_provider, data_files +): + experiment = create_experiment(client) + score_set = create_seq_score_set_with_mapped_variants( + client, session, data_provider, experiment["urn"], data_files / "scores.csv" + ) + + response = client.get(f"/api/v1/score-sets/{score_set['urn']}/clinical-controls") + + assert response.status_code == 404 + response_data = response.json() + assert ( + f"No clinical control variants matching the provided filters associated with score set URN {score_set['urn']} were found" + in response_data["detail"] + ) + + +def test_can_fetch_current_clinical_control_options_for_score_set( + client, setup_router_db, session, 
data_provider, data_files +): + experiment = create_experiment(client) + score_set = create_seq_score_set_with_mapped_variants( + client, session, data_provider, experiment["urn"], data_files / "scores.csv" + ) + link_clinical_controls_to_mapped_variants(session, score_set) + + response = client.get(f"/api/v1/score-sets/{score_set['urn']}/clinical-controls/options") + assert response.status_code == 200 + + response_data = response.json() + assert len(response_data) == 2 + for control_option in response_data: + assert len(control_option["availableVersions"]) == 1 + assert control_option["dbName"] in ( + TEST_SAVED_CLINVAR_CONTROL["dbName"], + TEST_SAVED_GENERIC_CLINICAL_CONTROL["dbName"], + ) + assert all( + control_version + in (TEST_SAVED_CLINVAR_CONTROL["dbVersion"], TEST_SAVED_GENERIC_CLINICAL_CONTROL["dbVersion"]) + for control_version in control_option["availableVersions"] + ) + + +def test_cannot_fetch_clinical_control_options_for_nonexistent_score_set( + client, setup_router_db, session, data_provider, data_files +): + experiment = create_experiment(client) + score_set = create_seq_score_set_with_mapped_variants( + client, session, data_provider, experiment["urn"], data_files / "scores.csv" + ) + link_clinical_controls_to_mapped_variants(session, score_set) + + response = client.get(f"/api/v1/score-sets/{score_set['urn']+'xxx'}/clinical-controls/options") + + assert response.status_code == 404 + response_data = response.json() + assert f"score set with URN '{score_set['urn']+'xxx'}' not found" in response_data["detail"] + + +def test_cannot_fetch_clinical_control_options_for_score_set_when_none_exist( + client, setup_router_db, session, data_provider, data_files +): + experiment = create_experiment(client) + score_set = create_seq_score_set_with_mapped_variants( + client, session, data_provider, experiment["urn"], data_files / "scores.csv" + ) + + # removes all clinical controls from the db. 
+ session.execute(delete(ClinicalControl)) + session.commit() + + response = client.get(f"/api/v1/score-sets/{score_set['urn']}/clinical-controls/options") + print(response.json()) + + assert response.status_code == 404 + response_data = response.json() + assert ( + f"no clinical control variants associated with score set URN {score_set['urn']} were found" + in response_data["detail"] + ) diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index 18e0846a0..dde195009 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -15,6 +15,7 @@ from mavedb.data_providers.services import VRSMap from mavedb.lib.mave.constants import HGVS_NT_COLUMN from mavedb.lib.score_sets import csv_data_to_df +from mavedb.lib.clingen.linked_data_hub import ClinGenLdhService, clingen_allele_id_from_ldh_variation from mavedb.lib.validation.exceptions import ValidationError from mavedb.models.enums.mapping_state import MappingState from mavedb.models.enums.processing_state import ProcessingState @@ -30,14 +31,24 @@ create_variants_for_score_set, map_variants_for_score_set, variant_mapper_manager, + submit_score_set_mappings_to_ldh, + link_clingen_variants, ) from tests.helpers.constants import ( TEST_CDOT_TRANSCRIPT, + TEST_CLINGEN_SUBMISSION_RESPONSE, + TEST_CLINGEN_SUBMISSION_BAD_RESQUEST_RESPONSE, + TEST_CLINGEN_SUBMISSION_UNAUTHORIZED_RESPONSE, + TEST_CLINGEN_LDH_LINKING_RESPONSE, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_EXPERIMENT, TEST_MINIMAL_SEQ_SCORESET, TEST_VARIANT_MAPPING_SCAFFOLD, VALID_ACCESSION, + TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS1_X, + TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X, + TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, + TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, ) from tests.helpers.util import awaitable_exception @@ -91,6 +102,30 @@ async def setup_records_files_and_variants(session, async_client, data_files, in return score_set_with_variants +async def setup_records_files_and_variants_with_mapping( + session, async_client, data_files, 
input_score_set, standalone_worker_context +): + score_set = await setup_records_files_and_variants( + session, async_client, data_files, input_score_set, standalone_worker_context + ) + await sanitize_mapping_queue(standalone_worker_context, score_set) + + async def dummy_mapping_job(): + return await setup_mapping_output(async_client, session, score_set) + + with patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_mapping_job(), + ): + result = await map_variants_for_score_set(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert result["success"] + assert not result["retried"] + assert result["enqueued_job"] is not None + return session.scalars(select(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn)).one() + + async def sanitize_mapping_queue(standalone_worker_context, score_set): queued_job = await standalone_worker_context["redis"].rpop(MAPPING_QUEUE_NAME) assert int(queued_job.decode("utf-8")) == score_set.id @@ -108,10 +143,10 @@ async def setup_mapping_output(async_client, session, score_set, empty=False): variants = session.scalars(select(Variant).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn)).all() for variant in variants: mapped_score = { - "pre_mapped": {"test": "pre_mapped_output"}, - "pre_mapped_2_0": {"test": "pre_mapped_output (2.0)"}, - "post_mapped": {"test": "post_mapped_output"}, - "post_mapped_2_0": {"test": "post_mapped_output (2.0)"}, + "pre_mapped": TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS1_X, + "pre_mapped_2_0": TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, + "post_mapped": TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X, + "post_mapped_2_0": TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, "mavedb_id": variant.urn, } @@ -429,6 +464,13 @@ async def test_create_variants_for_score_set_enqueues_manager_and_successful_map async def dummy_mapping_job(): return await setup_mapping_output(async_client, session, score_set) + async def dummy_submission_job(): + return 
[TEST_CLINGEN_SUBMISSION_RESPONSE, None] + + # Variants have not yet been created, so infer their URNs. + async def dummy_linking_job(): + return [(f"{score_set_urn}#{i}", TEST_CLINGEN_LDH_LINKING_RESPONSE) for i in range(1, len(scores) + 1)] + with ( patch.object( cdot.hgvs.dataproviders.RESTDataProvider, @@ -438,9 +480,11 @@ async def dummy_mapping_job(): patch.object( _UnixSelectorEventLoop, "run_in_executor", - return_value=dummy_mapping_job(), + side_effect=[dummy_mapping_job(), dummy_submission_job(), dummy_linking_job()], ), - patch("mavedb.worker.jobs.BACKOFF_IN_SECONDS", 0), + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + patch("mavedb.worker.jobs.MAPPING_BACKOFF_IN_SECONDS", 0), + patch("mavedb.worker.jobs.LINKING_BACKOFF_IN_SECONDS", 0), ): await arq_redis.enqueue_job("create_variants_for_score_set", uuid4().hex, score_set.id, 1, scores, counts) await arq_worker.async_run() @@ -550,6 +594,7 @@ async def dummy_mapping_job(): assert (await standalone_worker_context["redis"].get(MAPPING_CURRENT_ID_NAME)).decode("utf-8") == "" assert result["success"] assert not result["retried"] + assert result["enqueued_job"] is not None assert len(mapped_variants_for_score_set) == score_set.num_variants assert score_set.mapping_state == MappingState.complete assert score_set.mapping_errors is None @@ -623,6 +668,7 @@ async def dummy_mapping_job(): assert (await standalone_worker_context["redis"].get(MAPPING_CURRENT_ID_NAME)).decode("utf-8") == "" assert result["success"] assert not result["retried"] + assert result["enqueued_job"] is not None assert len(mapped_variants_for_score_set) == score_set.num_variants + 1 assert len(preexisting_variants) == 1 assert len(new_variants) == score_set.num_variants @@ -1015,6 +1061,7 @@ async def dummy_mapping_job(): assert (await standalone_worker_context["redis"].get(MAPPING_CURRENT_ID_NAME)).decode("utf-8") == "" assert result["success"] assert not result["retried"] + assert result["enqueued_job"] is 
not None assert len(mapped_variants_for_score_set) == 0 assert score_set.mapping_state == MappingState.failed @@ -1348,6 +1395,17 @@ async def test_mapping_manager_enqueues_mapping_process_with_successful_mapping( async def dummy_mapping_job(): return await setup_mapping_output(async_client, session, score_set) + async def dummy_submission_job(): + return [TEST_CLINGEN_SUBMISSION_RESPONSE, None] + + async def dummy_linking_job(): + return [ + (variant_urn, TEST_CLINGEN_LDH_LINKING_RESPONSE) + for variant_urn in session.scalars( + select(Variant.urn).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ).all() + ] + # We seem unable to mock requests via requests_mock that occur inside another event loop. Workaround # this limitation by instead patching the _UnixSelectorEventLoop 's executor function, with a coroutine # object that sets up test mappingn output. @@ -1355,9 +1413,11 @@ async def dummy_mapping_job(): patch.object( _UnixSelectorEventLoop, "run_in_executor", - return_value=dummy_mapping_job(), + side_effect=[dummy_mapping_job(), dummy_submission_job(), dummy_linking_job()], ), - patch("mavedb.worker.jobs.BACKOFF_IN_SECONDS", 0), + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + patch("mavedb.worker.jobs.MAPPING_BACKOFF_IN_SECONDS", 0), + patch("mavedb.worker.jobs.LINKING_BACKOFF_IN_SECONDS", 0), ): await arq_redis.enqueue_job("variant_mapper_manager", uuid4().hex, 1) await arq_worker.async_run() @@ -1392,6 +1452,17 @@ async def failed_mapping_job(): async def dummy_mapping_job(): return await setup_mapping_output(async_client, session, score_set) + async def dummy_submission_job(): + return [TEST_CLINGEN_SUBMISSION_RESPONSE, None] + + async def dummy_linking_job(): + return [ + (variant_urn, TEST_CLINGEN_LDH_LINKING_RESPONSE) + for variant_urn in session.scalars( + select(Variant.urn).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ).all() + ] + # We seem unable to mock requests via 
requests_mock that occur inside another event loop. Workaround # this limitation by instead patching the _UnixSelectorEventLoop 's executor function, with a coroutine # object that sets up test mappingn output. @@ -1399,9 +1470,11 @@ async def dummy_mapping_job(): patch.object( _UnixSelectorEventLoop, "run_in_executor", - side_effect=[failed_mapping_job(), dummy_mapping_job()], + side_effect=[failed_mapping_job(), dummy_mapping_job(), dummy_submission_job(), dummy_linking_job()], ), - patch("mavedb.worker.jobs.BACKOFF_IN_SECONDS", 0), + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + patch("mavedb.worker.jobs.MAPPING_BACKOFF_IN_SECONDS", 0), + patch("mavedb.worker.jobs.LINKING_BACKOFF_IN_SECONDS", 0), ): await arq_redis.enqueue_job("variant_mapper_manager", uuid4().hex, 1) await arq_worker.async_run() @@ -1442,7 +1515,7 @@ async def failed_mapping_job(): "run_in_executor", side_effect=[failed_mapping_job()] * 5, ), - patch("mavedb.worker.jobs.BACKOFF_IN_SECONDS", 0), + patch("mavedb.worker.jobs.MAPPING_BACKOFF_IN_SECONDS", 0), ): await arq_redis.enqueue_job("variant_mapper_manager", uuid4().hex, 1) await arq_worker.async_run() @@ -1457,3 +1530,601 @@ async def failed_mapping_job(): assert len(mapped_variants_for_score_set) == 0 assert score_set.mapping_state == MappingState.failed assert score_set.mapping_errors is not None + + +############################################################################################################################################ +# ClinGen Submission +############################################################################################################################################ + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_success( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + 
TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_submission_job(): + return [TEST_CLINGEN_SUBMISSION_RESPONSE, None] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. + with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_submission_job(), + ), + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert result["success"] + assert not result["retried"] + assert result["enqueued_job"] is not None + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_exception_in_setup( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + with patch( + "mavedb.worker.jobs.setup_job_state", + side_effect=Exception(), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_exception_in_auth( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + with patch.object( + ClinGenLdhService, + "_existing_jwt", + side_effect=Exception(), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert not 
result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_no_variants_exist( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + with ( + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_exception_in_hgvs_generation( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + with patch( + "mavedb.lib.variants.hgvs_from_mapped_variant", + side_effect=Exception(), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_exception_in_ldh_submission_construction( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + with patch( + "mavedb.lib.clingen.content_constructors.construct_ldh_submission", + side_effect=Exception(), + ): + result = await 
submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_exception_during_submission( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def failed_submission_job(): + return Exception() + + with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + side_effect=failed_submission_job(), + ), + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "error_response", [TEST_CLINGEN_SUBMISSION_BAD_RESQUEST_RESPONSE, TEST_CLINGEN_SUBMISSION_UNAUTHORIZED_RESPONSE] +) +async def test_submit_score_set_mappings_to_ldh_submission_failures_exist( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis, error_response +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_submission_job(): + return [None, error_response] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. 
+ with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_submission_job(), + ), + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_exception_during_linking_enqueue( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_submission_job(): + return [TEST_CLINGEN_SUBMISSION_RESPONSE, None] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. 
+ with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_submission_job(), + ), + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + patch.object(ArqRedis, "enqueue_job", side_effect=Exception()), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_submit_score_set_mappings_to_ldh_linking_not_queued_when_expected( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_submission_job(): + return [TEST_CLINGEN_SUBMISSION_RESPONSE, None] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. 
+ with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_submission_job(), + ), + patch.object(ClinGenLdhService, "_existing_jwt", return_value="test_jwt"), + patch.object(ArqRedis, "enqueue_job", return_value=None), + ): + result = await submit_score_set_mappings_to_ldh(standalone_worker_context, uuid4().hex, score_set.id) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +############################################################################################################################################## +## ClinGen Linkage +############################################################################################################################################## + + +@pytest.mark.asyncio +async def test_link_score_set_mappings_to_ldh_objects_success( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_linking_job(): + return [ + (variant_urn, TEST_CLINGEN_LDH_LINKING_RESPONSE) + for variant_urn in session.scalars( + select(Variant.urn).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ).all() + ] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. 
+ with patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_linking_job(), + ): + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + for variant in session.scalars( + select(MappedVariant).join(Variant).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ): + assert variant.clingen_allele_id == clingen_allele_id_from_ldh_variation(TEST_CLINGEN_LDH_LINKING_RESPONSE) + + +@pytest.mark.asyncio +async def test_link_score_set_mappings_to_ldh_objects_exception_in_setup( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + with patch( + "mavedb.worker.jobs.setup_job_state", + side_effect=Exception(), + ): + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + for variant in session.scalars( + select(MappedVariant).join(Variant).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ): + assert variant.clingen_allele_id is None + + +@pytest.mark.asyncio +async def test_link_score_set_mappings_to_ldh_objects_no_variants_to_link( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async 
def test_link_score_set_mappings_to_ldh_objects_exception_during_linkage( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. + with patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + side_effect=Exception(), + ): + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_link_score_set_mappings_to_ldh_objects_exception_while_parsing_linkages( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. 
+ with patch( + "mavedb.lib.clingen.linked_data_hub.clingen_allele_id_from_ldh_variation", + side_effect=Exception(), + ): + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_link_score_set_mappings_to_ldh_objects_failures_exist_but_do_not_eclipse_retry_threshold( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_linking_job(): + return [ + (variant_urn, None) + for variant_urn in session.scalars( + select(Variant.urn).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ).all() + ] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. 
+ with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_linking_job(), + ), + patch( + "mavedb.worker.jobs.LINKED_DATA_RETRY_THRESHOLD", + 2, + ), + ): + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_link_score_set_mappings_to_ldh_objects_failures_exist_and_eclipse_retry_threshold( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_linking_job(): + return [ + (variant_urn, None) + for variant_urn in session.scalars( + select(Variant.urn).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ).all() + ] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. 
+ with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_linking_job(), + ), + patch( + "mavedb.worker.jobs.LINKED_DATA_RETRY_THRESHOLD", + 1, + ), + patch( + "mavedb.worker.jobs.LINKING_BACKOFF_IN_SECONDS", + 0, + ), + ): + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert not result["success"] + assert result["retried"] + assert result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_link_score_set_mappings_to_ldh_objects_failures_exist_and_eclipse_retry_threshold_cant_enqueue( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_linking_job(): + return [ + (variant_urn, None) + for variant_urn in session.scalars( + select(Variant.urn).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ).all() + ] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. 
+ with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_linking_job(), + ), + patch( + "mavedb.worker.jobs.LINKED_DATA_RETRY_THRESHOLD", + 1, + ), + patch.object(ArqRedis, "enqueue_job", return_value=awaitable_exception()), + ): + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 1) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"] + + +@pytest.mark.asyncio +async def test_link_score_set_mappings_to_ldh_objects_failures_exist_and_eclipse_retry_threshold_retries_exceeded( + setup_worker_db, standalone_worker_context, session, async_client, data_files, arq_worker, arq_redis +): + score_set = await setup_records_files_and_variants_with_mapping( + session, + async_client, + data_files, + TEST_MINIMAL_SEQ_SCORESET, + standalone_worker_context, + ) + + async def dummy_linking_job(): + return [ + (variant_urn, None) + for variant_urn in session.scalars( + select(Variant.urn).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn) + ).all() + ] + + # We are unable to mock requests via requests_mock that occur inside another event loop. Instead, patch the return + # value of the EventLoop itself, which would have made the request. + with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_linking_job(), + ), + patch( + "mavedb.worker.jobs.LINKED_DATA_RETRY_THRESHOLD", + 1, + ), + patch( + "mavedb.worker.jobs.LINKING_BACKOFF_IN_SECONDS", + 0, + ), + patch( + "mavedb.worker.jobs.BACKOFF_LIMIT", + 1, + ), + ): + result = await link_clingen_variants(standalone_worker_context, uuid4().hex, score_set.id, 2) + + assert not result["success"] + assert not result["retried"] + assert not result["enqueued_job"]