Skip to content
Merged
2 changes: 1 addition & 1 deletion docker-compose-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ services:
- vrs-mapping-data-dev:/var/lib/postgresql/data

seqrepo:
image: biocommons/seqrepo:2021-01-29
image: biocommons/seqrepo:2024-12-20
volumes:
- vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo

Expand Down
2 changes: 1 addition & 1 deletion settings/.env.dev
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ MAVEDB_API_KEY=
# Environment variables for seqrepo
####################################################################################################

SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2021-01-29
SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20
70 changes: 58 additions & 12 deletions src/api/routers/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,31 @@
from fastapi.responses import JSONResponse
from requests import HTTPError

from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result
from dcd_mapping.align import build_alignment_result
from dcd_mapping.annotate import (
_get_computed_reference_sequence,
_get_mapped_reference_sequence,
_set_scoreset_layer,
annotate,
)
from dcd_mapping.lookup import DataLookupError
from dcd_mapping.mavedb_data import (
from dcd_mapping.exceptions import (
AlignmentError,
BlatNotFoundError,
DataLookupError,
MissingSequenceIdError,
ResourceAcquisitionError,
ScoresetNotSupportedError,
UnsupportedReferenceSequenceNameSpaceError,
UnsupportedReferenceSequencePrefixError,
VrsMapError,
)
from dcd_mapping.mavedb_data import (
get_raw_scoreset_metadata,
get_scoreset_metadata,
get_scoreset_records,
patch_target_sequence_type,
with_mavedb_score_set,
)
from dcd_mapping.resource_utils import ResourceAcquisitionError
from dcd_mapping.schemas import (
ScoreAnnotation,
ScoresetMapping,
Expand All @@ -31,7 +39,7 @@
VrsVersion,
)
from dcd_mapping.transcripts import select_transcripts
from dcd_mapping.vrs_map import VrsMapError, vrs_map
from dcd_mapping.vrs_map import vrs_map

router = APIRouter(
prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}}
Expand Down Expand Up @@ -68,6 +76,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
error_message="Score set contains no variants to map",
).model_dump(exclude_none=True)
)
total_score_records = sum(len(v) for v in records.values())

try:
alignment_results = build_alignment_result(metadata, True)
Expand Down Expand Up @@ -114,21 +123,38 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
transcript=transcripts[target_gene],
silent=True,
)
except VrsMapError as e:
except (
UnsupportedReferenceSequenceNameSpaceError,
VrsMapError,
UnsupportedReferenceSequencePrefixError,
MissingSequenceIdError,
) as e:
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)
if not vrs_results or all(
mapping_result is None for mapping_result in vrs_results.values()
):

nonetype_vrs_results = [
result is None
for target_gene in vrs_results
for result in vrs_results[target_gene]
]

if not vrs_results or all(nonetype_vrs_results):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="No variant mappings available for this score set",
).model_dump(exclude_none=True)
)
if any(nonetype_vrs_results):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="Some variants generated vrs results, but not all. If any variants were mapped, all should have been.",
).model_dump(exclude_none=True)
)

annotated_vrs_results = {}
try:
Expand All @@ -146,15 +172,27 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)
if not annotated_vrs_results or all(
mapping_result is None for mapping_result in annotated_vrs_results.values()
):

nonetype_annotated_vrs_results = [
result is None
for target_gene in annotated_vrs_results
for result in annotated_vrs_results[target_gene]
]

if not annotated_vrs_results or all(nonetype_annotated_vrs_results):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="No annotated variant mappings available for this score set",
).model_dump(exclude_none=True)
)
if any(nonetype_annotated_vrs_results):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="Some variants generated annotated vrs results, but not all. If any variants were annotated, all should have been.",
).model_dump(exclude_none=True)
)

try:
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
Expand Down Expand Up @@ -235,6 +273,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
).model_dump(exclude_none=True)
)

if len(mapped_scores) != total_score_records:
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message=f"Mismatch between number of mapped scores ({len(mapped_scores)}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.",
).model_dump(exclude_none=True)
)

return JSONResponse(
content=ScoresetMapping(
metadata=raw_metadata,
Expand Down
19 changes: 7 additions & 12 deletions src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@
from Bio.SearchIO._model import Hit, QueryResult
from cool_seq_tool.schemas import Strand

from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError
from dcd_mapping.resource_utils import (
from dcd_mapping.exceptions import (
AlignmentError,
BlatNotFoundError,
ResourceAcquisitionError,
http_download,
ScoresetNotSupportedError,
)
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH
from dcd_mapping.resource_utils import http_download
from dcd_mapping.schemas import (
AlignmentResult,
GeneLocation,
Expand All @@ -32,14 +35,6 @@
_logger = logging.getLogger(__name__)


class AlignmentError(Exception):
"""Raise when errors encountered during alignment."""


class BlatNotFoundError(AlignmentError):
"""Raise when BLAT binary appears to be missing."""


def _write_query_file(file: Path, lines: list[str]) -> None:
"""Write lines to query file. This method is broken out to enable easy mocking while
testing.
Expand Down
84 changes: 52 additions & 32 deletions src/dcd_mapping/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def _get_vrs_ref_allele_seq(
metadata: TargetGene,
urn: str,
tx_select_results: TxSelectResult | None,
) -> Extension:
) -> Extension | None:
"""Create `vrs_ref_allele_seq` property."""
start, end = _offset_allele_ref_seq(urn, allele.location.start, allele.location.end)
if (
Expand All @@ -161,8 +161,12 @@ def _get_vrs_ref_allele_seq(
seq = f"ga4gh:{allele.location.sequenceReference.refgetAccession}" # type: ignore
sr = get_seqrepo()
ref = sr.get_sequence(seq, start, end)
if ref is None:
raise ValueError

if not ref:
msg = f"Could not retrieve reference sequence for allele {allele.id} in urn {urn} with start {start} and end {end}"
_logger.warning(msg)
return None

return Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)


Expand Down Expand Up @@ -256,23 +260,28 @@ def _annotate_allele_mapping(
post_mapped: Allele = mapped_score.post_mapped

# get vrs_ref_allele_seq for pre-mapped variants
pre_mapped.extensions = [
_get_vrs_ref_allele_seq(pre_mapped, metadata, urn, tx_results)
]
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
pre_mapped, metadata, urn, tx_results
)
if ref_allele_seq_extension is not None:
pre_mapped.extensions = [ref_allele_seq_extension]

if post_mapped:
# Determine reference sequence
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}"
accession = get_chromosome_identifier_from_vrs_id(sequence_id)
if accession is None:
raise ValueError
if accession.startswith("refseq:"):
accession = None
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
elif accession.startswith("refseq:"):
accession = accession[7:]
else:
if tx_results is None or isinstance(tx_results, TxSelectError):
raise ValueError # impossible by definition
accession = tx_results.np
accession = None
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
else:
accession = tx_results.np

sr = get_seqrepo()
loc = mapped_score.post_mapped.location
Expand All @@ -281,8 +290,9 @@ def _annotate_allele_mapping(
post_mapped.extensions = [
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
]
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]
if accession:
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]

if vrs_version == VrsVersion.V_1_3:
pre_mapped = _allele_to_vod(pre_mapped)
Expand All @@ -295,9 +305,7 @@ def _annotate_allele_mapping(
mavedb_id=mapped_score.accession_id,
score=float(mapped_score.score) if mapped_score.score else None,
annotation_layer=mapped_score.annotation_layer,
error_message=mapped_score.error_message
if mapped_score.error_message
else None, # TODO might not need if statement here
error_message=mapped_score.error_message,
)


Expand All @@ -311,23 +319,32 @@ def _annotate_haplotype_mapping(
"""Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings."""
pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore
post_mapped: Haplotype = mapped_score.post_mapped # type: ignore

# get vrs_ref_allele_seq for pre-mapped variants
for allele in pre_mapped.members:
allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, urn, tx_results)]
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
allele, metadata, urn, tx_results
)
if ref_allele_seq_extension is not None:
allele.extensions = [ref_allele_seq_extension]

if post_mapped:
# Determine reference sequence
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
sequence_id = f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}"
accession = get_chromosome_identifier_from_vrs_id(sequence_id)
if accession is None:
raise ValueError
if accession.startswith("refseq:"):
accession = None
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
elif accession.startswith("refseq:"):
accession = accession[7:]
else:
if tx_results is None or isinstance(tx_results, TxSelectError):
raise ValueError # impossible by definition
accession = tx_results.np
# impossible by definition
accession = None
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
else:
accession = tx_results.np

sr = get_seqrepo()
for allele in post_mapped.members:
Expand All @@ -337,8 +354,9 @@ def _annotate_haplotype_mapping(
allele.extensions = [
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
]
hgvs, syntax = _get_hgvs_string(allele, accession)
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
if accession:
hgvs, syntax = _get_hgvs_string(allele, accession)
allele.expressions = [Expression(syntax=syntax, value=hgvs)]

if vrs_version == VrsVersion.V_1_3:
pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped)
Expand All @@ -351,9 +369,7 @@ def _annotate_haplotype_mapping(
mavedb_id=mapped_score.accession_id,
score=float(mapped_score.score) if mapped_score.score is not None else None,
annotation_layer=mapped_score.annotation_layer,
error_message=mapped_score.error_message
if mapped_score.error_message
else None, # TODO might not need if statement here
error_message=mapped_score.error_message,
)


Expand Down Expand Up @@ -388,6 +404,7 @@ def annotate(
ScoreAnnotationWithLayer(
mavedb_id=mapped_score.accession_id,
score=float(mapped_score.score) if mapped_score.score else None,
vrs_version=vrs_version,
error_message=mapped_score.error_message,
)
)
Expand All @@ -410,8 +427,16 @@ def annotate(
)
)
else:
# TODO how to combine this error message with other potential error messages?
ValueError("inconsistent variant structure")
score_annotations.append(
ScoreAnnotationWithLayer(
pre_mapped=mapped_score.pre_mapped,
post_mapped=mapped_score.post_mapped,
vrs_version=vrs_version,
mavedb_id=mapped_score.accession_id,
score=float(mapped_score.score) if mapped_score.score else None,
error_message=f"Multiple issues with annotation: Inconsistent variant structure (Allele and Haplotype mix).{' ' + mapped_score.error_message if mapped_score.error_message else ''}",
)
)

return score_annotations

Expand Down Expand Up @@ -519,11 +544,6 @@ def _set_scoreset_layer(
expressions. If genomic expressions are available, that's what we'd like to use.
This function tells us how to filter the final annotation objects.
"""
if urn.startswith("urn:mavedb:00000097"):
_logger.debug(
"Manually selecting protein annotation for scores from urn:mavedb:00000097"
)
return AnnotationLayer.PROTEIN
for mapping in mappings:
if mapping.annotation_layer == AnnotationLayer.GENOMIC:
return AnnotationLayer.GENOMIC
Expand Down
10 changes: 6 additions & 4 deletions src/dcd_mapping/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@

import click

from dcd_mapping.align import AlignmentError
from dcd_mapping.exceptions import (
AlignmentError,
ResourceAcquisitionError,
TxSelectError,
VrsMapError,
)
from dcd_mapping.main import map_scoreset_urn
from dcd_mapping.resource_utils import ResourceAcquisitionError
from dcd_mapping.schemas import VrsVersion
from dcd_mapping.transcripts import TxSelectError
from dcd_mapping.vrs_map import VrsMapError

_logger = logging.getLogger(__name__)

Expand Down
Loading