Skip to content
108 changes: 55 additions & 53 deletions gnomad/utils/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,56 +36,56 @@

VRS_CHROM_IDS = {
"GRCh38": {
"chr1": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
"chr2": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g",
"chr3": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX",
"chr4": "ga4gh:SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc",
"chr5": "ga4gh:SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI",
"chr6": "ga4gh:SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV",
"chr7": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
"chr8": "ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs",
"chr9": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI",
"chr10": "ga4gh:SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB",
"chr11": "ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1",
"chr12": "ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl",
"chr13": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
"chr14": "ga4gh:SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm",
"chr15": "ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6",
"chr16": "ga4gh:SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0",
"chr17": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
"chr18": "ga4gh:SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV",
"chr19": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
"chr20": "ga4gh:SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo",
"chr21": "ga4gh:SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8",
"chr22": "ga4gh:SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ",
"chrX": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
"chrY": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
"chr1": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
"chr2": "SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g",
"chr3": "SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX",
"chr4": "SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc",
"chr5": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI",
"chr6": "SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV",
"chr7": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
"chr8": "SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs",
"chr9": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI",
"chr10": "SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB",
"chr11": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1",
"chr12": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl",
"chr13": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
"chr14": "SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm",
"chr15": "SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6",
"chr16": "SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0",
"chr17": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
"chr18": "SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV",
"chr19": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
"chr20": "SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo",
"chr21": "SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8",
"chr22": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ",
"chrX": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
"chrY": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
},
"GRCh37": {
"1": "ga4gh:SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU",
"2": "ga4gh:SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO",
"3": "ga4gh:SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS",
"4": "ga4gh:SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V",
"5": "ga4gh:SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX",
"6": "ga4gh:SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek",
"7": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86",
"8": "ga4gh:SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6",
"9": "ga4gh:SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt",
"10": "ga4gh:SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX",
"11": "ga4gh:SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG",
"12": "ga4gh:SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH",
"13": "ga4gh:SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH",
"14": "ga4gh:SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf",
"15": "ga4gh:SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt",
"16": "ga4gh:SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb",
"17": "ga4gh:SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz",
"18": "ga4gh:SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD",
"19": "ga4gh:SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX",
"20": "ga4gh:SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf",
"21": "ga4gh:SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg",
"22": "ga4gh:SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8",
"X": "ga4gh:SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm",
"Y": "ga4gh:SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-",
"1": "SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU",
"2": "SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO",
"3": "SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS",
"4": "SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V",
"5": "SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX",
"6": "SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek",
"7": "SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86",
"8": "SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6",
"9": "SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt",
"10": "SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX",
"11": "SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG",
"12": "SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH",
"13": "SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH",
"14": "SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf",
"15": "SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt",
"16": "SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb",
"17": "SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz",
"18": "SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD",
"19": "SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX",
"20": "SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf",
"21": "SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg",
"22": "SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8",
"X": "SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm",
"Y": "SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-",
},
}

Expand Down Expand Up @@ -2672,9 +2672,13 @@ def add_gks_vrs(
"state": {"type": "LiteralSequenceExpression", "sequence": vrs_state_sequence},
}

location_id = ga4gh_core._internal.identifiers.ga4gh_identify(
ga4gh_vrs.models.SequenceLocation(**vrs_dict_out["location"])
seq_ref = ga4gh_vrs.models.SequenceReference(refgetAccession=vrs_chrom_id)
seq_loc = ga4gh_vrs.models.SequenceLocation(
sequenceReference=seq_ref,
start=vrs_start_value,
end=vrs_end_value,
)
location_id = ga4gh_core.ga4gh_identify(seq_loc)

vrs_dict_out["location"]["_id"] = location_id

Expand Down Expand Up @@ -2860,9 +2864,7 @@ def _create_group_dicts(
ancillaryResults["grpMaxFAF95"] = {
"frequency": input_struct.grpMaxFAF95.grpmax_gen_anc,
"confidenceInterval": 0.95,
"groupId": (
f"{gnomad_id}.{input_struct.grpMaxFAF95.grpmax_gen_anc.upper()}"
),
"groupId": f"{gnomad_id}.{input_struct.grpMaxFAF95.grpmax_gen_anc.upper()}",
}

# Add joint group max FAF if it exists.
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
annoy
ga4gh.vrs[extras]==0.8.4
ga4gh.vrs[extras]==2.0.1
hail
hdbscan
ipywidgets
Expand Down
108 changes: 108 additions & 0 deletions tests/utils/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

from typing import Dict, List

import ga4gh.core as ga4gh_core
import ga4gh.vrs as ga4gh_vrs
import hail as hl
import pytest

from gnomad.utils.annotations import (
VRS_CHROM_IDS,
add_gks_vrs,
annotate_downsamplings,
fill_missing_key_combinations,
get_copy_state_by_sex,
Expand Down Expand Up @@ -1088,6 +1092,110 @@ def test_merge_histograms_sum_with_negatives_error(self, sample_ht):
ht.select(result_hist=result_hist).collect()


class TestVRSFunctions:
"""Test the VRS-related functions."""

def test_vrs_identifier_generation(self):
"""Test that GA4GH identifiers are generated correctly."""
# Test that we can import and access VRS modules
assert hasattr(ga4gh_core, "__version__")
assert hasattr(ga4gh_vrs, "models")
assert hasattr(ga4gh_vrs.models, "SequenceLocation")

# Test that VRS 2.0.1+ API is available
assert hasattr(
ga4gh_core, "ga4gh_identify"
), "VRS 2.0.1+ ga4gh_identify function not found"

# Test that we can create a VRS object using the 2.0.1+ API
seq_loc = ga4gh_vrs.models.SequenceLocation(
sequenceReference="ga4gh:SQ.test",
start=1,
end=2,
)
assert seq_loc is not None

def test_vrs_error_handling(self):
"""Test that VRS functions handle errors gracefully."""
# Test with invalid locus (should raise appropriate error)
with pytest.raises((AttributeError, TypeError)):
# This should fail because we're not providing proper Hail locus objects
add_gks_vrs("invalid_locus", "invalid_vrs")

def test_add_gks_vrs_actual_api_call(self):
"""Test that add_gks_vrs actually calls the VRS 2.0.1 API with real data."""
# Create a real Hail locus and VRS struct that would come from actual data
locus = hl.locus("chr1", 100, reference_genome="GRCh38")

# Create a VRS struct that mimics what would be in actual gnomAD data
vrs_struct = hl.struct(
VRS_Allele_IDs=["test_ref_id", "ga4gh:VA.test_var_id"],
VRS_Starts=[99, 99], # 0-based coordinates
VRS_Ends=[100, 100],
VRS_States=["A", "T"], # ref, alt
)

# Evaluate to get actual Python objects
locus_py = hl.eval(locus)
vrs_py = hl.eval(vrs_struct)

result = add_gks_vrs(locus_py, vrs_py)

# Verify the result has the expected VRS structure
assert isinstance(result, dict)
assert result["type"] == "Allele"
assert "location" in result
assert result["location"]["type"] == "SequenceLocation"
assert "_id" in result["location"] # This comes from ga4gh_identify() call
assert "state" in result
assert result["state"]["type"] == "LiteralSequenceExpression"
assert result["state"]["sequence"] == "T"

# Verify the chromosome ID matches our VRS_CHROM_IDS mapping
expected_chr1_id = VRS_CHROM_IDS["GRCh38"]["chr1"]
assert result["location"]["sequence_id"] == expected_chr1_id
assert expected_chr1_id == "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"

# Verify the location ID was generated by the VRS API
location_id = result["location"]["_id"]
assert isinstance(location_id, str)
assert location_id.startswith("ga4gh:") # VRS identifiers start with ga4gh:

def test_add_gks_vrs_grch37_chromosome_mapping(self):
"""Test that VRS chromosome mapping works correctly for GRCh37."""
# Test GRCh37 (note different chromosome naming: "1" vs "chr1")
locus_grch37 = hl.locus("1", 100, reference_genome="GRCh37")
vrs_struct = hl.struct(
VRS_Allele_IDs=["test_ref_id", "ga4gh:VA.test_var_id"],
VRS_Starts=[99, 99],
VRS_Ends=[100, 100],
VRS_States=["A", "T"],
)

locus_py = hl.eval(locus_grch37)
vrs_py = hl.eval(vrs_struct)

result = add_gks_vrs(locus_py, vrs_py)

# Verify GRCh37 chromosome 1 mapping
expected_chr1_grch37_id = VRS_CHROM_IDS["GRCh37"]["1"]
assert result["location"]["sequence_id"] == expected_chr1_grch37_id
assert expected_chr1_grch37_id == "SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU"

# Verify the API was called and generated a proper identifier
location_id = result["location"]["_id"]
assert isinstance(location_id, str)
assert location_id.startswith("ga4gh:")

def test_vrs_version_compatibility(self):
"""Test that VRS 2.0.1+ is properly installed."""
# Test that VRS 2.0.1+ API is available
assert hasattr(ga4gh_core, "ga4gh_identify"), "VRS 2.0.1+ is required"

# The function should work with VRS 2.0.1+
assert callable(add_gks_vrs)


class TestAnnotateDownsamplings:
"""Test the annotate_downsamplings function."""

Expand Down