diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 6417b5646..22b933c5a 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -4,6 +4,7 @@ DIM_SAMPLE, DIM_VARIANT, create_genotype_call_dataset, + create_genotype_dosage_dataset, ) from .stats.association import gwas_linear_regression @@ -13,5 +14,6 @@ "DIM_SAMPLE", "DIM_VARIANT", "create_genotype_call_dataset", + "create_genotype_dosage_dataset", "gwas_linear_regression", ] diff --git a/sgkit/api.py b/sgkit/api.py index 3fcdb6744..eb5b68486 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -1,5 +1,6 @@ from typing import Any, Dict, Hashable, List +import numpy as np import xarray as xr from .utils import check_array_like @@ -79,3 +80,59 @@ def create_genotype_call_dataset( data_vars["variant/id"] = ([DIM_VARIANT], variant_id) attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names} return xr.Dataset(data_vars=data_vars, attrs=attrs) + + +def create_genotype_dosage_dataset( + *, + variant_contig_names: List[str], + variant_contig: Any, + variant_position: Any, + variant_alleles: Any, + sample_id: Any, + call_dosage: Any, + variant_id: Any = None, +) -> xr.Dataset: + """Create a dataset of genotype calls. + + Parameters + ---------- + variant_contig_names : list of str + The contig names. + variant_contig : array_like, int + The (index of the) contig for each variant. + variant_position : array_like, int + The reference position of the variant. + variant_alleles : array_like, S1 + The possible alleles for the variant. + sample_id : array_like, str + The unique identifier of the sample. + call_dosage : array_like, float + Dosages, encoded as floats, with NaN indicating a + missing value. + variant_id: array_like, str, optional + The unique identifier of the variant. + + Returns + ------- + xr.Dataset + The dataset of genotype calls. + + """ + check_array_like(variant_contig, kind="i", ndim=1) + check_array_like(variant_position, kind="i", ndim=1) + check_array_like(variant_alleles, kind="S", ndim=2) + check_array_like(sample_id, kind="U", ndim=1) + check_array_like(call_dosage, kind="f", ndim=2) + data_vars: Dict[Hashable, Any] = { + "variant/contig": ([DIM_VARIANT], variant_contig), + "variant/position": ([DIM_VARIANT], variant_position), + "variant/alleles": ([DIM_VARIANT, DIM_ALLELE], variant_alleles), + "sample/id": ([DIM_SAMPLE], sample_id), + "call/dosage": ([DIM_VARIANT, DIM_SAMPLE], call_dosage), + "call/dosage_mask": ([DIM_VARIANT, DIM_SAMPLE], np.isnan(call_dosage),), + } + if variant_id is not None: + check_array_like(variant_id, kind="U", ndim=1) + data_vars["variant/id"] = ([DIM_VARIANT], variant_id) + attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names} + return xr.Dataset(data_vars=data_vars, attrs=attrs) diff --git a/sgkit/tests/test_api.py b/sgkit/tests/test_api.py index d37b44e66..127a0421a 100644 --- a/sgkit/tests/test_api.py +++ b/sgkit/tests/test_api.py @@ -7,6 +7,7 @@ DIM_SAMPLE, DIM_VARIANT, create_genotype_call_dataset, + create_genotype_dosage_dataset, ) @@ -48,3 +49,34 @@ def test_create_genotype_call_dataset(): assert_array_equal(ds["call/genotype"], call_genotype) assert_array_equal(ds["call/genotype_mask"], call_genotype < 0) assert_array_equal(ds["call/genotype_phased"], call_genotype_phased) + + +def test_create_genotype_dosage_dataset(): + variant_contig_names = ["chr1"] + variant_contig = np.array([0, 0], dtype="i1") + variant_position = np.array([1000, 2000], dtype="i4") + variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1") + variant_id = np.array(["rs1", "rs2"], dtype=str) + sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str) + call_dosage = np.array([[0.8, 0.9, 1.0], [1.0, 1.1, 1.2]], dtype="f4") + ds = create_genotype_dosage_dataset( + variant_contig_names=variant_contig_names, + variant_contig=variant_contig, + variant_position=variant_position, + variant_alleles=variant_alleles, + sample_id=sample_id, + call_dosage=call_dosage, + variant_id=variant_id, + ) + + assert DIM_VARIANT in ds.dims + assert DIM_SAMPLE in ds.dims + + assert ds.attrs["contigs"] == variant_contig_names + assert_array_equal(ds["variant/contig"], variant_contig) + assert_array_equal(ds["variant/position"], variant_position) + assert_array_equal(ds["variant/alleles"], variant_alleles) + assert_array_equal(ds["variant/id"], variant_id) + assert_array_equal(ds["sample/id"], sample_id) + assert_array_equal(ds["call/dosage"], call_dosage) + assert_array_equal(ds["call/dosage_mask"], np.isnan(call_dosage))