diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 46dd80499..f287e3162 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,6 +18,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + sudo apt install libgsl-dev # Needed for msprime < 1.0. Binary wheels include GSL for >= 1.0 python -m pip install --upgrade pip pip install -r requirements.txt -r requirements-dev.txt - name: Run pre-commit diff --git a/docs/api.rst b/docs/api.rst index 16d055d7f..fcffa357e 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -24,10 +24,14 @@ Methods count_call_alleles count_variant_alleles + divergence + diversity + Fst gwas_linear_regression hardy_weinberg_test regenie variant_stats + Tajimas_D Utilities ========= diff --git a/requirements-dev.txt b/requirements-dev.txt index 90caa787d..53fde6be1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,5 +6,6 @@ pytest-datadir hypothesis statsmodels zarr +msprime sphinx sphinx_rtd_theme diff --git a/setup.cfg b/setup.cfg index abe82e850..8fc2c6199 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,6 +44,7 @@ addopts = --doctest-modules --ignore=validation norecursedirs = .eggs docs filterwarnings = error + ignore::DeprecationWarning [flake8] ignore = @@ -61,7 +62,7 @@ ignore = profile = black default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr +known_third_party = dask,fire,glow,hail,hypothesis,invoke,msprime,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 9a90dc1f0..793776e33 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -11,6 +11,7 @@ from .stats.aggregation import count_call_alleles, count_variant_alleles, variant_stats from .stats.association import gwas_linear_regression from .stats.hwe import hardy_weinberg_test +from .stats.popgen import Fst, Tajimas_D, divergence, diversity from .stats.regenie import regenie __all__ = [ @@ -28,4 +29,8 @@ "regenie", "hardy_weinberg_test", "variant_stats", + "diversity", + "divergence", + "Fst", + "Tajimas_D", ] diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py new file mode 100644 index 000000000..abfc93c44 --- /dev/null +++ b/sgkit/stats/popgen.py @@ -0,0 +1,174 @@ +from typing import Hashable + +import dask.array as da +import numpy as np +import xarray as xr +from xarray import DataArray, Dataset + +from .aggregation import count_variant_alleles + + +def diversity( + ds: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute diversity from allele counts. + + Because we're not providing any arguments on windowing, etc, + we return the total over the whole region. Maybe this isn't + the behaviour we want, but it's a starting point. Note that + this is different to the tskit default behaviour where we + normalise by the size of windows so that results + in different windows are comparable. However, we don't have + any information about the overall length of the sequence here + so we can't normalise by it. + + Parameters + ---------- + ds : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + diversity value. + """ + if len(ds.samples) < 2: + return xr.DataArray(np.nan) + if allele_counts not in ds: + ds_new = count_variant_alleles(ds) + else: + ds_new = ds + ac = ds_new[allele_counts] + an = ac.sum(axis=1) + n_pairs = an * (an - 1) / 2 + n_same = (ac * (ac - 1) / 2).sum(axis=1) + n_diff = n_pairs - n_same + pi = n_diff / n_pairs + return pi.sum() # type: ignore[no-any-return] + + +def divergence( + ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute divergence between two genotype call datasets. + + Parameters + ---------- + ds1 : Dataset + Genotype call dataset. + ds2 : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + divergence value between the two datasets. + """ + if allele_counts not in ds1: + ds1_new = count_variant_alleles(ds1) + else: + ds1_new = ds1 + ac1 = ds1_new[allele_counts] + if allele_counts not in ds2: + ds2_new = count_variant_alleles(ds2) + else: + ds2_new = ds2 + ac2 = ds2_new[allele_counts] + an1 = ds1_new[allele_counts].sum(axis=1) + an2 = ds2_new[allele_counts].sum(axis=1) + + n_pairs = an1 * an2 + n_same = (ac1 * ac2).sum(axis=1) + n_diff = n_pairs - n_same + div = n_diff / n_pairs + return div.sum() # type: ignore[no-any-return] + + +def Fst( + ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute Fst between two genotype call datasets. + + Parameters + ---------- + ds1 : Dataset + Genotype call dataset. + ds2 : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + fst value between the two datasets. + """ + total_div = diversity(ds1) + diversity(ds2) + gs = divergence(ds1, ds2) + den = total_div + 2 * gs # type: ignore[operator] + fst = 1 - (2 * total_div / den) + return fst # type: ignore[no-any-return] + + +def Tajimas_D( + ds: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute Tajimas' D for a genotype call dataset. + + Parameters + ---------- + ds : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + Tajimas' D value. + """ + if allele_counts not in ds: + ds_new = count_variant_alleles(ds) + else: + ds_new = ds + ac = ds_new[allele_counts] + + # count segregating + S = ((ac > 0).sum(axis=1) > 1).sum() + + # assume number of chromosomes sampled is constant for all variants + n = ac.sum(axis=1).max() + + # (n-1)th harmonic number + a1 = (1 / da.arange(1, n)).sum() + + # calculate Watterson's theta (absolute value) + theta = S / a1 + + # calculate diversity + div = diversity(ds_new) + + # N.B., both theta estimates are usually divided by the number of + # (accessible) bases but here we want the absolute difference + d = div - theta + + # calculate the denominator (standard deviation) + a2 = (1 / (da.arange(1, n) ** 2)).sum() + b1 = (n + 1) / (3 * (n - 1)) + b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1)) + c1 = b1 - (1 / a1) + c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2)) + e1 = c1 / a1 + e2 = c2 / (a1 ** 2 + a2) + d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) + + if d_stdev == 0: + return xr.DataArray(np.nan) + + # finally calculate Tajima's D + D = d / d_stdev + return D # type: ignore[no-any-return] diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py new file mode 100644 index 000000000..478f4cae0 --- /dev/null +++ b/sgkit/tests/test_popgen.py @@ -0,0 +1,75 @@ +import msprime # type: ignore +import numpy as np +import pytest + +from sgkit import Fst, Tajimas_D, create_genotype_call_dataset, divergence, diversity + + +def ts_to_dataset(ts, samples=None): + """ + Convert the specified tskit tree sequence into an sgkit dataset. + Note this just generates haploids for now. With msprime 1.0, we'll be + able to generate diploid/whatever-ploid individuals easily. + """ + if samples is None: + samples = ts.samples() + tables = ts.dump_tables() + alleles = [] + genotypes = [] + for var in ts.variants(samples=samples): + alleles.append(var.alleles) + genotypes.append(var.genotypes) + alleles = np.array(alleles).astype("S") + genotypes = np.expand_dims(genotypes, axis=2) + + df = create_genotype_call_dataset( + variant_contig_names=["1"], + variant_contig=np.zeros(len(tables.sites), dtype=int), + variant_position=tables.sites.position.astype(int), + variant_alleles=alleles, + sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"), + call_genotype=genotypes, + ) + return df + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_diversity(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + div = diversity(ds).compute() + ts_div = ts.diversity(span_normalise=False) + np.testing.assert_allclose(div, ts_div) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_divergence(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + subset_1 = ts.samples()[: ts.num_samples // 2] + subset_2 = ts.samples()[ts.num_samples // 2 :] + ds1 = ts_to_dataset(ts, subset_1) # type: ignore[no-untyped-call] + ds2 = ts_to_dataset(ts, subset_2) # type: ignore[no-untyped-call] + div = divergence(ds1, ds2).compute() + ts_div = ts.divergence([subset_1, subset_2], span_normalise=False) + np.testing.assert_allclose(div, ts_div) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_Fst(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + subset_1 = ts.samples()[: ts.num_samples // 2] + subset_2 = ts.samples()[ts.num_samples // 2 :] + ds1 = ts_to_dataset(ts, subset_1) # type: ignore[no-untyped-call] + ds2 = ts_to_dataset(ts, subset_2) # type: ignore[no-untyped-call] + fst = Fst(ds1, ds2).compute() + ts_fst = ts.Fst([subset_1, subset_2]) + np.testing.assert_allclose(fst, ts_fst) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_Tajimas_D(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + ts_d = ts.Tajimas_D() + d = Tajimas_D(ds).compute() + np.testing.assert_allclose(d, ts_d)