From a6915246808598d63075b779625345bf75f70bb7 Mon Sep 17 00:00:00 2001 From: daletovar Date: Fri, 7 Aug 2020 17:17:54 -0700 Subject: [PATCH 1/3] add ts_to_dataset --- sgkit/api.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/sgkit/api.py b/sgkit/api.py index 3408b6052..bab5e964e 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -24,7 +24,6 @@ def create_genotype_call_dataset( variant_id: Any = None, ) -> xr.Dataset: """Create a dataset of genotype calls. - Parameters ---------- variant_contig_names : list of str @@ -46,12 +45,10 @@ def create_genotype_call_dataset( omitted all calls are unphased. variant_id: array_like, str or object, optional The unique identifier of the variant. - Returns ------- :class:`xarray.Dataset` The dataset of genotype calls. - """ check_array_like(variant_contig, kind="i", ndim=1) check_array_like(variant_position, kind="i", ndim=1) @@ -115,12 +112,10 @@ def create_genotype_dosage_dataset( missing value. variant_id: array_like, str or object, optional The unique identifier of the variant. - Returns ------- xr.Dataset The dataset of genotype calls. - """ check_array_like(variant_contig, kind="i", ndim=1) check_array_like(variant_position, kind="i", ndim=1) @@ -149,3 +144,31 @@ def create_genotype_dosage_dataset( data_vars["variant_id"] = ([DIM_VARIANT], variant_id) attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names} return xr.Dataset(data_vars=data_vars, attrs=attrs) + + +def ts_to_dataset(ts, samples=None): + """ + Convert the specified tskit tree sequence into an sgkit dataset. + Note this just generates haploids for now. With msprime 1.0, we'll be + able to generate diploid/whatever-ploid individuals easily. + """ + if samples is None: + samples = ts.samples() + tables = ts.dump_tables() + alleles = [] + genotypes = [] + for var in ts.variants(samples=samples): + alleles.append(var.alleles) + genotypes.append(var.genotypes) + alleles = np.array(alleles).astype("S") + genotypes = np.expand_dims(genotypes, axis=2) + + df = create_genotype_call_dataset( + variant_contig_names=["1"], + variant_contig=np.zeros(len(tables.sites), dtype=int), + variant_position=tables.sites.position.astype(int), + variant_alleles=alleles, + sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"), + call_genotype=genotypes, + ) + return df \ No newline at end of file From a8ebfa3d0d44a85cb8c90563a629b1abfb3a47fd Mon Sep 17 00:00:00 2001 From: daletovar Date: Fri, 7 Aug 2020 17:18:20 -0700 Subject: [PATCH 2/3] add minimal diversity and divergence remove ts_to_dataset from public api make divergence take in two datasets add minimal fst Add read_vcfzarr (#40) add ts_to_dataset add tajimas d fix allele count update cfg remove spaces add msprime and use np.testing add libgsl-dev dependency add docstrings ignore dep warning fix divide by zero
--- .github/workflows/build.yml | 1 + requirements-dev.txt | 1 + setup.cfg | 3 +- sgkit/__init__.py | 5 ++ sgkit/api.py | 33 ++----- sgkit/stats/popgen.py | 166 ++++++++++++++++++++++++++++++++++++ sgkit/tests/test_popgen.py | 75 ++++++++++++++++ 7 files changed, 255 insertions(+), 29 deletions(-) create mode 100644 sgkit/stats/popgen.py create mode 100644 sgkit/tests/test_popgen.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 46dd80499..f287e3162 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,6 +18,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + sudo apt install libgsl-dev # Needed for msprime < 1.0. Binary wheels include GSL for >= 1.0 python -m pip install --upgrade pip pip install -r requirements.txt -r requirements-dev.txt - name: Run pre-commit diff --git a/requirements-dev.txt b/requirements-dev.txt index 90caa787d..53fde6be1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,5 +6,6 @@ pytest-datadir hypothesis statsmodels zarr +msprime sphinx sphinx_rtd_theme diff --git a/setup.cfg b/setup.cfg index abe82e850..8fc2c6199 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,6 +44,7 @@ addopts = --doctest-modules --ignore=validation norecursedirs = .eggs docs filterwarnings = error + ignore::DeprecationWarning [flake8] ignore = @@ -61,7 +62,7 @@ ignore = profile = black default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr +known_third_party = dask,fire,glow,hail,hypothesis,invoke,msprime,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 9a90dc1f0..793776e33 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -11,6 +11,7 @@ from .stats.aggregation import count_call_alleles, count_variant_alleles, variant_stats from .stats.association import gwas_linear_regression from .stats.hwe import hardy_weinberg_test +from .stats.popgen import Fst, Tajimas_D, divergence, diversity from .stats.regenie import regenie __all__ = [ @@ -28,4 +29,8 @@ "regenie", "hardy_weinberg_test", "variant_stats", + "diversity", + "divergence", + "Fst", + "Tajimas_D", ] diff --git a/sgkit/api.py b/sgkit/api.py index bab5e964e..3408b6052 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -24,6 +24,7 @@ def create_genotype_call_dataset( variant_id: Any = None, ) -> xr.Dataset: """Create a dataset of genotype calls. + Parameters ---------- variant_contig_names : list of str @@ -45,10 +46,12 @@ def create_genotype_call_dataset( omitted all calls are unphased. variant_id: array_like, str or object, optional The unique identifier of the variant. + Returns ------- :class:`xarray.Dataset` The dataset of genotype calls. + """ check_array_like(variant_contig, kind="i", ndim=1) check_array_like(variant_position, kind="i", ndim=1) @@ -112,10 +115,12 @@ def create_genotype_dosage_dataset( missing value.
variant_id: array_like, str or object, optional The unique identifier of the variant. + Returns ------- xr.Dataset The dataset of genotype calls. + """ check_array_like(variant_contig, kind="i", ndim=1) check_array_like(variant_position, kind="i", ndim=1) @@ -144,31 +149,3 @@ def create_genotype_dosage_dataset( data_vars["variant_id"] = ([DIM_VARIANT], variant_id) attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names} return xr.Dataset(data_vars=data_vars, attrs=attrs) - - -def ts_to_dataset(ts, samples=None): - """ - Convert the specified tskit tree sequence into an sgkit dataset. - Note this just generates haploids for now. With msprime 1.0, we'll be - able to generate diploid/whatever-ploid individuals easily. - """ - if samples is None: - samples = ts.samples() - tables = ts.dump_tables() - alleles = [] - genotypes = [] - for var in ts.variants(samples=samples): - alleles.append(var.alleles) - genotypes.append(var.genotypes) - alleles = np.array(alleles).astype("S") - genotypes = np.expand_dims(genotypes, axis=2) - - df = create_genotype_call_dataset( - variant_contig_names=["1"], - variant_contig=np.zeros(len(tables.sites), dtype=int), - variant_position=tables.sites.position.astype(int), - variant_alleles=alleles, - sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"), - call_genotype=genotypes, - ) - return df \ No newline at end of file diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py new file mode 100644 index 000000000..cb64f23d4 --- /dev/null +++ b/sgkit/stats/popgen.py @@ -0,0 +1,166 @@ +from typing import Hashable + +import dask.array as da +import numpy as np +import xarray as xr +from xarray import DataArray, Dataset + +from .aggregation import count_variant_alleles + + +def diversity( + ds: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute diversity from allele counts. + + Because we're not providing any arguments on windowing, etc, + we return the total over the whole region. Maybe this isn't + the behaviour we want, but it's a starting point. Note that + this is different to the tskit default behaviour where we + normalise by the size of windows so that results + in different windows are comparable. However, we don't have + any information about the overall length of the sequence here + so we can't normalise by it. + + Parameters + ---------- + ds : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + diversity value. + """ + if len(ds.samples) < 2: + return xr.DataArray(np.nan) + if allele_counts not in ds: + ds = count_variant_alleles(ds) + ac = ds[allele_counts] + an = ac.sum(axis=1) + n_pairs = an * (an - 1) / 2 + n_same = (ac * (ac - 1) / 2).sum(axis=1) + n_diff = n_pairs - n_same + pi = n_diff / n_pairs + return pi.sum() # type: ignore[no-any-return] + + +def divergence( + ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute divergence between two genotype call datasets. + + Parameters + ---------- + ds1 : Dataset + Genotype call dataset. + ds2 : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + divergence value between the two datasets. 
+ """ + if allele_counts not in ds1: + ds1 = count_variant_alleles(ds1) + ac1 = ds1[allele_counts] + if allele_counts not in ds2: + ds2 = count_variant_alleles(ds2) + ac2 = ds2[allele_counts] + an1 = ds1[allele_counts].sum(axis=1) + an2 = ds2[allele_counts].sum(axis=1) + + n_pairs = an1 * an2 + n_same = (ac1 * ac2).sum(axis=1) + n_diff = n_pairs - n_same + div = n_diff / n_pairs + return div.sum() # type: ignore[no-any-return] + + +def Fst( + ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute Fst between two genotype call datasets. + + Parameters + ---------- + ds1 : Dataset + Genotype call dataset. + ds2 : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + fst value between the two datasets. + """ + total_div = diversity(ds1) + diversity(ds2) + gs = divergence(ds1, ds2) + den = total_div + 2 * gs # type: ignore[operator] + fst = 1 - (2 * total_div / den) + return fst # type: ignore[no-any-return] + + +def Tajimas_D( + ds: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute Tajimas' D for a genotype call dataset. + + Parameters + ---------- + ds : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + Tajimas' D value. + """ + if allele_counts not in ds: + ds = count_variant_alleles(ds) + ac = ds[allele_counts] + + # count segregating + S = ((ac > 0).sum(axis=1) > 1).sum() + + # assume number of chromosomes sampled is constant for all variants + n = ac.sum(axis=1).max() + + # (n-1)th harmonic number + a1 = (1 / da.arange(1, n)).sum() + + # calculate Watterson's theta (absolute value) + theta = S / a1 + + # calculate diversity + div = diversity(ds) + + # N.B., both theta estimates are usually divided by the number of + # (accessible) bases but here we want the absolute difference + d = div - theta + + # calculate the denominator (standard deviation) + a2 = (1 / (da.arange(1, n) ** 2)).sum() + b1 = (n + 1) / (3 * (n - 1)) + b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1)) + c1 = b1 - (1 / a1) + c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2)) + e1 = c1 / a1 + e2 = c2 / (a1 ** 2 + a2) + d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) + + if d_stdev == 0: + return xr.DataArray(np.nan) + + # finally calculate Tajima's D + D = d / d_stdev + return D # type: ignore[no-any-return] diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py new file mode 100644 index 000000000..478f4cae0 --- /dev/null +++ b/sgkit/tests/test_popgen.py @@ -0,0 +1,75 @@ +import msprime # type: ignore +import numpy as np +import pytest + +from sgkit import Fst, Tajimas_D, create_genotype_call_dataset, divergence, diversity + + +def ts_to_dataset(ts, samples=None): + """ + Convert the specified tskit tree sequence into an sgkit dataset. + Note this just generates haploids for now. With msprime 1.0, we'll be + able to generate diploid/whatever-ploid individuals easily. 
+ """ + if samples is None: + samples = ts.samples() + tables = ts.dump_tables() + alleles = [] + genotypes = [] + for var in ts.variants(samples=samples): + alleles.append(var.alleles) + genotypes.append(var.genotypes) + alleles = np.array(alleles).astype("S") + genotypes = np.expand_dims(genotypes, axis=2) + + df = create_genotype_call_dataset( + variant_contig_names=["1"], + variant_contig=np.zeros(len(tables.sites), dtype=int), + variant_position=tables.sites.position.astype(int), + variant_alleles=alleles, + sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"), + call_genotype=genotypes, + ) + return df + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_diversity(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + div = diversity(ds).compute() + ts_div = ts.diversity(span_normalise=False) + np.testing.assert_allclose(div, ts_div) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_divergence(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + subset_1 = ts.samples()[: ts.num_samples // 2] + subset_2 = ts.samples()[ts.num_samples // 2 :] + ds1 = ts_to_dataset(ts, subset_1) # type: ignore[no-untyped-call] + ds2 = ts_to_dataset(ts, subset_2) # type: ignore[no-untyped-call] + div = divergence(ds1, ds2).compute() + ts_div = ts.divergence([subset_1, subset_2], span_normalise=False) + np.testing.assert_allclose(div, ts_div) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_Fst(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + subset_1 = ts.samples()[: ts.num_samples // 2] + subset_2 = ts.samples()[ts.num_samples // 2 :] + ds1 = ts_to_dataset(ts, subset_1) # type: ignore[no-untyped-call] + ds2 = ts_to_dataset(ts, subset_2) # type: ignore[no-untyped-call] + fst = Fst(ds1, ds2).compute() + ts_fst = ts.Fst([subset_1, subset_2]) + np.testing.assert_allclose(fst, ts_fst) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_Tajimas_D(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + ts_d = ts.Tajimas_D() + d = Tajimas_D(ds).compute() + np.testing.assert_allclose(d, ts_d) From f09fac3606d3ab058c75b2518572286276e0b822 Mon Sep 17 00:00:00 2001 From: Dale Tovar Date: Thu, 3 Sep 2020 17:18:08 -0700 Subject: [PATCH 3/3] add functions to rst and avoid modifying input ds --- docs/api.rst | 4 ++++ sgkit/stats/popgen.py | 30 +++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 16d055d7f..fcffa357e 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -24,10 +24,14 @@ Methods count_call_alleles count_variant_alleles + divergence + diversity + Fst gwas_linear_regression hardy_weinberg_test regenie variant_stats + Tajimas_D Utilities ========= diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index cb64f23d4..abfc93c44 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -37,8 +37,10 @@ def diversity( if len(ds.samples) < 2: return xr.DataArray(np.nan) if allele_counts not in ds: - ds = count_variant_alleles(ds) - ac = ds[allele_counts] + ds_new = count_variant_alleles(ds) + else: + ds_new = ds + ac = ds_new[allele_counts] an = ac.sum(axis=1) n_pairs = an * (an - 1) / 2 n_same = (ac * (ac - 1) / 2).sum(axis=1) @@ -67,13 +69,17 @@ def divergence( divergence value between the two datasets. 
""" if allele_counts not in ds1: - ds1 = count_variant_alleles(ds1) - ac1 = ds1[allele_counts] + ds1_new = count_variant_alleles(ds1) + else: + ds1_new = ds1 + ac1 = ds1_new[allele_counts] if allele_counts not in ds2: - ds2 = count_variant_alleles(ds2) - ac2 = ds2[allele_counts] - an1 = ds1[allele_counts].sum(axis=1) - an2 = ds2[allele_counts].sum(axis=1) + ds2_new = count_variant_alleles(ds2) + else: + ds2_new = ds2 + ac2 = ds2_new[allele_counts] + an1 = ds1_new[allele_counts].sum(axis=1) + an2 = ds2_new[allele_counts].sum(axis=1) n_pairs = an1 * an2 n_same = (ac1 * ac2).sum(axis=1) @@ -126,8 +132,10 @@ def Tajimas_D( Tajimas' D value. """ if allele_counts not in ds: - ds = count_variant_alleles(ds) - ac = ds[allele_counts] + ds_new = count_variant_alleles(ds) + else: + ds_new = ds + ac = ds_new[allele_counts] # count segregating S = ((ac > 0).sum(axis=1) > 1).sum() @@ -142,7 +150,7 @@ def Tajimas_D( theta = S / a1 # calculate diversity - div = diversity(ds) + div = diversity(ds_new) # N.B., both theta estimates are usually divided by the number of # (accessible) bases but here we want the absolute difference