sgkit-dev · jeromekelleher · Sep 14, 2020 · Aug 8, 2020 · Aug 8, 2020 · Sep 4, 2020
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -18,6 +18,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
+        sudo apt install libgsl-dev   # Needed for msprime < 1.0. Binary wheels include GSL for >= 1.0
         python -m pip install --upgrade pip
         pip install -r requirements.txt -r requirements-dev.txt
     - name: Run pre-commit

diff --git a/docs/api.rst b/docs/api.rst
@@ -24,10 +24,14 @@ Methods
 
    count_call_alleles
    count_variant_alleles
+   divergence
+   diversity
+   Fst
    gwas_linear_regression
    hardy_weinberg_test
    regenie
    variant_stats
+   Tajimas_D
 
 Utilities
 =========

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -6,5 +6,6 @@ pytest-datadir
 hypothesis
 statsmodels
 zarr
+msprime
 sphinx
 sphinx_rtd_theme
diff --git a/setup.cfg b/setup.cfg
@@ -44,6 +44,7 @@ addopts = --doctest-modules --ignore=validation
 norecursedirs = .eggs docs
 filterwarnings =
     error
+    ignore::DeprecationWarning
 
 [flake8]
 ignore =
@@ -61,7 +62,7 @@ ignore =
 profile = black
 default_section = THIRDPARTY
 known_first_party = sgkit
-known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr
+known_third_party = dask,fire,glow,hail,hypothesis,invoke,msprime,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0

diff --git a/sgkit/__init__.py b/sgkit/__init__.py
@@ -11,6 +11,7 @@
 from .stats.aggregation import count_call_alleles, count_variant_alleles, variant_stats
 from .stats.association import gwas_linear_regression
 from .stats.hwe import hardy_weinberg_test
+from .stats.popgen import Fst, Tajimas_D, divergence, diversity
 from .stats.regenie import regenie
 
 __all__ = [
@@ -28,4 +29,8 @@
     "regenie",
     "hardy_weinberg_test",
     "variant_stats",
+    "diversity",
+    "divergence",
+    "Fst",
+    "Tajimas_D",
 ]
diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py
@@ -0,0 +1,174 @@
+from typing import Hashable
+
+import dask.array as da
+import numpy as np
+import xarray as xr
+from xarray import DataArray, Dataset
+
+from .aggregation import count_variant_alleles
+
+
+def diversity(
+    ds: Dataset, allele_counts: Hashable = "variant_allele_count",
+) -> DataArray:
+    """Compute diversity from allele counts.
+
+    Because we're not providing any arguments on windowing, etc,
+    we return the total over the whole region. Maybe this isn't
+    the behaviour we want, but it's a starting point. Note that
+    this is different to the tskit default behaviour where we
+    normalise by the size of windows so that results
+    in different windows are comparable. However, we don't have
+    any information about the overall length of the sequence here
+    so we can't normalise by it.
+
+    Parameters
+    ----------
+    ds : Dataset
+        Genotype call dataset.
+    allele_counts : Hashable
+        allele counts to use or calculate.
+
+    Returns
+    -------
+    DataArray
+        diversity value.
+    """
+    if len(ds.samples) < 2:
+        return xr.DataArray(np.nan)
+    if allele_counts not in ds:
+        ds_new = count_variant_alleles(ds)
+    else:
+        ds_new = ds
+    ac = ds_new[allele_counts]
+    an = ac.sum(axis=1)
+    n_pairs = an * (an - 1) / 2
+    n_same = (ac * (ac - 1) / 2).sum(axis=1)
+    n_diff = n_pairs - n_same
+    pi = n_diff / n_pairs
+    return pi.sum()  # type: ignore[no-any-return]
+
+
+def divergence(
+    ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_count",
+) -> DataArray:
+    """Compute divergence between two genotype call datasets.
+
+    Parameters
+    ----------
+    ds1 : Dataset
+        Genotype call dataset.
+    ds2 : Dataset
+        Genotype call dataset.
+    allele_counts : Hashable
+        allele counts to use or calculate.
+
+    Returns
+    -------
+    DataArray
+         divergence value between the two datasets.
+    """
+    if allele_counts not in ds1:
+        ds1_new = count_variant_alleles(ds1)
+    else:
+        ds1_new = ds1
+    ac1 = ds1_new[allele_counts]
+    if allele_counts not in ds2:
+        ds2_new = count_variant_alleles(ds2)
+    else:
+        ds2_new = ds2
+    ac2 = ds2_new[allele_counts]
+    an1 = ds1_new[allele_counts].sum(axis=1)
+    an2 = ds2_new[allele_counts].sum(axis=1)
+
+    n_pairs = an1 * an2
+    n_same = (ac1 * ac2).sum(axis=1)
+    n_diff = n_pairs - n_same
+    div = n_diff / n_pairs
+    return div.sum()  # type: ignore[no-any-return]
+
+
+def Fst(
+    ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_count",
+) -> DataArray:
+    """Compute Fst between two genotype call datasets.
+
+    Parameters
+    ----------
+    ds1 : Dataset
+        Genotype call dataset.
+    ds2 : Dataset
+        Genotype call dataset.
+    allele_counts : Hashable
+        allele counts to use or calculate.
+
+    Returns
+    -------
+    DataArray
+         fst value between the two datasets.
+    """
+    total_div = diversity(ds1) + diversity(ds2)
+    gs = divergence(ds1, ds2)
+    den = total_div + 2 * gs  # type: ignore[operator]
+    fst = 1 - (2 * total_div / den)
+    return fst  # type: ignore[no-any-return]
+
+
+def Tajimas_D(
+    ds: Dataset, allele_counts: Hashable = "variant_allele_count",
+) -> DataArray:
+    """Compute Tajimas' D for a genotype call dataset.
+
+    Parameters
+    ----------
+    ds : Dataset
+        Genotype call dataset.
+    allele_counts : Hashable
+        allele counts to use or calculate.
+
+    Returns
+    -------
+    DataArray
+         Tajimas' D value.
+    """
+    if allele_counts not in ds:
+        ds_new = count_variant_alleles(ds)
+    else:
+        ds_new = ds
+    ac = ds_new[allele_counts]
+
+    # count segregating
+    S = ((ac > 0).sum(axis=1) > 1).sum()
+
+    # assume number of chromosomes sampled is constant for all variants
+    n = ac.sum(axis=1).max()
+
+    # (n-1)th harmonic number
+    a1 = (1 / da.arange(1, n)).sum()
+
+    # calculate Watterson's theta (absolute value)
+    theta = S / a1
+
+    # calculate diversity
+    div = diversity(ds_new)
+
+    # N.B., both theta estimates are usually divided by the number of
+    # (accessible) bases but here we want the absolute difference
+    d = div - theta
+
+    # calculate the denominator (standard deviation)
+    a2 = (1 / (da.arange(1, n) ** 2)).sum()
+    b1 = (n + 1) / (3 * (n - 1))
+    b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
+    c1 = b1 - (1 / a1)
+    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2))
+    e1 = c1 / a1
+    e2 = c2 / (a1 ** 2 + a2)
+    d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1)))
+
+    if d_stdev == 0:
+        return xr.DataArray(np.nan)
+
+    # finally calculate Tajima's D
+    D = d / d_stdev
+    return D  # type: ignore[no-any-return]
diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
@@ -0,0 +1,75 @@
+import msprime  # type: ignore
+import numpy as np
+import pytest
+
+from sgkit import Fst, Tajimas_D, create_genotype_call_dataset, divergence, diversity
+
+
+def ts_to_dataset(ts, samples=None):
+    """
+    Convert the specified tskit tree sequence into an sgkit dataset.
+    Note this just generates haploids for now. With msprime 1.0, we'll be
+    able to generate diploid/whatever-ploid individuals easily.
+    """
+    if samples is None:
+        samples = ts.samples()
+    tables = ts.dump_tables()
+    alleles = []
+    genotypes = []
+    for var in ts.variants(samples=samples):
+        alleles.append(var.alleles)
+        genotypes.append(var.genotypes)
+    alleles = np.array(alleles).astype("S")
+    genotypes = np.expand_dims(genotypes, axis=2)
+
+    df = create_genotype_call_dataset(
+        variant_contig_names=["1"],
+        variant_contig=np.zeros(len(tables.sites), dtype=int),
+        variant_position=tables.sites.position.astype(int),
+        variant_alleles=alleles,
+        sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"),
+        call_genotype=genotypes,
+    )
+    return df
+
+
+@pytest.mark.parametrize("size", [2, 3, 10, 100])
+def test_diversity(size):
+    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
+    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
+    div = diversity(ds).compute()
+    ts_div = ts.diversity(span_normalise=False)
+    np.testing.assert_allclose(div, ts_div)
+
+
+@pytest.mark.parametrize("size", [2, 3, 10, 100])
+def test_divergence(size):
+    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
+    subset_1 = ts.samples()[: ts.num_samples // 2]
+    subset_2 = ts.samples()[ts.num_samples // 2 :]
+    ds1 = ts_to_dataset(ts, subset_1)  # type: ignore[no-untyped-call]
+    ds2 = ts_to_dataset(ts, subset_2)  # type: ignore[no-untyped-call]
+    div = divergence(ds1, ds2).compute()
+    ts_div = ts.divergence([subset_1, subset_2], span_normalise=False)
+    np.testing.assert_allclose(div, ts_div)
+
+
+@pytest.mark.parametrize("size", [2, 3, 10, 100])
+def test_Fst(size):
+    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
+    subset_1 = ts.samples()[: ts.num_samples // 2]
+    subset_2 = ts.samples()[ts.num_samples // 2 :]
+    ds1 = ts_to_dataset(ts, subset_1)  # type: ignore[no-untyped-call]
+    ds2 = ts_to_dataset(ts, subset_2)  # type: ignore[no-untyped-call]
+    fst = Fst(ds1, ds2).compute()
+    ts_fst = ts.Fst([subset_1, subset_2])
+    np.testing.assert_allclose(fst, ts_fst)
+
+
+@pytest.mark.parametrize("size", [2, 3, 10, 100])
+def test_Tajimas_D(size):
+    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
+    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
+    ts_d = ts.Tajimas_D()
+    d = Tajimas_D(ds).compute()
+    np.testing.assert_allclose(d, ts_d)