From a6915246808598d63075b779625345bf75f70bb7 Mon Sep 17 00:00:00 2001 From: daletovar Date: Fri, 7 Aug 2020 17:17:54 -0700 Subject: [PATCH 1/3] add ts_to_dataset --- sgkit/api.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/sgkit/api.py b/sgkit/api.py index 3408b6052..bab5e964e 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -24,7 +24,6 @@ def create_genotype_call_dataset( variant_id: Any = None, ) -> xr.Dataset: """Create a dataset of genotype calls. - Parameters ---------- variant_contig_names : list of str @@ -46,12 +45,10 @@ def create_genotype_call_dataset( omitted all calls are unphased. variant_id: array_like, str or object, optional The unique identifier of the variant. - Returns ------- :class:`xarray.Dataset` The dataset of genotype calls. - """ check_array_like(variant_contig, kind="i", ndim=1) check_array_like(variant_position, kind="i", ndim=1) @@ -115,12 +112,10 @@ def create_genotype_dosage_dataset( missing value. variant_id: array_like, str or object, optional The unique identifier of the variant. - Returns ------- xr.Dataset The dataset of genotype calls. - """ check_array_like(variant_contig, kind="i", ndim=1) check_array_like(variant_position, kind="i", ndim=1) @@ -149,3 +144,31 @@ def create_genotype_dosage_dataset( data_vars["variant_id"] = ([DIM_VARIANT], variant_id) attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names} return xr.Dataset(data_vars=data_vars, attrs=attrs) + + +def ts_to_dataset(ts, samples=None): + """ + Convert the specified tskit tree sequence into an sgkit dataset. + Note this just generates haploids for now. With msprime 1.0, we'll be + able to generate diploid/whatever-ploid individuals easily. + """ + if samples is None: + samples = ts.samples() + tables = ts.dump_tables() + alleles = [] + genotypes = [] + for var in ts.variants(samples=samples): + alleles.append(var.alleles) + genotypes.append(var.genotypes) + alleles = np.array(alleles).astype("S") + genotypes = np.expand_dims(genotypes, axis=2) + + df = create_genotype_call_dataset( + variant_contig_names=["1"], + variant_contig=np.zeros(len(tables.sites), dtype=int), + variant_position=tables.sites.position.astype(int), + variant_alleles=alleles, + sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"), + call_genotype=genotypes, + ) + return df \ No newline at end of file From a8ebfa3d0d44a85cb8c90563a629b1abfb3a47fd Mon Sep 17 00:00:00 2001 From: daletovar Date: Fri, 7 Aug 2020 17:18:20 -0700 Subject: [PATCH 2/3] add minimal diversity and divergence remove ts_to_dataset from public api make divergence take in two datasets add minimal fst Add read_vcfzarr (#40) add ts_to_dataset add tajimas d fix allele count update cfg remove spaces add msprime and use np.testing add libgsl-dev dependency add docstrings ignore dep warning fix divide by zero
--- .github/workflows/build.yml | 1 + requirements-dev.txt | 1 + setup.cfg | 3 +- sgkit/__init__.py | 5 ++ sgkit/api.py | 33 ++----- sgkit/stats/popgen.py | 166 ++++++++++++++++++++++++++++++++++++ sgkit/tests/test_popgen.py | 75 ++++++++++++++++ 7 files changed, 255 insertions(+), 29 deletions(-) create mode 100644 sgkit/stats/popgen.py create mode 100644 sgkit/tests/test_popgen.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 46dd80499..f287e3162 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,6 +18,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + sudo apt install libgsl-dev # Needed for msprime < 1.0. Binary wheels include GSL for >= 1.0 python -m pip install --upgrade pip pip install -r requirements.txt -r requirements-dev.txt - name: Run pre-commit diff --git a/requirements-dev.txt b/requirements-dev.txt index 90caa787d..53fde6be1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,5 +6,6 @@ pytest-datadir hypothesis statsmodels zarr +msprime sphinx sphinx_rtd_theme diff --git a/setup.cfg b/setup.cfg index abe82e850..8fc2c6199 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,6 +44,7 @@ addopts = --doctest-modules --ignore=validation norecursedirs = .eggs docs filterwarnings = error + ignore::DeprecationWarning [flake8] ignore = @@ -61,7 +62,7 @@ ignore = profile = black default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr +known_third_party = dask,fire,glow,hail,hypothesis,invoke,msprime,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 9a90dc1f0..793776e33 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -11,6 +11,7 @@ from .stats.aggregation import count_call_alleles, count_variant_alleles, variant_stats from .stats.association import gwas_linear_regression from .stats.hwe import hardy_weinberg_test +from .stats.popgen import Fst, Tajimas_D, divergence, diversity from .stats.regenie import regenie __all__ = [ @@ -28,4 +29,8 @@ "regenie", "hardy_weinberg_test", "variant_stats", + "diversity", + "divergence", + "Fst", + "Tajimas_D", ] diff --git a/sgkit/api.py b/sgkit/api.py index bab5e964e..3408b6052 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -24,6 +24,7 @@ def create_genotype_call_dataset( variant_id: Any = None, ) -> xr.Dataset: """Create a dataset of genotype calls. + Parameters ---------- variant_contig_names : list of str @@ -45,10 +46,12 @@ def create_genotype_call_dataset( omitted all calls are unphased. variant_id: array_like, str or object, optional The unique identifier of the variant. + Returns ------- :class:`xarray.Dataset` The dataset of genotype calls. + """ check_array_like(variant_contig, kind="i", ndim=1) check_array_like(variant_position, kind="i", ndim=1) @@ -112,10 +115,12 @@ def create_genotype_dosage_dataset( missing value.
variant_id: array_like, str or object, optional The unique identifier of the variant. + Returns ------- xr.Dataset The dataset of genotype calls. + """ check_array_like(variant_contig, kind="i", ndim=1) check_array_like(variant_position, kind="i", ndim=1) @@ -144,31 +149,3 @@ def create_genotype_dosage_dataset( data_vars["variant_id"] = ([DIM_VARIANT], variant_id) attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names} return xr.Dataset(data_vars=data_vars, attrs=attrs) - - -def ts_to_dataset(ts, samples=None): - """ - Convert the specified tskit tree sequence into an sgkit dataset. - Note this just generates haploids for now. With msprime 1.0, we'll be - able to generate diploid/whatever-ploid individuals easily. - """ - if samples is None: - samples = ts.samples() - tables = ts.dump_tables() - alleles = [] - genotypes = [] - for var in ts.variants(samples=samples): - alleles.append(var.alleles) - genotypes.append(var.genotypes) - alleles = np.array(alleles).astype("S") - genotypes = np.expand_dims(genotypes, axis=2) - - df = create_genotype_call_dataset( - variant_contig_names=["1"], - variant_contig=np.zeros(len(tables.sites), dtype=int), - variant_position=tables.sites.position.astype(int), - variant_alleles=alleles, - sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"), - call_genotype=genotypes, - ) - return df \ No newline at end of file diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py new file mode 100644 index 000000000..cb64f23d4 --- /dev/null +++ b/sgkit/stats/popgen.py @@ -0,0 +1,166 @@ +from typing import Hashable + +import dask.array as da +import numpy as np +import xarray as xr +from xarray import DataArray, Dataset + +from .aggregation import count_variant_alleles + + +def diversity( + ds: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute diversity from allele counts. + + Because we're not providing any arguments on windowing, etc, + we return the total over the whole region. Maybe this isn't + the behaviour we want, but it's a starting point. Note that + this is different to the tskit default behaviour where we + normalise by the size of windows so that results + in different windows are comparable. However, we don't have + any information about the overall length of the sequence here + so we can't normalise by it. + + Parameters + ---------- + ds : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + diversity value. + """ + if len(ds.samples) < 2: + return xr.DataArray(np.nan) + if allele_counts not in ds: + ds = count_variant_alleles(ds) + ac = ds[allele_counts] + an = ac.sum(axis=1) + n_pairs = an * (an - 1) / 2 + n_same = (ac * (ac - 1) / 2).sum(axis=1) + n_diff = n_pairs - n_same + pi = n_diff / n_pairs + return pi.sum() # type: ignore[no-any-return] + + +def divergence( + ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute divergence between two genotype call datasets. + + Parameters + ---------- + ds1 : Dataset + Genotype call dataset. + ds2 : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + divergence value between the two datasets. 
+ """ + if allele_counts not in ds1: + ds1 = count_variant_alleles(ds1) + ac1 = ds1[allele_counts] + if allele_counts not in ds2: + ds2 = count_variant_alleles(ds2) + ac2 = ds2[allele_counts] + an1 = ds1[allele_counts].sum(axis=1) + an2 = ds2[allele_counts].sum(axis=1) + + n_pairs = an1 * an2 + n_same = (ac1 * ac2).sum(axis=1) + n_diff = n_pairs - n_same + div = n_diff / n_pairs + return div.sum() # type: ignore[no-any-return] + + +def Fst( + ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute Fst between two genotype call datasets. + + Parameters + ---------- + ds1 : Dataset + Genotype call dataset. + ds2 : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + fst value between the two datasets. + """ + total_div = diversity(ds1) + diversity(ds2) + gs = divergence(ds1, ds2) + den = total_div + 2 * gs # type: ignore[operator] + fst = 1 - (2 * total_div / den) + return fst # type: ignore[no-any-return] + + +def Tajimas_D( + ds: Dataset, allele_counts: Hashable = "variant_allele_count", +) -> DataArray: + """Compute Tajimas' D for a genotype call dataset. + + Parameters + ---------- + ds : Dataset + Genotype call dataset. + allele_counts : Hashable + allele counts to use or calculate. + + Returns + ------- + DataArray + Tajimas' D value. + """ + if allele_counts not in ds: + ds = count_variant_alleles(ds) + ac = ds[allele_counts] + + # count segregating + S = ((ac > 0).sum(axis=1) > 1).sum() + + # assume number of chromosomes sampled is constant for all variants + n = ac.sum(axis=1).max() + + # (n-1)th harmonic number + a1 = (1 / da.arange(1, n)).sum() + + # calculate Watterson's theta (absolute value) + theta = S / a1 + + # calculate diversity + div = diversity(ds) + + # N.B., both theta estimates are usually divided by the number of + # (accessible) bases but here we want the absolute difference + d = div - theta + + # calculate the denominator (standard deviation) + a2 = (1 / (da.arange(1, n) ** 2)).sum() + b1 = (n + 1) / (3 * (n - 1)) + b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1)) + c1 = b1 - (1 / a1) + c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2)) + e1 = c1 / a1 + e2 = c2 / (a1 ** 2 + a2) + d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) + + if d_stdev == 0: + return xr.DataArray(np.nan) + + # finally calculate Tajima's D + D = d / d_stdev + return D # type: ignore[no-any-return] diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py new file mode 100644 index 000000000..478f4cae0 --- /dev/null +++ b/sgkit/tests/test_popgen.py @@ -0,0 +1,75 @@ +import msprime # type: ignore +import numpy as np +import pytest + +from sgkit import Fst, Tajimas_D, create_genotype_call_dataset, divergence, diversity + + +def ts_to_dataset(ts, samples=None): + """ + Convert the specified tskit tree sequence into an sgkit dataset. + Note this just generates haploids for now. With msprime 1.0, we'll be + able to generate diploid/whatever-ploid individuals easily. 
+ """ + if samples is None: + samples = ts.samples() + tables = ts.dump_tables() + alleles = [] + genotypes = [] + for var in ts.variants(samples=samples): + alleles.append(var.alleles) + genotypes.append(var.genotypes) + alleles = np.array(alleles).astype("S") + genotypes = np.expand_dims(genotypes, axis=2) + + df = create_genotype_call_dataset( + variant_contig_names=["1"], + variant_contig=np.zeros(len(tables.sites), dtype=int), + variant_position=tables.sites.position.astype(int), + variant_alleles=alleles, + sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"), + call_genotype=genotypes, + ) + return df + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_diversity(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + div = diversity(ds).compute() + ts_div = ts.diversity(span_normalise=False) + np.testing.assert_allclose(div, ts_div) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_divergence(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + subset_1 = ts.samples()[: ts.num_samples // 2] + subset_2 = ts.samples()[ts.num_samples // 2 :] + ds1 = ts_to_dataset(ts, subset_1) # type: ignore[no-untyped-call] + ds2 = ts_to_dataset(ts, subset_2) # type: ignore[no-untyped-call] + div = divergence(ds1, ds2).compute() + ts_div = ts.divergence([subset_1, subset_2], span_normalise=False) + np.testing.assert_allclose(div, ts_div) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_Fst(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + subset_1 = ts.samples()[: ts.num_samples // 2] + subset_2 = ts.samples()[ts.num_samples // 2 :] + ds1 = ts_to_dataset(ts, subset_1) # type: ignore[no-untyped-call] + ds2 = ts_to_dataset(ts, subset_2) # type: ignore[no-untyped-call] + fst = Fst(ds1, ds2).compute() + ts_fst = ts.Fst([subset_1, subset_2]) + np.testing.assert_allclose(fst, ts_fst) + + +@pytest.mark.parametrize("size", [2, 3, 10, 100]) +def test_Tajimas_D(size): + ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) + ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + ts_d = ts.Tajimas_D() + d = Tajimas_D(ds).compute() + np.testing.assert_allclose(d, ts_d) From f09fac3606d3ab058c75b2518572286276e0b822 Mon Sep 17 00:00:00 2001 From: Dale Tovar Date: Thu, 3 Sep 2020 17:18:08 -0700 Subject: [PATCH 3/3] add functions to rst and avoid modifying input ds --- docs/api.rst | 4 ++++ sgkit/stats/popgen.py | 30 +++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 16d055d7f..fcffa357e 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -24,10 +24,14 @@ Methods count_call_alleles count_variant_alleles + divergence + diversity + Fst gwas_linear_regression hardy_weinberg_test regenie variant_stats + Tajimas_D Utilities ========= diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index cb64f23d4..abfc93c44 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -37,8 +37,10 @@ def diversity( if len(ds.samples) < 2: return xr.DataArray(np.nan) if allele_counts not in ds: - ds = count_variant_alleles(ds) - ac = ds[allele_counts] + ds_new = count_variant_alleles(ds) + else: + ds_new = ds + ac = ds_new[allele_counts] an = ac.sum(axis=1) n_pairs = an * (an - 1) / 2 n_same = (ac * (ac - 1) / 2).sum(axis=1) @@ -67,13 +69,17 @@ def divergence( divergence value between the two datasets. 
""" if allele_counts not in ds1: - ds1 = count_variant_alleles(ds1) - ac1 = ds1[allele_counts] + ds1_new = count_variant_alleles(ds1) + else: + ds1_new = ds1 + ac1 = ds1_new[allele_counts] if allele_counts not in ds2: - ds2 = count_variant_alleles(ds2) - ac2 = ds2[allele_counts] - an1 = ds1[allele_counts].sum(axis=1) - an2 = ds2[allele_counts].sum(axis=1) + ds2_new = count_variant_alleles(ds2) + else: + ds2_new = ds2 + ac2 = ds2_new[allele_counts] + an1 = ds1_new[allele_counts].sum(axis=1) + an2 = ds2_new[allele_counts].sum(axis=1) n_pairs = an1 * an2 n_same = (ac1 * ac2).sum(axis=1) @@ -126,8 +132,10 @@ def Tajimas_D( Tajimas' D value. """ if allele_counts not in ds: - ds = count_variant_alleles(ds) - ac = ds[allele_counts] + ds_new = count_variant_alleles(ds) + else: + ds_new = ds + ac = ds_new[allele_counts] # count segregating S = ((ac > 0).sum(axis=1) > 1).sum() @@ -142,7 +150,7 @@ def Tajimas_D( theta = S / a1 # calculate diversity - div = diversity(ds) + div = diversity(ds_new) # N.B., both theta estimates are usually divided by the number of # (accessible) bases but here we want the absolute difference