sgkit-dev · tomwhite · Oct 20, 2020
diff --git a/docs/api.rst b/docs/api.rst
@@ -43,6 +43,7 @@ Methods
    :toctree: generated/
 
    count_call_alleles
+   count_cohort_alleles
    count_variant_alleles
    divergence
    diversity

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
@@ -276,6 +276,8 @@ Xarray and Pandas operations in a single pipeline:
         .pipe(lambda ds: ds.sel(variants=ds.variant_call_rate > .8))
         # Assign a "cohort" variable that splits samples into two groups
         .assign(sample_cohort=np.repeat([0, 1], ds.dims['samples'] // 2))
+        # Count alleles for each cohort
+        .pipe(sg.count_cohort_alleles)
         # Compute Fst between the groups
         .pipe(sg.Fst)
         # Extract the Fst values for cohort pairs

diff --git a/sgkit/__init__.py b/sgkit/__init__.py
@@ -8,7 +8,12 @@
     create_genotype_call_dataset,
     create_genotype_dosage_dataset,
 )
-from .stats.aggregation import count_call_alleles, count_variant_alleles, variant_stats
+from .stats.aggregation import (
+    count_call_alleles,
+    count_cohort_alleles,
+    count_variant_alleles,
+    variant_stats,
+)
 from .stats.association import gwas_linear_regression
 from .stats.hwe import hardy_weinberg_test
 from .stats.pc_relate import pc_relate
@@ -26,6 +31,7 @@
     "create_genotype_call_dataset",
     "count_variant_alleles",
     "count_call_alleles",
+    "count_cohort_alleles",
     "create_genotype_dosage_dataset",
     "display_genotypes",
     "filter_partial_calls",

diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py
@@ -11,7 +11,6 @@
 from sgkit.window import has_windows, window_statistic
 
 from .. import variables
-from .aggregation import count_cohort_alleles, count_variant_alleles
 
 
 def diversity(
@@ -37,7 +36,7 @@ def diversity(
     ds
         Genotype call dataset.
     allele_counts
-        cohort allele counts to use or calculate. Defined by
+        cohort allele counts to use. Defined by
         :data:`sgkit.variables.cohort_allele_count_spec`
     call_genotype
         Input variable name holding call_genotype as defined by
@@ -57,10 +56,7 @@ def diversity(
     This method does not currently support datasets that are chunked along the
     samples dimension.
     """
-    if allele_counts not in ds:
-        ds = count_cohort_alleles(ds, call_genotype=call_genotype)
-    else:
-        variables.validate(ds, {allele_counts: variables.cohort_allele_count_spec})
+    variables.validate(ds, {allele_counts: variables.cohort_allele_count_spec})
     ac = ds[allele_counts]
     an = ac.sum(axis=2)
     n_pairs = an * (an - 1) / 2
@@ -162,7 +158,7 @@ def divergence(
     ds
         Genotype call dataset.
     allele_counts
-        cohort allele counts to use or calculate. Defined by
+        cohort allele counts to use. Defined by
         :data:`sgkit.variables.cohort_allele_count_spec`
     call_genotype
         Input variable name holding call_genotype as defined by
@@ -184,10 +180,7 @@ def divergence(
     samples dimension.
     """
 
-    if allele_counts not in ds:
-        ds = count_cohort_alleles(ds, call_genotype=call_genotype)
-    else:
-        variables.validate(ds, {allele_counts: variables.cohort_allele_count_spec})
+    variables.validate(ds, {allele_counts: variables.cohort_allele_count_spec})
     ac = ds[allele_counts]
 
     n_variants = ds.dims["variants"]
@@ -311,7 +304,7 @@ def Fst(
         Other supported estimators include ``Nei`` (1986), (the same estimator
         as tskit).
     allele_counts
-        cohort allele counts to use or calculate. Defined by
+        cohort allele counts to use. Defined by
         :data:`sgkit.variables.cohort_allele_count_spec`
     call_genotype
         Input variable name holding call_genotype as defined by
@@ -338,10 +331,7 @@ def Fst(
             f"Estimator '{estimator}' is not a known estimator: {known_estimators.keys()}"
         )
     estimator = estimator or "Hudson"
-    if allele_counts not in ds:
-        ds = count_cohort_alleles(ds, call_genotype=call_genotype)
-    else:
-        variables.validate(ds, {allele_counts: variables.cohort_allele_count_spec})
+    variables.validate(ds, {allele_counts: variables.cohort_allele_count_spec})
 
     n_cohorts = ds.dims["cohorts"]
     gs = divergence(
@@ -371,10 +361,10 @@ def Tajimas_D(
     ds
         Genotype call dataset.
     variant_allele_counts
-        variant allele counts to use or calculate. Defined by
+        variant allele counts to use. Defined by
         :data:`sgkit.variables.variant_allele_counts_spec`
     allele_counts
-        cohort allele counts to use or calculate. Defined by
+        cohort allele counts to use. Defined by
         :data:`sgkit.variables.cohort_allele_count_spec`
     call_genotype
         Input variable name holding call_genotype as defined by
@@ -394,12 +384,13 @@ def Tajimas_D(
     This method does not currently support datasets that are chunked along the
     samples dimension.
     """
-    if variant_allele_counts not in ds:
-        ds = count_variant_alleles(ds, call_genotype=call_genotype)
-    else:
-        variables.validate(
-            ds, {variant_allele_counts: variables.variant_allele_count_spec}
-        )
+    variables.validate(
+        ds,
+        {
+            variant_allele_counts: variables.variant_allele_count_spec,
+            allele_counts: variables.cohort_allele_count_spec,
+        },
+    )
     ac = ds[variant_allele_counts]
 
     # count segregating

diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
@@ -10,6 +10,7 @@
 from sgkit import (
     Fst,
     Tajimas_D,
+    count_cohort_alleles,
     count_variant_alleles,
     create_genotype_call_dataset,
     divergence,
@@ -57,6 +58,7 @@ def test_diversity(sample_size, chunks):
     sample_cohorts = np.full_like(ts.samples(), 0)
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     ds = ds.assign_coords({"cohorts": ["co_0"]})
+    ds = count_cohort_alleles(ds)
     ds = diversity(ds)
     div = ds.stat_diversity.sum(axis=0, skipna=False).sel(cohorts="co_0").values
     ts_div = ts.diversity(span_normalise=False)
@@ -70,6 +72,7 @@ def test_diversity__windowed(sample_size):
     sample_cohorts = np.full_like(ts.samples(), 0)
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     ds = ds.assign_coords({"cohorts": ["co_0"]})
+    ds = count_cohort_alleles(ds)
     ds = window(ds, size=25, step=25)
     ds = diversity(ds)
     div = ds["stat_diversity"].sel(cohorts="co_0").compute()
@@ -107,6 +110,7 @@ def test_divergence(sample_size, n_cohorts, chunks):
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
+    ds = count_cohort_alleles(ds)
     ds = divergence(ds)
     div = ds.stat_divergence.sum(axis=0, skipna=False).values
 
@@ -136,6 +140,7 @@ def test_divergence__windowed(sample_size, n_cohorts, chunks):
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
+    ds = count_cohort_alleles(ds)
     ds = window(ds, size=25, step=25)
     ds = divergence(ds)
     div = ds["stat_divergence"].values
@@ -169,6 +174,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
+    ds = count_cohort_alleles(ds)
     ds = window(ds, size=25, step=25)
     ds = divergence(ds)
     div = ds["stat_divergence"].values
@@ -202,6 +208,7 @@ def test_Fst__Hudson(sample_size):
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
+    ds = count_cohort_alleles(ds)
     n_variants = ds.dims["variants"]
     ds = window(ds, size=n_variants, step=n_variants)  # single window
     ds = Fst(ds, estimator="Hudson")
@@ -230,6 +237,7 @@ def test_Fst__Nei(sample_size, n_cohorts):
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
+    ds = count_cohort_alleles(ds)
     n_variants = ds.dims["variants"]
     ds = window(ds, size=n_variants, step=n_variants)  # single window
     ds = Fst(ds, estimator="Nei")
@@ -266,6 +274,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
+    ds = count_cohort_alleles(ds)
     ds = window(ds, size=25, step=25)
     fst_ds = Fst(ds, estimator="Nei")
     fst = fst_ds["stat_Fst"].values
@@ -302,6 +311,8 @@ def test_Tajimas_D(sample_size):
     ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
     sample_cohorts = np.full_like(ts.samples(), 0)
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
+    ds = count_variant_alleles(ds)
+    ds = count_cohort_alleles(ds)
     n_variants = ds.dims["variants"]
     ds = window(ds, size=n_variants, step=n_variants)  # single window
     ds = Tajimas_D(ds)