Skip to content

Make window step default to window size #389

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions sgkit/stats/popgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def diversity(
[0.5 , 0.5 ]])

>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
array([[1.83333333, 1.83333333],
[1. , 1. ]])
Expand Down Expand Up @@ -239,7 +239,7 @@ def divergence(
[0.625 , 0.5 ]]])

>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
array([[[1.83333333, 1.5 ],
[1.5 , 1.83333333]],
Expand Down Expand Up @@ -431,7 +431,7 @@ def Fst(
[ 0.2 , nan]]])

>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
array([[[ nan, -0.22222222],
[-0.22222222, nan]],
Expand Down Expand Up @@ -523,7 +523,7 @@ def Tajimas_D(
[-3.35891429, -3.35891429]])

>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
array([[-0.22349574, -0.22349574],
[-2.18313233, -2.18313233]])
Expand Down Expand Up @@ -661,7 +661,7 @@ def pbs(
>>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})

>>> # Divide into two windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
array([ 0. , -0.160898])
"""
Expand Down
36 changes: 16 additions & 20 deletions sgkit/tests/test_popgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_diversity__windowed(sample_size):
sample_cohorts = np.full_like(ts.samples(), 0)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
ds = ds.assign_coords({"cohorts": ["co_0"]})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = diversity(ds)
div = ds["stat_diversity"].sel(cohorts="co_0").compute()

Expand All @@ -103,7 +103,7 @@ def test_diversity__windowed(sample_size):
ds = count_variant_alleles(ts_to_dataset(ts)) # type: ignore[no-untyped-call]
ac = ds["variant_allele_count"].values
mpd = allel.mean_pairwise_difference(ac, fill=0)
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25)
ska_div = allel.moving_statistic(mpd, np.sum, size=25)
np.testing.assert_allclose(
div[:-1], ska_div
) # scikit-allel has final window missing
Expand Down Expand Up @@ -159,7 +159,7 @@ def test_divergence__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = divergence(ds)
div = ds["stat_divergence"].values
# test off-diagonal entries, by replacing diagonal with NaNs
Expand Down Expand Up @@ -192,7 +192,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = divergence(ds)
div = ds["stat_divergence"].values
# test off-diagonal entries, by replacing diagonal with NaNs
Expand All @@ -205,7 +205,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
ac1 = ds1["variant_allele_count"].values
ac2 = ds2["variant_allele_count"].values
mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25) # noqa: F841
ska_div = allel.moving_statistic(mpd, np.sum, size=25) # noqa: F841
# TODO: investigate why numbers are different
np.testing.assert_allclose(
div[:-1], ska_div
Expand All @@ -226,7 +226,7 @@ def test_Fst__Hudson(sample_size):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Fst(ds, estimator="Hudson")
fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

Expand Down Expand Up @@ -254,7 +254,7 @@ def test_Fst__Nei(sample_size, n_cohorts):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Fst(ds, estimator="Nei")
fst = ds.stat_Fst.values

Expand Down Expand Up @@ -289,7 +289,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
fst_ds = Fst(ds, estimator="Nei")
fst = fst_ds["stat_Fst"].values

Expand All @@ -312,7 +312,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):

ac1 = fst_ds.cohort_allele_count.values[:, 0, :]
ac2 = fst_ds.cohort_allele_count.values[:, 1, :]
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25, step=25)
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25)

np.testing.assert_allclose(
fst[:-1], ska_fst
Expand All @@ -326,7 +326,7 @@ def test_Tajimas_D(sample_size):
sample_cohorts = np.full_like(ts.samples(), 0)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Tajimas_D(ds)
d = ds.stat_Tajimas_D.compute()
ts_d = ts.Tajimas_D()
Expand All @@ -348,7 +348,7 @@ def test_pbs(sample_size, n_cohorts):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window

ds = pbs(ds)
stat_pbs = ds["stat_pbs"]
Expand All @@ -360,9 +360,7 @@ def test_pbs(sample_size, n_cohorts):

ska_pbs_value = np.full([1, n_cohorts, n_cohorts, n_cohorts], np.nan)
for i, j, k in itertools.combinations(range(n_cohorts), 3):
ska_pbs_value[0, i, j, k] = allel.pbs(
ac1, ac2, ac3, window_size=n_variants, window_step=n_variants
)
ska_pbs_value[0, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=n_variants)

np.testing.assert_allclose(stat_pbs, ska_pbs_value)

Expand All @@ -382,7 +380,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)

ds = pbs(ds)
stat_pbs = ds["stat_pbs"].values
Expand All @@ -396,9 +394,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
n_windows = ds.dims["windows"] - 1
ska_pbs_value = np.full([n_windows, n_cohorts, n_cohorts, n_cohorts], np.nan)
for i, j, k in itertools.combinations(range(n_cohorts), 3):
ska_pbs_value[:, i, j, k] = allel.pbs(
ac1, ac2, ac3, window_size=25, window_step=25
)
ska_pbs_value[:, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=25)

np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)

Expand All @@ -418,7 +414,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
[np.full_like(subset, i) for i, subset in enumerate(subsets)]
)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
ds = window(ds, size=3, step=3)
ds = window(ds, size=3)

gh = Garud_h(ds)
h1 = gh.stat_Garud_h1.values
Expand All @@ -431,7 +427,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
gt = ds.call_genotype.values[:, sample_cohorts == c, :]
ska_gt = allel.GenotypeArray(gt)
ska_ha = ska_gt.to_haplotypes()
ska_h = allel.moving_garud_h(ska_ha, size=3, step=3)
ska_h = allel.moving_garud_h(ska_ha, size=3)

np.testing.assert_allclose(h1[:, c], ska_h[0])
np.testing.assert_allclose(h12[:, c], ska_h[1])
Expand Down
10 changes: 10 additions & 0 deletions sgkit/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,16 @@ def test_window():
window(ds, 2, 2)


def test_window__default_step():
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
assert not has_windows(ds)
ds = window(ds, 2)
assert has_windows(ds)
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])


@pytest.mark.parametrize(
"n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp",
[
Expand Down
1 change: 1 addition & 0 deletions sgkit/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def register_variable(cls, spec: Spec) -> Tuple[str, Spec]:
if spec.default_name in cls.registered_variables:
raise ValueError(f"`{spec.default_name}` already registered")
cls.registered_variables[spec.default_name] = spec
print(spec.__doc__)
return spec.default_name, spec

@classmethod
Expand Down
6 changes: 4 additions & 2 deletions sgkit/window.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Callable, Iterable, Tuple, Union
from typing import Any, Callable, Iterable, Optional, Tuple, Union

import dask.array as da
import numpy as np
Expand All @@ -15,7 +15,7 @@
def window(
ds: Dataset,
size: int,
step: int,
step: Optional[int] = None,
merge: bool = True,
) -> Dataset:
"""Add fixed-size windowing information to a dataset.
Expand All @@ -32,6 +32,7 @@ def window(
The window size (number of variants).
step
The distance (number of variants) between start positions of windows.
Defaults to ``size``.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
Expand All @@ -47,6 +48,7 @@ def window(
- :data:`sgkit.variables.window_stop_spec` (windows):
The index values of window stop positions.
"""
step = step or size
n_variants = ds.dims["variants"]
n_contigs = len(ds.attrs["contigs"])
contig_ids = np.arange(n_contigs)
Expand Down