Skip to content

Commit 07fb874

Browse files
committed
Make window step default to window size
1 parent 3974df2 commit 07fb874

File tree

5 files changed

+36
-27
lines changed

5 files changed

+36
-27
lines changed

sgkit/stats/popgen.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def diversity(
7676
[0.5 , 0.5 ]])
7777
7878
>>> # Divide into windows of size three (variants)
79-
>>> ds = sg.window(ds, size=3, step=3)
79+
>>> ds = sg.window(ds, size=3)
8080
>>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
8181
array([[1.83333333, 1.83333333],
8282
[1. , 1. ]])
@@ -239,7 +239,7 @@ def divergence(
239239
[0.625 , 0.5 ]]])
240240
241241
>>> # Divide into windows of size three (variants)
242-
>>> ds = sg.window(ds, size=3, step=3)
242+
>>> ds = sg.window(ds, size=3)
243243
>>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
244244
array([[[1.83333333, 1.5 ],
245245
[1.5 , 1.83333333]],
@@ -431,7 +431,7 @@ def Fst(
431431
[ 0.2 , nan]]])
432432
433433
>>> # Divide into windows of size three (variants)
434-
>>> ds = sg.window(ds, size=3, step=3)
434+
>>> ds = sg.window(ds, size=3)
435435
>>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
436436
array([[[ nan, -0.22222222],
437437
[-0.22222222, nan]],
@@ -523,7 +523,7 @@ def Tajimas_D(
523523
[-3.35891429, -3.35891429]])
524524
525525
>>> # Divide into windows of size three (variants)
526-
>>> ds = sg.window(ds, size=3, step=3)
526+
>>> ds = sg.window(ds, size=3)
527527
>>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
528528
array([[-0.22349574, -0.22349574],
529529
[-2.18313233, -2.18313233]])
@@ -661,7 +661,7 @@ def pbs(
661661
>>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})
662662
663663
>>> # Divide into two windows of size three (variants)
664-
>>> ds = sg.window(ds, size=3, step=3)
664+
>>> ds = sg.window(ds, size=3)
665665
>>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
666666
array([ 0. , -0.160898])
667667
"""

sgkit/tests/test_popgen.py

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def test_diversity__windowed(sample_size):
8787
sample_cohorts = np.full_like(ts.samples(), 0)
8888
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
8989
ds = ds.assign_coords({"cohorts": ["co_0"]})
90-
ds = window(ds, size=25, step=25)
90+
ds = window(ds, size=25)
9191
ds = diversity(ds)
9292
div = ds["stat_diversity"].sel(cohorts="co_0").compute()
9393

@@ -103,7 +103,7 @@ def test_diversity__windowed(sample_size):
103103
ds = count_variant_alleles(ts_to_dataset(ts)) # type: ignore[no-untyped-call]
104104
ac = ds["variant_allele_count"].values
105105
mpd = allel.mean_pairwise_difference(ac, fill=0)
106-
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25)
106+
ska_div = allel.moving_statistic(mpd, np.sum, size=25)
107107
np.testing.assert_allclose(
108108
div[:-1], ska_div
109109
) # scikit-allel has final window missing
@@ -159,7 +159,7 @@ def test_divergence__windowed(sample_size, n_cohorts, chunks):
159159
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
160160
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
161161
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
162-
ds = window(ds, size=25, step=25)
162+
ds = window(ds, size=25)
163163
ds = divergence(ds)
164164
div = ds["stat_divergence"].values
165165
# test off-diagonal entries, by replacing diagonal with NaNs
@@ -192,7 +192,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
192192
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
193193
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
194194
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
195-
ds = window(ds, size=25, step=25)
195+
ds = window(ds, size=25)
196196
ds = divergence(ds)
197197
div = ds["stat_divergence"].values
198198
# test off-diagonal entries, by replacing diagonal with NaNs
@@ -205,7 +205,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
205205
ac1 = ds1["variant_allele_count"].values
206206
ac2 = ds2["variant_allele_count"].values
207207
mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
208-
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25) # noqa: F841
208+
ska_div = allel.moving_statistic(mpd, np.sum, size=25) # noqa: F841
209209
# TODO: investigate why numbers are different
210210
np.testing.assert_allclose(
211211
div[:-1], ska_div
@@ -226,7 +226,7 @@ def test_Fst__Hudson(sample_size):
226226
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
227227
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
228228
n_variants = ds.dims["variants"]
229-
ds = window(ds, size=n_variants, step=n_variants) # single window
229+
ds = window(ds, size=n_variants) # single window
230230
ds = Fst(ds, estimator="Hudson")
231231
fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values
232232

@@ -254,7 +254,7 @@ def test_Fst__Nei(sample_size, n_cohorts):
254254
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
255255
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
256256
n_variants = ds.dims["variants"]
257-
ds = window(ds, size=n_variants, step=n_variants) # single window
257+
ds = window(ds, size=n_variants) # single window
258258
ds = Fst(ds, estimator="Nei")
259259
fst = ds.stat_Fst.values
260260

@@ -289,7 +289,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):
289289
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
290290
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
291291
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
292-
ds = window(ds, size=25, step=25)
292+
ds = window(ds, size=25)
293293
fst_ds = Fst(ds, estimator="Nei")
294294
fst = fst_ds["stat_Fst"].values
295295

@@ -312,7 +312,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):
312312

313313
ac1 = fst_ds.cohort_allele_count.values[:, 0, :]
314314
ac2 = fst_ds.cohort_allele_count.values[:, 1, :]
315-
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25, step=25)
315+
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25)
316316

317317
np.testing.assert_allclose(
318318
fst[:-1], ska_fst
@@ -326,7 +326,7 @@ def test_Tajimas_D(sample_size):
326326
sample_cohorts = np.full_like(ts.samples(), 0)
327327
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
328328
n_variants = ds.dims["variants"]
329-
ds = window(ds, size=n_variants, step=n_variants) # single window
329+
ds = window(ds, size=n_variants) # single window
330330
ds = Tajimas_D(ds)
331331
d = ds.stat_Tajimas_D.compute()
332332
ts_d = ts.Tajimas_D()
@@ -348,7 +348,7 @@ def test_pbs(sample_size, n_cohorts):
348348
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
349349
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
350350
n_variants = ds.dims["variants"]
351-
ds = window(ds, size=n_variants, step=n_variants) # single window
351+
ds = window(ds, size=n_variants) # single window
352352

353353
ds = pbs(ds)
354354
stat_pbs = ds["stat_pbs"]
@@ -360,9 +360,7 @@ def test_pbs(sample_size, n_cohorts):
360360

361361
ska_pbs_value = np.full([1, n_cohorts, n_cohorts, n_cohorts], np.nan)
362362
for i, j, k in itertools.combinations(range(n_cohorts), 3):
363-
ska_pbs_value[0, i, j, k] = allel.pbs(
364-
ac1, ac2, ac3, window_size=n_variants, window_step=n_variants
365-
)
363+
ska_pbs_value[0, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=n_variants)
366364

367365
np.testing.assert_allclose(stat_pbs, ska_pbs_value)
368366

@@ -382,7 +380,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
382380
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
383381
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
384382
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
385-
ds = window(ds, size=25, step=25)
383+
ds = window(ds, size=25)
386384

387385
ds = pbs(ds)
388386
stat_pbs = ds["stat_pbs"].values
@@ -396,9 +394,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
396394
n_windows = ds.dims["windows"] - 1
397395
ska_pbs_value = np.full([n_windows, n_cohorts, n_cohorts, n_cohorts], np.nan)
398396
for i, j, k in itertools.combinations(range(n_cohorts), 3):
399-
ska_pbs_value[:, i, j, k] = allel.pbs(
400-
ac1, ac2, ac3, window_size=25, window_step=25
401-
)
397+
ska_pbs_value[:, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=25)
402398

403399
np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)
404400

@@ -418,7 +414,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
418414
[np.full_like(subset, i) for i, subset in enumerate(subsets)]
419415
)
420416
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
421-
ds = window(ds, size=3, step=3)
417+
ds = window(ds, size=3)
422418

423419
gh = Garud_h(ds)
424420
h1 = gh.stat_Garud_h1.values
@@ -431,7 +427,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
431427
gt = ds.call_genotype.values[:, sample_cohorts == c, :]
432428
ska_gt = allel.GenotypeArray(gt)
433429
ska_ha = ska_gt.to_haplotypes()
434-
ska_h = allel.moving_garud_h(ska_ha, size=3, step=3)
430+
ska_h = allel.moving_garud_h(ska_ha, size=3)
435431

436432
np.testing.assert_allclose(h1[:, c], ska_h[0])
437433
np.testing.assert_allclose(h12[:, c], ska_h[1])

sgkit/tests/test_window.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,16 @@ def test_window():
8484
window(ds, 2, 2)
8585

8686

87+
def test_window__default_step():
88+
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
89+
assert not has_windows(ds)
90+
ds = window(ds, 2)
91+
assert has_windows(ds)
92+
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
93+
np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
94+
np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])
95+
96+
8797
@pytest.mark.parametrize(
8898
"n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp",
8999
[

sgkit/variables.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def register_variable(cls, spec: Spec) -> Tuple[str, Spec]:
3737
if spec.default_name in cls.registered_variables:
3838
raise ValueError(f"`{spec.default_name}` already registered")
3939
cls.registered_variables[spec.default_name] = spec
40+
print(spec.__doc__)
4041
return spec.default_name, spec
4142

4243
@classmethod

sgkit/window.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, Callable, Iterable, Tuple, Union
1+
from typing import Any, Callable, Iterable, Optional, Tuple, Union
22

33
import dask.array as da
44
import numpy as np
@@ -15,7 +15,7 @@
1515
def window(
1616
ds: Dataset,
1717
size: int,
18-
step: int,
18+
step: Optional[int] = None,
1919
merge: bool = True,
2020
) -> Dataset:
2121
"""Add fixed-size windowing information to a dataset.
@@ -32,6 +32,7 @@ def window(
3232
The window size (number of variants).
3333
step
3434
The distance (number of variants) between start positions of windows.
35+
Defaults to ``size``.
3536
merge
3637
If True (the default), merge the input dataset and the computed
3738
output variables into a single dataset, otherwise return only
@@ -47,6 +48,7 @@ def window(
4748
- :data:`sgkit.variables.window_stop_spec` (windows):
4849
The index values of window stop positions.
4950
"""
51+
step = step or size
5052
n_variants = ds.dims["variants"]
5153
n_contigs = len(ds.attrs["contigs"])
5254
contig_ids = np.arange(n_contigs)

0 commit comments

Comments
 (0)