Skip to content

Commit c413978

Browse files
committed
Rename window to window_by_index, and add window_by_position
1 parent 1ec2f66 commit c413978

File tree

4 files changed

+151
-18
lines changed

4 files changed

+151
-18
lines changed

docs/getting_started.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ then with windows.
231231
232232
# Define windows of size 20 variants. This creates a new dimension called `windows`, and
233233
# some new variables for internal use.
234-
ds = sg.window(ds, 20)
234+
ds = sg.window(ds, size=20)
235235
236236
# The diversity statistic is now computed for every window
237237
sg.diversity(ds, merge=False)
@@ -269,7 +269,7 @@ example shows how to give cohorts names.
269269
.. ipython:: python
270270
:okwarning:
271271
272-
ds = sg.window(ds, 20)
272+
ds = sg.window(ds, size=20)
273273
ds = sg.Fst(ds)
274274
275275
cohort_names = ["Africa", "Asia", "Europe"]
@@ -310,7 +310,7 @@ Xarray and Pandas operations in a single pipeline:
310310
# Apply filter to include variants present across > 80% of samples
311311
.pipe(lambda ds: ds.sel(variants=ds.variant_call_rate > .8))
312312
# Create windows of size 20 variants
313-
.pipe(lambda ds: sg.window(ds, 20))
313+
.pipe(lambda ds: sg.window(ds, size=20))
314314
# Assign a "cohort" variable that splits samples into two groups
315315
.assign(sample_cohort=np.repeat([0, 1], ds.dims['samples'] // 2))
316316
# Compute Fst between the groups

sgkit/tests/test_ld.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def test_vs_skallel(args):
156156

157157
ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
158158
ds["dosage"] = (["variants", "samples"], da.asarray(x).rechunk({0: chunks}))
159-
ds = window(ds, size, step)
159+
ds = window(ds, size=size, step=step)
160160

161161
ldm = ld_matrix(ds, threshold=threshold)
162162
has_duplicates = ldm.compute().duplicated(subset=["i", "j"]).any()

sgkit/tests/test_window.py

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
_get_windows,
1414
has_windows,
1515
moving_statistic,
16-
window,
16+
window_by_index,
17+
window_by_position,
1718
)
1819

1920

@@ -71,23 +72,23 @@ def test_moving_statistic__min_chunksize_smaller_than_size():
7172
moving_statistic(values, np.sum, size=3, step=3, dtype=values.dtype)
7273

7374

74-
def test_window():
75+
def test_window_by_index():
7576
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
7677
assert not has_windows(ds)
77-
ds = window(ds, 2, 2)
78+
ds = window_by_index(ds, size=2, step=2)
7879
assert has_windows(ds)
7980
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
8081
np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
8182
np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])
8283

8384
with pytest.raises(MergeWarning):
84-
window(ds, 2, 2)
85+
window_by_index(ds, size=2, step=2)
8586

8687

87-
def test_window__default_step():
88+
def test_window_by_index__default_step():
8889
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
8990
assert not has_windows(ds)
90-
ds = window(ds, 2)
91+
ds = window_by_index(ds, size=2)
9192
assert has_windows(ds)
9293
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
9394
np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
@@ -120,13 +121,13 @@ def test_window__default_step():
120121
),
121122
],
122123
)
123-
def test_window__multiple_contigs(
124+
def test_window_by_index__multiple_contigs(
124125
n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp
125126
):
126127
ds = simulate_genotype_call_dataset(
127128
n_variant=n_variant, n_sample=1, n_contig=n_contig
128129
)
129-
ds = window(ds, 2, 2)
130+
ds = window_by_index(ds, size=2, step=2)
130131
np.testing.assert_equal(ds[window_contig].values, window_contigs_exp)
131132
np.testing.assert_equal(ds[window_start].values, window_starts_exp)
132133
np.testing.assert_equal(ds[window_stop].values, window_stops_exp)
@@ -193,3 +194,43 @@ def test_get_chunked_windows(
193194
)
194195
np.testing.assert_equal(rel_window_starts_actual, rel_window_starts_exp)
195196
np.testing.assert_equal(windows_per_chunk_actual, windows_per_chunk_exp)
197+
198+
199+
def test_window_by_position():
200+
ds = simulate_genotype_call_dataset(n_variant=5, n_sample=3, seed=0)
201+
assert not has_windows(ds)
202+
ds["variant_position"] = (
203+
["variants"],
204+
np.array([1, 4, 6, 8, 12]),
205+
)
206+
ds = window_by_position(ds, size=5)
207+
assert has_windows(ds)
208+
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
209+
np.testing.assert_equal(ds[window_start].values, [0, 1, 2, 3, 4])
210+
np.testing.assert_equal(ds[window_stop].values, [2, 4, 4, 5, 5])
211+
212+
213+
def test_window_by_position__multiple_contigs():
214+
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, n_contig=2)
215+
ds["variant_position"] = (
216+
["variants"],
217+
np.array([1, 4, 6, 8, 12, 1, 21, 25, 40, 55]),
218+
)
219+
ds = window_by_position(ds, size=10)
220+
assert has_windows(ds)
221+
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
222+
np.testing.assert_equal(ds[window_start].values, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
223+
np.testing.assert_equal(ds[window_stop].values, [4, 5, 5, 5, 5, 6, 8, 8, 9, 10])
224+
225+
226+
def test_window_by_position__offset():
227+
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, n_contig=2)
228+
ds["variant_position"] = (
229+
["variants"],
230+
np.array([1, 4, 6, 8, 12, 1, 21, 25, 40, 55]),
231+
)
232+
ds = window_by_position(ds, size=10, offset=-5)
233+
assert has_windows(ds)
234+
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
235+
np.testing.assert_equal(ds[window_start].values, [0, 0, 0, 1, 3, 5, 6, 6, 8, 9])
236+
np.testing.assert_equal(ds[window_stop].values, [2, 4, 4, 5, 5, 6, 8, 8, 9, 10])

sgkit/window.py

Lines changed: 98 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
from typing import Any, Callable, Iterable, Optional, Tuple, Union
1+
from typing import Any, Callable, Hashable, Iterable, Optional, Tuple, Union
22

33
import dask.array as da
44
import numpy as np
55
from xarray import Dataset
66

7+
from sgkit import variables
78
from sgkit.utils import conditional_merge_datasets, create_dataset
89
from sgkit.variables import window_contig, window_start, window_stop
910

@@ -12,27 +13,32 @@
1213
# Window definition (user code)
1314

1415

15-
def window(
16+
def window_by_index(
1617
ds: Dataset,
18+
*,
1719
size: int,
1820
step: Optional[int] = None,
21+
variant_contig: Hashable = variables.variant_contig,
1922
merge: bool = True,
2023
) -> Dataset:
21-
"""Add fixed-size windowing information to a dataset.
24+
"""Add window information to a dataset, measured by number of variants.
2225
2326
Windows are defined over the ``variants`` dimension, and are
2427
used by some downstream functions to calculate statistics for
25-
each window.
28+
each window. Windows never span contigs.
2629
2730
Parameters
2831
----------
2932
ds
3033
Genotype call dataset.
3134
size
32-
The window size (number of variants).
35+
The window size, measured by number of variants.
3336
step
3437
The distance (number of variants) between start positions of windows.
3538
Defaults to ``size``.
39+
variant_contig
40+
Name of variable containing variant contig indexes.
41+
Defined by :data:`sgkit.variables.variant_contig_spec`.
3642
merge
3743
If True (the default), merge the input dataset and the computed
3844
output variables into a single dataset, otherwise return only
@@ -43,12 +49,84 @@ def window(
4349
-------
4450
A dataset containing the following variables:
4551
52+
- :data:`sgkit.variables.window_contig_spec` (windows):
53+
The index values of window contigs.
4654
- :data:`sgkit.variables.window_start_spec` (windows):
4755
The index values of window start positions.
4856
- :data:`sgkit.variables.window_stop_spec` (windows):
4957
The index values of window stop positions.
5058
"""
5159
step = step or size
60+
return _window_per_contig(ds, variant_contig, merge, _get_windows, size, step)
61+
62+
63+
window = window_by_index
64+
65+
66+
def window_by_position(
67+
ds: Dataset,
68+
*,
69+
size: int,
70+
offset: int = 0,
71+
variant_contig: Hashable = variables.variant_contig,
72+
variant_position: Hashable = variables.variant_position,
73+
merge: bool = True,
74+
) -> Dataset:
75+
"""Add window information to a dataset, measured by distance along the genome.
76+
77+
Windows are defined over the ``variants`` dimension, and are
78+
used by some downstream functions to calculate statistics for
79+
each window. Windows never span contigs.
80+
81+
Parameters
82+
----------
83+
ds
84+
Genotype call dataset.
85+
size
86+
The window size, measured by number of base pairs.
87+
offset
88+
The window offset, measured by number of base pairs. Defaults
89+
to no offset. For centered windows, use a negative offset that
90+
is half the window size.
91+
variant_contig
92+
Name of variable containing variant contig indexes.
93+
Defined by :data:`sgkit.variables.variant_contig_spec`.
94+
variant_position
95+
Name of variable containing variant positions.
96+
Must be monotonically increasing within a contig.
97+
Defined by :data:`sgkit.variables.variant_position_spec`.
98+
merge
99+
If True (the default), merge the input dataset and the computed
100+
output variables into a single dataset, otherwise return only
101+
the computed output variables.
102+
See :ref:`dataset_merge` for more details.
103+
104+
Returns
105+
-------
106+
A dataset containing the following variables:
107+
108+
- :data:`sgkit.variables.window_contig_spec` (windows):
109+
The index values of window contigs.
110+
- :data:`sgkit.variables.window_start_spec` (windows):
111+
The index values of window start positions.
112+
- :data:`sgkit.variables.window_stop_spec` (windows):
113+
The index values of window stop positions.
114+
"""
115+
116+
positions = ds[variant_position].values
117+
return _window_per_contig(
118+
ds, variant_contig, merge, _get_windows_by_position, size, offset, positions
119+
)
120+
121+
122+
def _window_per_contig(
123+
ds: Dataset,
124+
variant_contig: Hashable,
125+
merge: bool,
126+
windowing_fn: Callable[..., Any],
127+
*args: Any,
128+
**kwargs: Any,
129+
) -> Dataset:
52130
n_variants = ds.dims["variants"]
53131
n_contigs = len(ds.attrs["contigs"])
54132
contig_ids = np.arange(n_contigs)
@@ -60,7 +138,9 @@ def window(
60138
contig_window_starts = []
61139
contig_window_stops = []
62140
for i in range(n_contigs):
63-
starts, stops = _get_windows(contig_bounds[i], contig_bounds[i + 1], size, step)
141+
starts, stops = windowing_fn(
142+
contig_bounds[i], contig_bounds[i + 1], *args, **kwargs
143+
)
64144
contig_window_starts.append(starts)
65145
contig_window_stops.append(stops)
66146
contig_window_contigs.append(np.full_like(starts, i))
@@ -97,6 +177,18 @@ def _get_windows(
97177
return window_starts, window_stops
98178

99179

180+
def _get_windows_by_position(
181+
start: int, stop: int, size: int, offset: int, positions: ArrayLike
182+
) -> Tuple[ArrayLike, ArrayLike]:
183+
contig_pos = positions[start:stop]
184+
if offset == 0:
185+
window_starts = np.arange(contig_pos.shape[0]) + start
186+
else:
187+
window_starts = np.searchsorted(contig_pos, contig_pos + offset) + start
188+
window_stops = np.searchsorted(contig_pos, contig_pos + offset + size) + start
189+
return window_starts, window_stops
190+
191+
100192
# Computing statistics for windows (internal code)
101193

102194

0 commit comments

Comments
 (0)