Skip to content

Take contigs into account for fixed-size windowing #372

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion sgkit/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from sgkit import simulate_genotype_call_dataset
from sgkit.utils import MergeWarning
from sgkit.variables import window_start, window_stop
from sgkit.variables import window_contig, window_start, window_stop
from sgkit.window import (
_get_chunked_windows,
_get_windows,
Expand Down Expand Up @@ -64,13 +64,52 @@ def test_window():
assert not has_windows(ds)
ds = window(ds, 2, 2)
assert has_windows(ds)
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])

with pytest.raises(MergeWarning):
window(ds, 2, 2)


@pytest.mark.parametrize(
"n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp",
[
(
15,
3,
[0, 0, 0, 1, 1, 1, 2, 2, 2],
[0, 2, 4, 5, 7, 9, 10, 12, 14],
[2, 4, 5, 7, 9, 10, 12, 14, 15],
),
(
15,
15,
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
),
(
1,
1,
[0],
[0],
[1],
),
],
)
def test_window__multiple_contigs(
n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp
):
ds = simulate_genotype_call_dataset(
n_variant=n_variant, n_sample=1, n_contig=n_contig
)
ds = window(ds, 2, 2)
np.testing.assert_equal(ds[window_contig].values, window_contigs_exp)
np.testing.assert_equal(ds[window_start].values, window_starts_exp)
np.testing.assert_equal(ds[window_stop].values, window_stops_exp)


@pytest.mark.parametrize(
"start, stop, size, step, window_starts_exp, window_stops_exp",
[
Expand Down
4 changes: 4 additions & 0 deletions sgkit/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,10 @@ def _check_field(
ArrayLikeSpec("variant_t_value")
)
"""T statistics for each beta."""
window_contig, window_contig_spec = SgkitVariables.register_variable(
ArrayLikeSpec("window_contig", kind="i", ndim=1)
)
"""The contig index of each window."""
window_start, window_start_spec = SgkitVariables.register_variable(
ArrayLikeSpec("window_start", kind="i", ndim=1)
)
Expand Down
28 changes: 23 additions & 5 deletions sgkit/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from xarray import Dataset

from sgkit.utils import conditional_merge_datasets
from sgkit.variables import window_start, window_stop
from sgkit.variables import window_contig, window_start, window_stop

from .typing import ArrayLike, DType

Expand Down Expand Up @@ -48,12 +48,31 @@ def window(
The index values of window stop positions.
"""
n_variants = ds.dims["variants"]

length = n_variants
window_starts, window_stops = _get_windows(0, length, size, step)
n_contigs = len(ds.attrs["contigs"])
contig_ids = np.arange(n_contigs)
variant_contig = ds["variant_contig"]
contig_starts = np.searchsorted(variant_contig.values, contig_ids)
contig_bounds = np.append(contig_starts, [n_variants], axis=0)

contig_window_contigs = []
contig_window_starts = []
contig_window_stops = []
for i in range(n_contigs):
starts, stops = _get_windows(contig_bounds[i], contig_bounds[i + 1], size, step)
contig_window_starts.append(starts)
contig_window_stops.append(stops)
contig_window_contigs.append(np.full_like(starts, i))

window_contigs = np.concatenate(contig_window_contigs)
window_starts = np.concatenate(contig_window_starts)
window_stops = np.concatenate(contig_window_stops)

new_ds = Dataset(
{
window_contig: (
"windows",
window_contigs,
),
window_start: (
"windows",
window_starts,
Expand All @@ -71,7 +90,6 @@ def _get_windows(
start: int, stop: int, size: int, step: int
) -> Tuple[ArrayLike, ArrayLike]:
# Find the indexes for the start positions of all windows
# TODO: take contigs into account https://github.com/pystatgen/sgkit/issues/335
window_starts = np.arange(start, stop, step)
window_stops = np.clip(window_starts + size, start, stop)
return window_starts, window_stops
Expand Down