Skip to content

Commit 8ca9f8b

Browse files
committed
Sort variables names + add top level comment
1 parent b67d3e5 commit 8ca9f8b

File tree

2 files changed

+105
-89
lines changed

2 files changed

+105
-89
lines changed

docs/api.rst

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -54,41 +54,41 @@ Variables
5454
.. autosummary::
5555
:toctree: generated/
5656

57+
variables.base_prediction
58+
variables.call_allele_count
59+
variables.call_dosage
60+
variables.call_dosage_mask
5761
variables.call_genotype
5862
variables.call_genotype_mask
59-
variables.variant_contig
60-
variables.variant_position
61-
variables.variant_allele
62-
variables.sample_id
6363
variables.call_genotype_phased
64-
variables.variant_id
65-
variables.call_dosage
66-
variables.call_dosage_mask
6764
variables.call_genotype_probability
6865
variables.call_genotype_probability_mask
69-
variables.genotype_counts
70-
variables.call_allele_count
71-
variables.variant_allele_count
72-
variables.variant_hwe_p_value
73-
variables.variant_beta
74-
variables.variant_t_value
75-
variables.variant_p_value
7666
variables.covariates
77-
variables.traits
7867
variables.dosage
79-
variables.sample_pcs
80-
variables.pc_relate_phi
81-
variables.base_prediction
82-
variables.meta_prediction
68+
variables.genotype_counts
8369
variables.loco_prediction
84-
variables.variant_n_called
70+
variables.meta_prediction
71+
variables.pc_relate_phi
72+
variables.sample_id
73+
variables.sample_pcs
74+
variables.traits
75+
variables.variant_allele
76+
variables.variant_allele_count
77+
variables.variant_allele_frequency
78+
variables.variant_allele_total
79+
variables.variant_beta
8580
variables.variant_call_rate
81+
variables.variant_contig
82+
variables.variant_hwe_p_value
83+
variables.variant_id
84+
variables.variant_n_called
8685
variables.variant_n_het
87-
variables.variant_n_hom_ref
8886
variables.variant_n_hom_alt
87+
variables.variant_n_hom_ref
8988
variables.variant_n_non_ref
90-
variables.variant_allele_total
91-
variables.variant_allele_frequency
89+
variables.variant_p_value
90+
variables.variant_position
91+
variables.variant_t_value
9292

9393
Utilities
9494
=========

sgkit/variables.py

Lines changed: 82 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,36 @@ class ArrayLikeSpec(Spec):
2222
ndim: Union[None, int, Set[int]] = None
2323

2424

25+
"""
26+
We define xr.Dataset variables used in the sgkit methods below.
27+
These definitions:
28+
* provide documentation
29+
* specify shapes/types of data
30+
* are used for internal input/output validation
31+
32+
Users writing their own methods do not have to use the validation
33+
if they don't want to.
34+
35+
Regarding documentation, the first sentence of the docstring should
36+
be a short summary (one sentence); it will appear on the global variable
37+
summary page. The rest of the docstring will appear on the variable
38+
specific page.
39+
"""
40+
41+
base_prediction = ArrayLikeSpec("base_prediction", ndim=4, kind="f")
42+
"""
43+
REGENIE's base prediction (blocks, alphas, samples, outcomes). Stage 1
44+
predictions from ridge regression reduction.
45+
"""
46+
call_allele_count = ArrayLikeSpec("call_allele_count", ndim=3, kind="u")
47+
"""
48+
Allele counts. With shape (variants, samples, alleles) and values
49+
corresponding to the number of non-missing occurrences of each allele.
50+
"""
51+
call_dosage = ArrayLikeSpec("call_dosage", kind="f", ndim=2)
52+
"""Dosages, encoded as floats, with NaN indicating a missing value."""
53+
call_dosage_mask = ArrayLikeSpec("call_dosage_mask", kind="b", ndim=2)
54+
"""TODO"""
2555
call_genotype = ArrayLikeSpec("call_genotype", kind="i", ndim=3)
2656
"""
2757
Call genotype. Encoded as allele values (0 for the reference, 1 for
@@ -30,69 +60,23 @@ class ArrayLikeSpec(Spec):
3060
"""
3161
call_genotype_mask = ArrayLikeSpec("call_genotype_mask", kind="b", ndim=3)
3262
"""TODO"""
33-
variant_contig = ArrayLikeSpec("variant_contig", kind="i", ndim=1)
34-
"""The (index of the) contig for each variant."""
35-
variant_position = ArrayLikeSpec("variant_position", kind="i", ndim=1)
36-
"""The reference position of the variant."""
37-
variant_allele = ArrayLikeSpec("variant_allele", kind={"S", "O"}, ndim=2)
38-
"""The possible alleles for the variant."""
39-
sample_id = ArrayLikeSpec("sample_id", kind={"U", "O"}, ndim=1)
40-
"""The unique identifier of the sample."""
4163
call_genotype_phased = ArrayLikeSpec("call_genotype_phased", kind="b", ndim=2)
4264
"""
4365
A flag for each call indicating if it is phased or not. If omitted
4466
all calls are unphased.
4567
"""
46-
variant_id = ArrayLikeSpec("variant_id", kind="U", ndim=1)
47-
"""The unique identifier of the variant."""
48-
call_dosage = ArrayLikeSpec("call_dosage", kind="f", ndim=2)
49-
"""Dosages, encoded as floats, with NaN indicating a missing value."""
50-
call_dosage_mask = ArrayLikeSpec("call_dosage_mask", kind="b", ndim=2)
51-
"""TODO"""
5268
call_genotype_probability = ArrayLikeSpec("call_genotype_probability", kind="f", ndim=3)
5369
"""TODO"""
5470
call_genotype_probability_mask = ArrayLikeSpec(
5571
"call_genotype_probability_mask", kind="b", ndim=3
5672
)
5773
"""TODO"""
58-
genotype_counts = ArrayLikeSpec("genotype_counts", ndim=2, kind="i")
59-
"""
60-
Genotype counts. Must correspond to an (`N`, 3) array where `N` is equal
61-
to the number of variants and the 3 columns contain heterozygous,
62-
homozygous reference, and homozygous alternate counts (in that order)
63-
across all samples for a variant.
64-
"""
65-
call_allele_count = ArrayLikeSpec("call_allele_count", ndim=3, kind="u")
66-
"""
67-
Allele counts. With shape (variants, samples, alleles) and values
68-
corresponding to the number of non-missing occurrences of each allele.
69-
"""
70-
variant_allele_count = ArrayLikeSpec("variant_allele_count", ndim=2, kind="u")
71-
"""
72-
Variant allele counts. With shape (variants, alleles) and values
73-
corresponding to the number of non-missing occurrences of each allele.
74-
"""
75-
variant_hwe_p_value = ArrayLikeSpec("variant_hwe_p_value", kind="f")
76-
"""P values from HWE test for each variant as float in [0, 1]."""
77-
variant_beta = ArrayLikeSpec("variant_beta")
78-
"""Beta values associated with each variant and trait."""
79-
variant_t_value = ArrayLikeSpec("variant_t_value")
80-
"""T statistics for each beta."""
81-
variant_p_value = ArrayLikeSpec("variant_p_value", kind="f")
82-
"""P values as float in [0, 1]."""
8374
covariates = ArrayLikeSpec("covariates", ndim={1, 2})
8475
"""
8576
Covariate variable names. Must correspond to 1 or 2D dataset
8677
variables of shape (samples[, covariates]). All covariate arrays
8778
will be concatenated along the second axis (columns).
8879
"""
89-
traits = ArrayLikeSpec("traits", ndim={1, 2})
90-
"""
91-
Trait (for example phenotype) variable names. Must all be continuous and
92-
correspond to 1 or 2D dataset variables of shape (samples[, traits]).
93-
2D trait arrays will be assumed to contain separate traits within columns
94-
and concatenated to any 1D traits along the second axis (columns).
95-
"""
9680
dosage = ArrayLikeSpec("dosage")
9781
"""
9882
Dosage variable name. Where "dosage" array can represent
@@ -102,19 +86,12 @@ class ArrayLikeSpec(Spec):
10286
- True dosages as computed from imputed or probabilistic variant calls
10387
- Any other custom encoding in a user-defined variable
10488
"""
105-
sample_pcs = ArrayLikeSpec("sample_pcs", ndim=2, kind="f")
106-
"""Sample PCs (PCxS)."""
107-
pc_relate_phi = ArrayLikeSpec("pc_relate_phi", ndim=2, kind="f")
108-
"""PC Relate kinship coefficient matrix."""
109-
base_prediction = ArrayLikeSpec("base_prediction", ndim=4, kind="f")
110-
"""
111-
REGENIE's base prediction (blocks, alphas, samples, outcomes). Stage 1
112-
predictions from ridge regression reduction.
113-
"""
114-
meta_prediction = ArrayLikeSpec("meta_prediction", ndim=2, kind="f")
89+
genotype_counts = ArrayLikeSpec("genotype_counts", ndim=2, kind="i")
11590
"""
116-
REGENIE's meta_prediction (samples, outcomes). Stage 2 predictions from
117-
the best meta estimator trained on the out-of-sample Stage 1 predictions.
91+
Genotype counts. Must correspond to an (`N`, 3) array where `N` is equal
92+
to the number of variants and the 3 columns contain heterozygous,
93+
homozygous reference, and homozygous alternate counts (in that order)
94+
across all samples for a variant.
11895
"""
11996
loco_prediction = ArrayLikeSpec("loco_prediction", ndim=3, kind="f")
12097
"""
@@ -123,22 +100,61 @@ class ArrayLikeSpec(Spec):
123100
held out contigs. This will be absent if the data provided does not contain
124101
at least 2 contigs.
125102
"""
126-
variant_n_called = ArrayLikeSpec("variant_n_called", ndim=1, kind="i")
127-
"""The number of samples with called genotypes."""
103+
meta_prediction = ArrayLikeSpec("meta_prediction", ndim=2, kind="f")
104+
"""
105+
REGENIE's meta_prediction (samples, outcomes). Stage 2 predictions from
106+
the best meta estimator trained on the out-of-sample Stage 1 predictions.
107+
"""
108+
pc_relate_phi = ArrayLikeSpec("pc_relate_phi", ndim=2, kind="f")
109+
"""PC Relate kinship coefficient matrix."""
110+
sample_id = ArrayLikeSpec("sample_id", kind={"U", "O"}, ndim=1)
111+
"""The unique identifier of the sample."""
112+
sample_pcs = ArrayLikeSpec("sample_pcs", ndim=2, kind="f")
113+
"""Sample PCs (PCxS)."""
114+
traits = ArrayLikeSpec("traits", ndim={1, 2})
115+
"""
116+
Trait (for example phenotype) variable names. Must all be continuous and
117+
correspond to 1 or 2D dataset variables of shape (samples[, traits]).
118+
2D trait arrays will be assumed to contain separate traits within columns
119+
and concatenated to any 1D traits along the second axis (columns).
120+
"""
121+
variant_allele = ArrayLikeSpec("variant_allele", kind={"S", "O"}, ndim=2)
122+
"""The possible alleles for the variant."""
123+
variant_allele_count = ArrayLikeSpec("variant_allele_count", ndim=2, kind="u")
124+
"""
125+
Variant allele counts. With shape (variants, alleles) and values
126+
corresponding to the number of non-missing occurrences of each allele.
127+
"""
128+
variant_allele_frequency = ArrayLikeSpec("variant_allele_frequency", ndim=2, kind="f")
129+
"""The frequency of the occurrence of each allele."""
130+
variant_allele_total = ArrayLikeSpec("variant_allele_total", ndim=1, kind="i")
131+
"""The number of occurrences of all alleles."""
132+
variant_beta = ArrayLikeSpec("variant_beta")
133+
"""Beta values associated with each variant and trait."""
128134
variant_call_rate = ArrayLikeSpec("variant_call_rate", ndim=1, kind="f")
129135
"""The fraction of samples with called genotypes."""
136+
variant_contig = ArrayLikeSpec("variant_contig", kind="i", ndim=1)
137+
"""The (index of the) contig for each variant."""
138+
variant_hwe_p_value = ArrayLikeSpec("variant_hwe_p_value", kind="f")
139+
"""P values from HWE test for each variant as float in [0, 1]."""
140+
variant_id = ArrayLikeSpec("variant_id", kind="U", ndim=1)
141+
"""The unique identifier of the variant."""
142+
variant_n_called = ArrayLikeSpec("variant_n_called", ndim=1, kind="i")
143+
"""The number of samples with called genotypes."""
130144
variant_n_het = ArrayLikeSpec("variant_n_het", ndim=1, kind="i")
131145
"""The number of samples with heterozygous calls."""
132-
variant_n_hom_ref = ArrayLikeSpec("variant_n_hom_ref", ndim=1, kind="i")
133-
"""The number of samples with homozygous reference calls."""
134146
variant_n_hom_alt = ArrayLikeSpec("variant_n_hom_alt", ndim=1, kind="i")
135147
"""The number of samples with homozygous alternate calls."""
148+
variant_n_hom_ref = ArrayLikeSpec("variant_n_hom_ref", ndim=1, kind="i")
149+
"""The number of samples with homozygous reference calls."""
136150
variant_n_non_ref = ArrayLikeSpec("variant_n_non_ref", ndim=1, kind="i")
137151
"""The number of samples that are not homozygous reference calls."""
138-
variant_allele_total = ArrayLikeSpec("variant_allele_total", ndim=1, kind="i")
139-
"""The number of occurrences of all alleles."""
140-
variant_allele_frequency = ArrayLikeSpec("variant_allele_frequency", ndim=2, kind="f")
141-
"""The frequency of the occurrence of each allele."""
152+
variant_p_value = ArrayLikeSpec("variant_p_value", kind="f")
153+
"""P values as float in [0, 1]."""
154+
variant_position = ArrayLikeSpec("variant_position", kind="i", ndim=1)
155+
"""The reference position of the variant."""
156+
variant_t_value = ArrayLikeSpec("variant_t_value")
157+
"""T statistics for each beta."""
142158

143159

144160
class SgkitVariables:

0 commit comments

Comments
 (0)