@@ -22,6 +22,36 @@ class ArrayLikeSpec(Spec):
22
22
ndim : Union [None , int , Set [int ]] = None
23
23
24
24
25
+ """
26
+ We define xr.Dataset variables used in the sgkit methods below.
27
+ These definitions:
28
+ * provide documentation
29
+ * specify shapes/types of data
30
+ * are used for internal input/output validation
31
+
32
+ Users writing their own methods do not have to use the validation
33
+ if they don't want to.
34
+
35
+ Regarding documentation, the first sentence of the docstring should
36
+ be a short summary (one sentence); it will appear on the global variable
37
+ summary page. The rest of the docstring will appear on the variable
38
+ specific page.
39
+ """
40
+
41
+ base_prediction = ArrayLikeSpec ("base_prediction" , ndim = 4 , kind = "f" )
42
+ """
43
+ REGENIE's base prediction (blocks, alphas, samples, outcomes). Stage 1
44
+ predictions from ridge regression reduction.
45
+ """
46
+ call_allele_count = ArrayLikeSpec ("call_allele_count" , ndim = 3 , kind = "u" )
47
+ """
48
+ Allele counts. With shape (variants, samples, alleles) and values
49
+ corresponding to the number of non-missing occurrences of each allele.
50
+ """
51
+ call_dosage = ArrayLikeSpec ("call_dosage" , kind = "f" , ndim = 2 )
52
+ """Dosages, encoded as floats, with NaN indicating a missing value."""
53
+ call_dosage_mask = ArrayLikeSpec ("call_dosage_mask" , kind = "b" , ndim = 2 )
54
+ """TODO"""
25
55
call_genotype = ArrayLikeSpec ("call_genotype" , kind = "i" , ndim = 3 )
26
56
"""
27
57
Call genotype. Encoded as allele values (0 for the reference, 1 for
@@ -30,69 +60,23 @@ class ArrayLikeSpec(Spec):
30
60
"""
31
61
call_genotype_mask = ArrayLikeSpec ("call_genotype_mask" , kind = "b" , ndim = 3 )
32
62
"""TODO"""
33
- variant_contig = ArrayLikeSpec ("variant_contig" , kind = "i" , ndim = 1 )
34
- """The (index of the) contig for each variant."""
35
- variant_position = ArrayLikeSpec ("variant_position" , kind = "i" , ndim = 1 )
36
- """The reference position of the variant."""
37
- variant_allele = ArrayLikeSpec ("variant_allele" , kind = {"S" , "O" }, ndim = 2 )
38
- """The possible alleles for the variant."""
39
- sample_id = ArrayLikeSpec ("sample_id" , kind = {"U" , "O" }, ndim = 1 )
40
- """The unique identifier of the sample."""
41
63
call_genotype_phased = ArrayLikeSpec ("call_genotype_phased" , kind = "b" , ndim = 2 )
42
64
"""
43
65
A flag for each call indicating if it is phased or not. If omitted
44
66
all calls are unphased.
45
67
"""
46
- variant_id = ArrayLikeSpec ("variant_id" , kind = "U" , ndim = 1 )
47
- """The unique identifier of the variant."""
48
- call_dosage = ArrayLikeSpec ("call_dosage" , kind = "f" , ndim = 2 )
49
- """Dosages, encoded as floats, with NaN indicating a missing value."""
50
- call_dosage_mask = ArrayLikeSpec ("call_dosage_mask" , kind = "b" , ndim = 2 )
51
- """TODO"""
52
68
call_genotype_probability = ArrayLikeSpec ("call_genotype_probability" , kind = "f" , ndim = 3 )
53
69
"""TODO"""
54
70
call_genotype_probability_mask = ArrayLikeSpec (
55
71
"call_genotype_probability_mask" , kind = "b" , ndim = 3
56
72
)
57
73
"""TODO"""
58
- genotype_counts = ArrayLikeSpec ("genotype_counts" , ndim = 2 , kind = "i" )
59
- """
60
- Genotype counts. Must correspond to an (`N`, 3) array where `N` is equal
61
- to the number of variants and the 3 columns contain heterozygous,
62
- homozygous reference, and homozygous alternate counts (in that order)
63
- across all samples for a variant.
64
- """
65
- call_allele_count = ArrayLikeSpec ("call_allele_count" , ndim = 3 , kind = "u" )
66
- """
67
- Allele counts. With shape (variants, samples, alleles) and values
68
- corresponding to the number of non-missing occurrences of each allele.
69
- """
70
- variant_allele_count = ArrayLikeSpec ("variant_allele_count" , ndim = 2 , kind = "u" )
71
- """
72
- Variant allele counts. With shape (variants, alleles) and values
73
- corresponding to the number of non-missing occurrences of each allele.
74
- """
75
- variant_hwe_p_value = ArrayLikeSpec ("variant_hwe_p_value" , kind = "f" )
76
- """P values from HWE test for each variant as float in [0, 1]."""
77
- variant_beta = ArrayLikeSpec ("variant_beta" )
78
- """Beta values associated with each variant and trait."""
79
- variant_t_value = ArrayLikeSpec ("variant_t_value" )
80
- """T statistics for each beta."""
81
- variant_p_value = ArrayLikeSpec ("variant_p_value" , kind = "f" )
82
- """P values as float in [0, 1]."""
83
74
covariates = ArrayLikeSpec ("covariates" , ndim = {1 , 2 })
84
75
"""
85
76
Covariate variable names. Must correspond to 1 or 2D dataset
86
77
variables of shape (samples[, covariates]). All covariate arrays
87
78
will be concatenated along the second axis (columns).
88
79
"""
89
- traits = ArrayLikeSpec ("traits" , ndim = {1 , 2 })
90
- """
91
- Trait (for example phenotype) variable names. Must all be continuous and
92
- correspond to 1 or 2D dataset variables of shape (samples[, traits]).
93
- 2D trait arrays will be assumed to contain separate traits within columns
94
- and concatenated to any 1D traits along the second axis (columns).
95
- """
96
80
dosage = ArrayLikeSpec ("dosage" )
97
81
"""
98
82
Dosage variable name. Where "dosage" array can represent
@@ -102,19 +86,12 @@ class ArrayLikeSpec(Spec):
102
86
- True dosages as computed from imputed or probabilistic variant calls
103
87
- Any other custom encoding in a user-defined variable
104
88
"""
105
- sample_pcs = ArrayLikeSpec ("sample_pcs" , ndim = 2 , kind = "f" )
106
- """Sample PCs (PCxS)."""
107
- pc_relate_phi = ArrayLikeSpec ("pc_relate_phi" , ndim = 2 , kind = "f" )
108
- """PC Relate kinship coefficient matrix."""
109
- base_prediction = ArrayLikeSpec ("base_prediction" , ndim = 4 , kind = "f" )
110
- """
111
- REGENIE's base prediction (blocks, alphas, samples, outcomes). Stage 1
112
- predictions from ridge regression reduction.
113
- """
114
- meta_prediction = ArrayLikeSpec ("meta_prediction" , ndim = 2 , kind = "f" )
89
+ genotype_counts = ArrayLikeSpec ("genotype_counts" , ndim = 2 , kind = "i" )
115
90
"""
116
- REGENIE's meta_prediction (samples, outcomes). Stage 2 predictions from
117
- the best meta estimator trained on the out-of-sample Stage 1 predictions.
91
+ Genotype counts. Must correspond to an (`N`, 3) array where `N` is equal
92
+ to the number of variants and the 3 columns contain heterozygous,
93
+ homozygous reference, and homozygous alternate counts (in that order)
94
+ across all samples for a variant.
118
95
"""
119
96
loco_prediction = ArrayLikeSpec ("loco_prediction" , ndim = 3 , kind = "f" )
120
97
"""
@@ -123,22 +100,61 @@ class ArrayLikeSpec(Spec):
123
100
held out contigs. This will be absent if the data provided does not contain
124
101
at least 2 contigs.
125
102
"""
126
- variant_n_called = ArrayLikeSpec ("variant_n_called" , ndim = 1 , kind = "i" )
127
- """The number of samples with called genotypes."""
103
+ meta_prediction = ArrayLikeSpec ("meta_prediction" , ndim = 2 , kind = "f" )
104
+ """
105
+ REGENIE's meta_prediction (samples, outcomes). Stage 2 predictions from
106
+ the best meta estimator trained on the out-of-sample Stage 1 predictions.
107
+ """
108
+ pc_relate_phi = ArrayLikeSpec ("pc_relate_phi" , ndim = 2 , kind = "f" )
109
+ """PC Relate kinship coefficient matrix."""
110
+ sample_id = ArrayLikeSpec ("sample_id" , kind = {"U" , "O" }, ndim = 1 )
111
+ """The unique identifier of the sample."""
112
+ sample_pcs = ArrayLikeSpec ("sample_pcs" , ndim = 2 , kind = "f" )
113
+ """Sample PCs (PCxS)."""
114
+ traits = ArrayLikeSpec ("traits" , ndim = {1 , 2 })
115
+ """
116
+ Trait (for example phenotype) variable names. Must all be continuous and
117
+ correspond to 1 or 2D dataset variables of shape (samples[, traits]).
118
+ 2D trait arrays will be assumed to contain separate traits within columns
119
+ and concatenated to any 1D traits along the second axis (columns).
120
+ """
121
+ variant_allele = ArrayLikeSpec ("variant_allele" , kind = {"S" , "O" }, ndim = 2 )
122
+ """The possible alleles for the variant."""
123
+ variant_allele_count = ArrayLikeSpec ("variant_allele_count" , ndim = 2 , kind = "u" )
124
+ """
125
+ Variant allele counts. With shape (variants, alleles) and values
126
+ corresponding to the number of non-missing occurrences of each allele.
127
+ """
128
+ variant_allele_frequency = ArrayLikeSpec ("variant_allele_frequency" , ndim = 2 , kind = "f" )
129
+ """The frequency of the occurrence of each allele."""
130
+ variant_allele_total = ArrayLikeSpec ("variant_allele_total" , ndim = 1 , kind = "i" )
131
+ """The number of occurrences of all alleles."""
132
+ variant_beta = ArrayLikeSpec ("variant_beta" )
133
+ """Beta values associated with each variant and trait."""
128
134
variant_call_rate = ArrayLikeSpec ("variant_call_rate" , ndim = 1 , kind = "f" )
129
135
"""The fraction of samples with called genotypes."""
136
+ variant_contig = ArrayLikeSpec ("variant_contig" , kind = "i" , ndim = 1 )
137
+ """The (index of the) contig for each variant."""
138
+ variant_hwe_p_value = ArrayLikeSpec ("variant_hwe_p_value" , kind = "f" )
139
+ """P values from HWE test for each variant as float in [0, 1]."""
140
+ variant_id = ArrayLikeSpec ("variant_id" , kind = "U" , ndim = 1 )
141
+ """The unique identifier of the variant."""
142
+ variant_n_called = ArrayLikeSpec ("variant_n_called" , ndim = 1 , kind = "i" )
143
+ """The number of samples with called genotypes."""
130
144
variant_n_het = ArrayLikeSpec ("variant_n_het" , ndim = 1 , kind = "i" )
131
145
"""The number of samples with heterozygous calls."""
132
- variant_n_hom_ref = ArrayLikeSpec ("variant_n_hom_ref" , ndim = 1 , kind = "i" )
133
- """The number of samples with homozygous reference calls."""
134
146
variant_n_hom_alt = ArrayLikeSpec ("variant_n_hom_alt" , ndim = 1 , kind = "i" )
135
147
"""The number of samples with homozygous alternate calls."""
148
+ variant_n_hom_ref = ArrayLikeSpec ("variant_n_hom_ref" , ndim = 1 , kind = "i" )
149
+ """The number of samples with homozygous reference calls."""
136
150
variant_n_non_ref = ArrayLikeSpec ("variant_n_non_ref" , ndim = 1 , kind = "i" )
137
151
"""The number of samples that are not homozygous reference calls."""
138
- variant_allele_total = ArrayLikeSpec ("variant_allele_total" , ndim = 1 , kind = "i" )
139
- """The number of occurrences of all alleles."""
140
- variant_allele_frequency = ArrayLikeSpec ("variant_allele_frequency" , ndim = 2 , kind = "f" )
141
- """The frequency of the occurrence of each allele."""
152
+ variant_p_value = ArrayLikeSpec ("variant_p_value" , kind = "f" )
153
+ """P values as float in [0, 1]."""
154
+ variant_position = ArrayLikeSpec ("variant_position" , kind = "i" , ndim = 1 )
155
+ """The reference position of the variant."""
156
+ variant_t_value = ArrayLikeSpec ("variant_t_value" )
157
+ """T statistics for each beta."""
142
158
143
159
144
160
class SgkitVariables :
0 commit comments