9
9
from sgkit .typing import ArrayLike
10
10
from sgkit .utils import conditional_merge_datasets
11
11
12
+ from .. import variables
12
13
from .aggregation import count_cohort_alleles , count_variant_alleles
13
14
14
15
15
16
def diversity (
16
- ds : Dataset , allele_counts : Hashable = "cohort_allele_count" , merge : bool = True
17
+ ds : Dataset ,
18
+ * ,
19
+ allele_counts : Hashable = variables .cohort_allele_count ,
20
+ call_genotype : Hashable = variables .call_genotype ,
21
+ merge : bool = True
17
22
) -> Dataset :
18
23
"""Compute diversity from cohort allele counts.
19
24
@@ -31,19 +36,30 @@ def diversity(
31
36
ds
32
37
Genotype call dataset.
33
38
allele_counts
34
- cohort allele counts to use or calculate.
39
+ cohort allele counts to use or calculate. Defined by
40
+ :data:`sgkit.variables.cohort_allele_count_spec`
41
+ call_genotype
42
+ Input variable name holding call_genotype as defined by
43
+ :data:`sgkit.variables.call_genotype_spec`
44
+ merge
45
+ If True (the default), merge the input dataset and the computed
46
+ output variables into a single dataset, otherwise return only
47
+ the computed output variables.
48
+ See :ref:`dataset_merge` for more details.
35
49
36
50
Returns
37
51
-------
38
- diversity value.
52
+ diversity value, as defined by :data:`sgkit.variables.stat_diversity_spec` .
39
53
40
54
Warnings
41
55
--------
42
56
This method does not currently support datasets that are chunked along the
43
57
samples dimension.
44
58
"""
45
59
if allele_counts not in ds :
46
- ds = count_cohort_alleles (ds )
60
+ ds = count_cohort_alleles (ds , call_genotype = call_genotype )
61
+ else :
62
+ variables .validate (ds , {allele_counts : variables .cohort_allele_count_spec })
47
63
ac = ds [allele_counts ]
48
64
an = ac .sum (axis = 2 )
49
65
n_pairs = an * (an - 1 ) / 2
@@ -55,13 +71,13 @@ def diversity(
55
71
pi_sum = pi .sum (axis = 0 , skipna = False )
56
72
new_ds = Dataset (
57
73
{
58
- " stat_diversity" : (
74
+ variables . stat_diversity : (
59
75
"cohorts" ,
60
76
pi_sum ,
61
77
)
62
78
}
63
79
)
64
- return conditional_merge_datasets (ds , new_ds , merge )
80
+ return conditional_merge_datasets (ds , variables . validate ( new_ds ) , merge )
65
81
66
82
67
83
# c = cohorts, k = alleles
@@ -100,7 +116,11 @@ def _divergence(ac: ArrayLike, an: ArrayLike, out: ArrayLike) -> None:
100
116
101
117
102
118
def divergence (
103
- ds : Dataset , allele_counts : Hashable = "cohort_allele_count" , merge : bool = True
119
+ ds : Dataset ,
120
+ * ,
121
+ call_genotype : Hashable = variables .call_genotype ,
122
+ allele_counts : Hashable = variables .cohort_allele_count ,
123
+ merge : bool = True
104
124
) -> Dataset :
105
125
"""Compute divergence between pairs of cohorts.
106
126
@@ -109,11 +129,21 @@ def divergence(
109
129
ds
110
130
Genotype call dataset.
111
131
allele_counts
112
- cohort allele counts to use or calculate.
132
+ cohort allele counts to use or calculate. Defined by
133
+ :data:`sgkit.variables.cohort_allele_count_spec`
134
+ call_genotype
135
+ Input variable name holding call_genotype as defined by
136
+ :data:`sgkit.variables.call_genotype_spec`
137
+ merge
138
+ If True (the default), merge the input dataset and the computed
139
+ output variables into a single dataset, otherwise return only
140
+ the computed output variables.
141
+ See :ref:`dataset_merge` for more details.
113
142
114
143
Returns
115
144
-------
116
- divergence value between pairs of cohorts.
145
+ divergence value between pairs of cohorts, as defined by
146
+ :data:`sgkit.variables.stat_divergence_spec`.
117
147
118
148
Warnings
119
149
--------
@@ -122,7 +152,9 @@ def divergence(
122
152
"""
123
153
124
154
if allele_counts not in ds :
125
- ds = count_cohort_alleles (ds )
155
+ ds = count_cohort_alleles (ds , call_genotype = call_genotype )
156
+ else :
157
+ variables .validate (ds , {allele_counts : variables .cohort_allele_count_spec })
126
158
ac = ds [allele_counts ]
127
159
an = ac .sum (axis = 2 )
128
160
@@ -137,8 +169,8 @@ def divergence(
137
169
d_sum = d .sum (axis = 0 )
138
170
assert_array_shape (d_sum , n_cohorts , n_cohorts )
139
171
140
- new_ds = Dataset ({" stat_divergence" : (("cohorts_0" , "cohorts_1" ), d_sum )})
141
- return conditional_merge_datasets (ds , new_ds , merge )
172
+ new_ds = Dataset ({variables . stat_divergence : (("cohorts_0" , "cohorts_1" ), d_sum )})
173
+ return conditional_merge_datasets (ds , variables . validate ( new_ds ) , merge )
142
174
143
175
144
176
# c = cohorts
@@ -169,7 +201,11 @@ def _pairwise_sum(d: ArrayLike, out: ArrayLike) -> None:
169
201
170
202
171
203
def Fst (
172
- ds : Dataset , allele_counts : Hashable = "cohort_allele_count" , merge : bool = True
204
+ ds : Dataset ,
205
+ * ,
206
+ call_genotype : Hashable = variables .call_genotype ,
207
+ allele_counts : Hashable = variables .cohort_allele_count ,
208
+ merge : bool = True
173
209
) -> Dataset :
174
210
"""Compute Fst between pairs of cohorts.
175
211
@@ -178,21 +214,35 @@ def Fst(
178
214
ds
179
215
Genotype call dataset.
180
216
allele_counts
181
- cohort allele counts to use or calculate.
217
+ cohort allele counts to use or calculate. Defined by
218
+ :data:`sgkit.variables.cohort_allele_count_spec`
219
+ call_genotype
220
+ Input variable name holding call_genotype as defined by
221
+ :data:`sgkit.variables.call_genotype_spec`
222
+ merge
223
+ If True (the default), merge the input dataset and the computed
224
+ output variables into a single dataset, otherwise return only
225
+ the computed output variables.
226
+ See :ref:`dataset_merge` for more details.
182
227
183
228
Returns
184
229
-------
185
- Fst value between pairs of cohorts.
230
+ Fst value between pairs of cohorts, as defined by
231
+ :data:`sgkit.variables.stat_Fst_spec`.
186
232
187
233
Warnings
188
234
--------
189
235
This method does not currently support datasets that are chunked along the
190
236
samples dimension.
191
237
"""
192
238
if allele_counts not in ds :
193
- ds = count_cohort_alleles (ds )
239
+ ds = count_cohort_alleles (ds , call_genotype = call_genotype )
240
+ else :
241
+ variables .validate (ds , {allele_counts : variables .cohort_allele_count_spec })
194
242
n_cohorts = ds .dims ["cohorts" ]
195
- div = diversity (ds , allele_counts , merge = False ).stat_diversity
243
+ div = diversity (
244
+ ds , allele_counts = allele_counts , call_genotype = call_genotype , merge = False
245
+ ).stat_diversity
196
246
assert_array_shape (div , n_cohorts )
197
247
198
248
# calculate diversity pairs
@@ -201,37 +251,60 @@ def Fst(
201
251
div_pairs = da .map_blocks (_pairwise_sum , div , chunks = shape , dtype = np .float64 )
202
252
assert_array_shape (div_pairs , n_cohorts , n_cohorts )
203
253
204
- gs = divergence (ds , allele_counts , merge = False ).stat_divergence
254
+ gs = divergence (
255
+ ds , allele_counts = allele_counts , call_genotype = call_genotype , merge = False
256
+ ).stat_divergence
205
257
den = div_pairs + 2 * gs
206
258
fst = 1 - (2 * div_pairs / den )
207
- new_ds = Dataset ({" stat_Fst" : fst })
208
- return conditional_merge_datasets (ds , new_ds , merge )
259
+ new_ds = Dataset ({variables . stat_Fst : fst })
260
+ return conditional_merge_datasets (ds , variables . validate ( new_ds ) , merge )
209
261
210
262
211
263
def Tajimas_D (
212
- ds : Dataset , allele_counts : Hashable = "variant_allele_count" , merge : bool = True
264
+ ds : Dataset ,
265
+ * ,
266
+ call_genotype : Hashable = variables .call_genotype ,
267
+ variant_allele_counts : Hashable = variables .variant_allele_count ,
268
+ allele_counts : Hashable = variables .cohort_allele_count ,
269
+ merge : bool = True
213
270
) -> Dataset :
214
271
"""Compute Tajima's D for a genotype call dataset.
215
272
216
273
Parameters
217
274
----------
218
275
ds
219
276
Genotype call dataset.
277
+ variant_allele_counts
278
+ variant allele counts to use or calculate. Defined by
279
+ :data:`sgkit.variables.variant_allele_count_spec`
220
280
allele_counts
221
- allele counts to use or calculate.
281
+ cohort allele counts to use or calculate. Defined by
282
+ :data:`sgkit.variables.cohort_allele_count_spec`
283
+ call_genotype
284
+ Input variable name holding call_genotype as defined by
285
+ :data:`sgkit.variables.call_genotype_spec`
286
+ merge
287
+ If True (the default), merge the input dataset and the computed
288
+ output variables into a single dataset, otherwise return only
289
+ the computed output variables.
290
+ See :ref:`dataset_merge` for more details.
222
291
223
292
Returns
224
293
-------
225
- Tajimas' D value.
294
+ Tajima's D value, as defined by :data:`sgkit.variables.stat_Tajimas_D_spec` .
226
295
227
296
Warnings
228
297
--------
229
298
This method does not currently support datasets that are chunked along the
230
299
samples dimension.
231
300
"""
232
- if allele_counts not in ds :
233
- ds = count_variant_alleles (ds )
234
- ac = ds [allele_counts ]
301
+ if variant_allele_counts not in ds :
302
+ ds = count_variant_alleles (ds , call_genotype = call_genotype )
303
+ else :
304
+ variables .validate (
305
+ ds , {variant_allele_counts : variables .variant_allele_count_spec }
306
+ )
307
+ ac = ds [variant_allele_counts ]
235
308
236
309
# count segregating
237
310
S = ((ac > 0 ).sum (axis = 1 ) > 1 ).sum ()
@@ -246,7 +319,9 @@ def Tajimas_D(
246
319
theta = S / a1
247
320
248
321
# calculate diversity
249
- div = diversity (ds ).stat_diversity
322
+ div = diversity (
323
+ ds , allele_counts = allele_counts , call_genotype = call_genotype , merge = False
324
+ ).stat_diversity
250
325
251
326
# N.B., both theta estimates are usually divided by the number of
252
327
# (accessible) bases but here we want the absolute difference
@@ -268,5 +343,5 @@ def Tajimas_D(
268
343
# finally calculate Tajima's D
269
344
D = d / d_stdev
270
345
271
- new_ds = Dataset ({" stat_Tajimas_D" : D })
272
- return conditional_merge_datasets (ds , new_ds , merge )
346
+ new_ds = Dataset ({variables . stat_Tajimas_D : D })
347
+ return conditional_merge_datasets (ds , variables . validate ( new_ds ) , merge )
0 commit comments