1
1
import tempfile
2
2
from pathlib import Path
3
- from typing import List , Optional
3
+ from typing import Hashable , List , Optional
4
4
5
5
import dask .array as da
6
6
import xarray as xr
11
11
from ..model import DIM_VARIANT , create_genotype_call_dataset
12
12
from ..typing import ArrayLike , PathType
13
13
from ..utils import encode_array , max_str_len
14
+ from .vcf .vcf_reader import zarrs_to_dataset
14
15
15
16
16
17
def _ensure_2d (arr : ArrayLike ) -> ArrayLike :
@@ -60,6 +61,7 @@ def vcfzarr_to_zarr(
60
61
grouped_by_contig : bool = False ,
61
62
consolidated : bool = False ,
62
63
tempdir : Optional [PathType ] = None ,
64
+ concat_algorithm : Optional [str ] = None ,
63
65
) -> None :
64
66
"""Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store in sgkit Xarray format.
65
67
@@ -78,6 +80,10 @@ def vcfzarr_to_zarr(
78
80
tempdir
79
81
Temporary directory where intermediate files are stored. The default None means
80
82
use the system default temporary directory.
83
+ concat_algorithm
84
+ The algorithm to use to concatenate and rechunk Zarr files. The default None means
85
+ use the optimized version suitable for large files, whereas ``xarray_internal`` will
86
+ use built-in Xarray APIs, which can exhibit high memory usage, see https://github.com/dask/dask/issues/6745.
81
87
"""
82
88
83
89
if consolidated :
@@ -118,50 +124,16 @@ def vcfzarr_to_zarr(
118
124
contig_zarr_file = Path (tmpdir ) / contig
119
125
ds .to_zarr (contig_zarr_file )
120
126
121
- zarr_files .append (contig_zarr_file )
122
-
123
- zarr_groups = [zarr .open_group (str (f )) for f in zarr_files ]
124
-
125
- first_zarr_group = zarr_groups [0 ]
126
-
127
- with zarr .open_group (str (output )) as output_zarr :
128
-
129
- var_to_attrs = {} # attributes to copy
130
- delayed = [] # do all the rechunking operations in one computation
131
- for var in vars_to_rechunk :
132
- var_to_attrs [var ] = first_zarr_group [var ].attrs .asdict ()
133
- dtype = None
134
- if var == "variant_id" :
135
- kind = first_zarr_group [var ].dtype .kind
136
- max_len = _get_max_len (zarr_groups , "max_variant_id_length" )
137
- dtype = f"{ kind } { max_len } "
138
- elif var == "variant_allele" :
139
- kind = first_zarr_group [var ].dtype .kind
140
- max_len = _get_max_len (zarr_groups , "max_variant_allele_length" )
141
- dtype = f"{ kind } { max_len } "
142
-
143
- arr = concatenate_and_rechunk (
144
- [group [var ] for group in zarr_groups ], dtype = dtype
145
- )
146
- d = arr .to_zarr (
147
- str (output ),
148
- component = var ,
149
- overwrite = True ,
150
- compute = False ,
151
- fill_value = None ,
152
- )
153
- delayed .append (d )
154
- da .compute (* delayed )
155
-
156
- # copy variables that are not rechunked (e.g. sample_id)
157
- for var in vars_to_copy :
158
- output_zarr [var ] = first_zarr_group [var ]
159
- output_zarr [var ].attrs .update (first_zarr_group [var ].attrs )
160
-
161
- # copy attributes
162
- output_zarr .attrs .update (first_zarr_group .attrs )
163
- for (var , attrs ) in var_to_attrs .items ():
164
- output_zarr [var ].attrs .update (attrs )
127
+ zarr_files .append (str (contig_zarr_file ))
128
+
129
+ if concat_algorithm == "xarray_internal" :
130
+ ds = zarrs_to_dataset (zarr_files )
131
+ ds .to_zarr (output , mode = "w" )
132
+ else :
133
+ # Use the optimized algorithm in `concatenate_and_rechunk`
134
+ _concat_zarrs_optimized (
135
+ zarr_files , output , vars_to_rechunk , vars_to_copy
136
+ )
165
137
166
138
167
139
def _vcfzarr_to_dataset (
@@ -222,7 +194,7 @@ def _vcfzarr_to_dataset(
222
194
kind = arr .dtype .kind
223
195
if kind in ["O" , "U" , "S" ]:
224
196
# Compute fixed-length string dtype for array
225
- if kind == "O" :
197
+ if kind == "O" or var in ( "variant_id" , "variant_allele" ) :
226
198
kind = "S"
227
199
max_len = max_str_len (arr ).values
228
200
dt = f"{ kind } { max_len } "
@@ -239,3 +211,51 @@ def _vcfzarr_to_dataset(
239
211
def _get_max_len (zarr_groups : List [zarr .Group ], attr_name : str ) -> int :
240
212
max_len : int = max ([group .attrs [attr_name ] for group in zarr_groups ])
241
213
return max_len
214
+
215
+
216
def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    """Merge per-file Zarr stores into a single store at ``output``.

    Variables listed in ``vars_to_rechunk`` are combined across all input
    stores with ``concatenate_and_rechunk`` and written to ``output`` in one
    dask computation; variables listed in ``vars_to_copy`` are copied verbatim
    from the first input store. Group-level and per-variable attributes are
    restored after the array data has been written.
    """
    groups = [zarr.open_group(f) for f in zarr_files]
    first_group = groups[0]

    with zarr.open_group(str(output)) as output_zarr:
        saved_attrs = {}  # per-variable attributes, restored once arrays are written
        pending = []  # delayed writes, executed together in a single compute
        for var in vars_to_rechunk:
            saved_attrs[var] = first_group[var].attrs.asdict()

            # String variables need a fixed-length dtype wide enough for
            # the longest value found in any of the input stores.
            if var == "variant_id":
                dtype = f"S{_get_max_len(groups, 'max_variant_id_length')}"
            elif var == "variant_allele":
                dtype = f"S{_get_max_len(groups, 'max_variant_allele_length')}"
            else:
                dtype = None

            stacked = concatenate_and_rechunk(
                [g[var] for g in groups], dtype=dtype
            )
            pending.append(
                stacked.to_zarr(
                    str(output),
                    component=var,
                    overwrite=True,
                    compute=False,
                    fill_value=None,
                )
            )
        da.compute(*pending)

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_group[var]
            output_zarr[var].attrs.update(first_group[var].attrs)

        # copy attributes
        output_zarr.attrs.update(first_group.attrs)
        for var, attrs in saved_attrs.items():
            output_zarr[var].attrs.update(attrs)
0 commit comments