Skip to content

Commit 9655012

Browse files
committed
Remove duplicate indexer, fix zarr v3
1 parent e873d89 commit 9655012

File tree

2 files changed

+3
-58
lines changed

2 files changed

+3
-58
lines changed

bio2zarr/plink.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ def generate_schema(
8282
schema.ZarrArraySpec.new(
8383
vcf_field=None,
8484
name="variant_allele",
85-
dtype="str",
85+
dtype="O",
8686
shape=[m, 2],
8787
dimensions=["variants", "alleles"],
8888
chunks=[variants_chunk_size, 2],
@@ -151,13 +151,10 @@ def convert(
151151
PlinkFormat(bed_path),
152152
target_num_partitions=target_num_partitions,
153153
schema=schema_instance,
154-
# dimension_separator=None,
155-
# max_variant_chunks=None
156154
)
157155
vzw.encode_all_partitions(
158156
worker_processes=worker_processes,
159157
show_progress=show_progress,
160-
# max_memory=None,
161158
)
162159
vzw.finalise(show_progress)
163160

bio2zarr/writer.py

Lines changed: 2 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -641,60 +641,8 @@ def finalise(self, show_progress=False):
641641
def create_index(self):
642642
"""Create an index to support efficient region queries."""
643643

644-
root = zarr.open_group(store=self.path, mode="r+")
645-
646-
contig = root["variant_contig"]
647-
pos = root["variant_position"]
648-
length = root["variant_length"]
649-
650-
assert contig.cdata_shape == pos.cdata_shape
651-
652-
index = []
653-
654-
logger.info("Creating region index")
655-
for v_chunk in range(pos.cdata_shape[0]):
656-
c = contig.blocks[v_chunk]
657-
p = pos.blocks[v_chunk]
658-
e = p + length.blocks[v_chunk] - 1
659-
660-
# create a row for each contig in the chunk
661-
d = np.diff(c, append=-1)
662-
c_start_idx = 0
663-
for c_end_idx in np.nonzero(d)[0]:
664-
assert c[c_start_idx] == c[c_end_idx]
665-
index.append(
666-
(
667-
v_chunk, # chunk index
668-
c[c_start_idx], # contig ID
669-
p[c_start_idx], # start
670-
p[c_end_idx], # end
671-
np.max(e[c_start_idx : c_end_idx + 1]), # max end
672-
c_end_idx - c_start_idx + 1, # num records
673-
)
674-
)
675-
c_start_idx = c_end_idx + 1
676-
677-
index = np.array(index, dtype=pos.dtype)
678-
kwargs = {}
679-
if not zarr_utils.zarr_v3():
680-
kwargs["dimension_separator"] = self.metadata.dimension_separator
681-
array = root.array(
682-
"region_index",
683-
data=index,
684-
shape=index.shape,
685-
chunks=index.shape,
686-
dtype=index.dtype,
687-
compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
688-
fill_value=None,
689-
**kwargs,
690-
)
691-
array.attrs["_ARRAY_DIMENSIONS"] = [
692-
"region_index_values",
693-
"region_index_fields",
694-
]
695-
696-
logger.info("Consolidating Zarr metadata")
697-
zarr.consolidate_metadata(self.path)
644+
indexer = VcfZarrIndexer(self.path)
645+
indexer.create_index()
698646

699647
######################
700648
# encode_all_partitions

0 commit comments

Comments
 (0)