From 71dc01908489c5c2a4bbaa838b161f0721656eb5 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 20 Jan 2022 12:21:23 +0000 Subject: [PATCH 1/2] Fix for time_units in tskit 0.4.1 --- CHANGELOG.rst | 6 ++++++ requirements/CI-tests-conda.txt | 14 +++++++------- tests/test_cli.py | 6 +++++- tests/test_compression.py | 2 +- tszip/compression.py | 6 +++--- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 77f635f..ad92a47 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,9 @@ +-------------------- +[0.2.1] - 2022-01-20 +-------------------- + +- Fix for `time_units` in tskit 0.4.0 (benjeffery, #54, #55) + -------------------- [0.2.0] - 2021-11-08 -------------------- diff --git a/requirements/CI-tests-conda.txt b/requirements/CI-tests-conda.txt index abd6573..cf109c0 100644 --- a/requirements/CI-tests-conda.txt +++ b/requirements/CI-tests-conda.txt @@ -1,8 +1,8 @@ -humanize==3.4.1 +humanize==3.13.1 h5py<3.2 -msprime==1.0.0 -pytest==6.2.3 -pytest-cov==2.11.1 -pytest-xdist==2.2.1 -tskit==0.3.5 -zarr==2.7.1 +msprime==1.1.0 +pytest==6.2.5 +pytest-cov==3.0.0 +pytest-xdist==2.5.0 +tskit==0.4.1 +zarr==2.10.3 diff --git a/tests/test_cli.py b/tests/test_cli.py index 92a4747..7d6193d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -277,7 +277,11 @@ def test_bad_file_format(self): with self.assertRaises(TestException): self.run_tszip([str(self.trees_path)]) mocked_exit.assert_called_once_with( - f"Error loading '{self.trees_path}': File not in KAS format" + f"Error loading '{self.trees_path}': File not in kastore format. If this" + f" file was generated by msprime < 0.6.0 (June 2018) it uses the old" + f" HDF5-based format which can no longer be read directly. Please" + f" convert to the new kastore format using the ``tskit upgrade``" + f" command." 
) def test_compress_stdout(self): diff --git a/tests/test_compression.py b/tests/test_compression.py index eb52f22..51c430f 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -211,7 +211,7 @@ def test_all_fields(self): ) ts = msprime.sim_mutations(ts, rate=1, random_seed=42) tables = ts.dump_tables() - for name, table in tables.name_map.items(): + for name, table in tables.table_name_map.items(): if name not in ["provenances", "edges"]: table.metadata_schema = tskit.MetadataSchema({"codec": "json"}) metadatas = [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))] diff --git a/tszip/compression.py b/tszip/compression.py index 2eeda6e..1324e41 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -192,9 +192,9 @@ def compress_zarr(ts, root, variants_only=False): # Sequence length is stored as an attr for compatibility with older versions of tszip del columns["sequence_length"] - # Schemas and metadata need to be converted to arrays + # Schemas, metadata and units need to be converted to arrays for name in columns: - if name.endswith("metadata_schema"): + if name.endswith("metadata_schema") or name == "time_units": columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8) if name.endswith("metadata"): columns[name] = np.frombuffer(columns[name], np.int8) @@ -302,7 +302,7 @@ def decompress_zarr(root): ) else: dict_repr.setdefault(key, {})[sub_key] = sub_value - elif key.endswith("metadata_schema"): + elif key.endswith("metadata_schema") or key == "time_units": dict_repr[key] = bytes(value).decode("utf-8") elif key.endswith("metadata"): dict_repr[key] = bytes(value) From e631e0d88adba141db9316a4a7c79a3d08bcf7dc Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 20 Jan 2022 14:57:43 +0000 Subject: [PATCH 2/2] Change to type detection --- tszip/compression.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tszip/compression.py b/tszip/compression.py index 1324e41..29e1f43 100644 
--- a/tszip/compression.py +++ b/tszip/compression.py @@ -124,6 +124,14 @@ def __init__(self, name, array, delta_filter=False): self.delta_filter = delta_filter def compress(self, root, compressor): + array_type = "nparray" + if isinstance(self.array, str): + self.array = np.frombuffer(self.array.encode("utf-8"), np.int8) + array_type = "str" + elif isinstance(self.array, bytes): + self.array = np.frombuffer(self.array, np.int8) + array_type = "bytes" + shape = self.array.shape chunks = shape if shape[0] == 0: @@ -140,6 +148,7 @@ def compress(self, root, compressor): filters=filters, compressor=compressor, ) + compressed_array.attrs["tszip_type"] = array_type compressed_array[:] = self.array ratio = 0 if compressed_array.nbytes > 0: @@ -192,13 +201,6 @@ def compress_zarr(ts, root, variants_only=False): # Sequence length is stored as an attr for compatibility with older versions of tszip del columns["sequence_length"] - # Schemas, metadata and units need to be converted to arrays - for name in columns: - if name.endswith("metadata_schema") or name == "time_units": - columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8) - if name.endswith("metadata"): - columns[name] = np.frombuffer(columns[name], np.int8) - # Some columns benefit from being quantised coordinates = np.unique( np.hstack( @@ -302,7 +304,12 @@ def decompress_zarr(root): ) else: dict_repr.setdefault(key, {})[sub_key] = sub_value - elif key.endswith("metadata_schema") or key == "time_units": + elif value.attrs.get("tszip_type") == "str": + dict_repr[key] = bytes(value).decode("utf-8") + elif value.attrs.get("tszip_type") == "bytes": + dict_repr[key] = bytes(value) + # We manually check these for legacy tszips that didn't store type info. + elif key.endswith("metadata_schema"): dict_repr[key] = bytes(value).decode("utf-8") elif key.endswith("metadata"): dict_repr[key] = bytes(value)