tskit-dev · benjeffery · Jan 20, 2022 · Jan 20, 2022
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,3 +1,9 @@
+--------------------
+[0.2.1] - 2022-01-20
+--------------------
+
+- Fix for `time_units` in tskit 0.4.0 (benjeffery, #54, #55)
+
 --------------------
 [0.2.0] - 2021-11-08
 --------------------

diff --git a/requirements/CI-tests-conda.txt b/requirements/CI-tests-conda.txt
@@ -1,8 +1,8 @@
-humanize==3.4.1
+humanize==3.13.1
 h5py<3.2
-msprime==1.0.0
-pytest==6.2.3
-pytest-cov==2.11.1
-pytest-xdist==2.2.1
-tskit==0.3.5
-zarr==2.7.1
+msprime==1.1.0
+pytest==6.2.5
+pytest-cov==3.0.0
+pytest-xdist==2.5.0
+tskit==0.4.1
+zarr==2.10.3
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -277,7 +277,11 @@ def test_bad_file_format(self):
             with self.assertRaises(TestException):
                 self.run_tszip([str(self.trees_path)])
             mocked_exit.assert_called_once_with(
-                f"Error loading '{self.trees_path}': File not in KAS format"
+                f"Error loading '{self.trees_path}': File not in kastore format. If this"
+                f" file was generated by msprime < 0.6.0 (June 2018) it uses the old"
+                f" HDF5-based format which can no longer be read directly. Please"
+                f" convert to the new kastore format using the ``tskit upgrade``"
+                f" command."
             )
 
     def test_compress_stdout(self):

diff --git a/tests/test_compression.py b/tests/test_compression.py
@@ -211,7 +211,7 @@ def test_all_fields(self):
         )
         ts = msprime.sim_mutations(ts, rate=1, random_seed=42)
         tables = ts.dump_tables()
-        for name, table in tables.name_map.items():
+        for name, table in tables.table_name_map.items():
             if name not in ["provenances", "edges"]:
                 table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
                 metadatas = [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))]

diff --git a/tszip/compression.py b/tszip/compression.py
@@ -124,6 +124,14 @@ def __init__(self, name, array, delta_filter=False):
         self.delta_filter = delta_filter
 
     def compress(self, root, compressor):
+        array_type = "nparray"
+        if isinstance(self.array, str):
+            self.array = np.frombuffer(self.array.encode("utf-8"), np.int8)
+            array_type = "str"
+        elif isinstance(self.array, bytes):
+            self.array = np.frombuffer(self.array, np.int8)
+            array_type = "bytes"
+
         shape = self.array.shape
         chunks = shape
         if shape[0] == 0:
@@ -140,6 +148,7 @@ def compress(self, root, compressor):
             filters=filters,
             compressor=compressor,
         )
+        compressed_array.attrs["tszip_type"] = array_type
         compressed_array[:] = self.array
         ratio = 0
         if compressed_array.nbytes > 0:
@@ -192,13 +201,6 @@ def compress_zarr(ts, root, variants_only=False):
     # Sequence length is stored as an attr for compatibility with older versions of tszip
     del columns["sequence_length"]
 
-    # Schemas and metadata need to be converted to arrays
-    for name in columns:
-        if name.endswith("metadata_schema"):
-            columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
-        if name.endswith("metadata"):
-            columns[name] = np.frombuffer(columns[name], np.int8)
-
     # Some columns benefit from being quantised
     coordinates = np.unique(
         np.hstack(
@@ -302,6 +304,11 @@ def decompress_zarr(root):
                     )
                 else:
                     dict_repr.setdefault(key, {})[sub_key] = sub_value
+        elif value.attrs.get("tszip_type") == "str":
+            dict_repr[key] = bytes(value).decode("utf-8")
+        elif value.attrs.get("tszip_type") == "bytes":
+            dict_repr[key] = bytes(value)
+        # We manually check these for legacy tszips that didn't store type info.
         elif key.endswith("metadata_schema"):
             dict_repr[key] = bytes(value).decode("utf-8")
         elif key.endswith("metadata"):