From 71dc01908489c5c2a4bbaa838b161f0721656eb5 Mon Sep 17 00:00:00 2001
From: Ben Jeffery <ben.jeffery@bdi.ox.ac.uk>
Date: Thu, 20 Jan 2022 12:21:23 +0000
Subject: [PATCH 1/2] Fix for time_units in tskit 0.4.1

---
 CHANGELOG.rst                   |  6 ++++++
 requirements/CI-tests-conda.txt | 14 +++++++-------
 tests/test_cli.py               |  6 +++++-
 tests/test_compression.py       |  2 +-
 tszip/compression.py            |  6 +++---
 5 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 77f635f..ad92a47 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,3 +1,9 @@
+--------------------
+[0.2.1] - 2022-01-20
+--------------------
+
+- Fix for `time_units` in tskit 0.4.0 (benjeffery, #54, #55)
+
 --------------------
 [0.2.0] - 2021-11-08
 --------------------
diff --git a/requirements/CI-tests-conda.txt b/requirements/CI-tests-conda.txt
index abd6573..cf109c0 100644
--- a/requirements/CI-tests-conda.txt
+++ b/requirements/CI-tests-conda.txt
@@ -1,8 +1,8 @@
-humanize==3.4.1
+humanize==3.13.1
 h5py<3.2
-msprime==1.0.0
-pytest==6.2.3
-pytest-cov==2.11.1
-pytest-xdist==2.2.1
-tskit==0.3.5
-zarr==2.7.1
+msprime==1.1.0
+pytest==6.2.5
+pytest-cov==3.0.0
+pytest-xdist==2.5.0
+tskit==0.4.1
+zarr==2.10.3
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 92a4747..7d6193d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -277,7 +277,11 @@ def test_bad_file_format(self):
             with self.assertRaises(TestException):
                 self.run_tszip([str(self.trees_path)])
             mocked_exit.assert_called_once_with(
-                f"Error loading '{self.trees_path}': File not in KAS format"
+                f"Error loading '{self.trees_path}': File not in kastore format. If this"
+                f" file was generated by msprime < 0.6.0 (June 2018) it uses the old"
+                f" HDF5-based format which can no longer be read directly. Please"
+                f" convert to the new kastore format using the ``tskit upgrade``"
+                f" command."
             )
 
     def test_compress_stdout(self):
diff --git a/tests/test_compression.py b/tests/test_compression.py
index eb52f22..51c430f 100644
--- a/tests/test_compression.py
+++ b/tests/test_compression.py
@@ -211,7 +211,7 @@ def test_all_fields(self):
         )
         ts = msprime.sim_mutations(ts, rate=1, random_seed=42)
         tables = ts.dump_tables()
-        for name, table in tables.name_map.items():
+        for name, table in tables.table_name_map.items():
             if name not in ["provenances", "edges"]:
                 table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
                 metadatas = [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))]
diff --git a/tszip/compression.py b/tszip/compression.py
index 2eeda6e..1324e41 100644
--- a/tszip/compression.py
+++ b/tszip/compression.py
@@ -192,9 +192,9 @@ def compress_zarr(ts, root, variants_only=False):
     # Sequence length is stored as an attr for compatibility with older versions of tszip
     del columns["sequence_length"]
 
-    # Schemas and metadata need to be converted to arrays
+    # Schemas, metadata and units need to be converted to arrays
     for name in columns:
-        if name.endswith("metadata_schema"):
+        if name.endswith("metadata_schema") or name == "time_units":
             columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
         if name.endswith("metadata"):
             columns[name] = np.frombuffer(columns[name], np.int8)
@@ -302,7 +302,7 @@ def decompress_zarr(root):
                     )
                 else:
                     dict_repr.setdefault(key, {})[sub_key] = sub_value
-        elif key.endswith("metadata_schema"):
+        elif key.endswith("metadata_schema") or key == "time_units":
             dict_repr[key] = bytes(value).decode("utf-8")
         elif key.endswith("metadata"):
             dict_repr[key] = bytes(value)

From e631e0d88adba141db9316a4a7c79a3d08bcf7dc Mon Sep 17 00:00:00 2001
From: Ben Jeffery <ben.jeffery@bdi.ox.ac.uk>
Date: Thu, 20 Jan 2022 14:57:43 +0000
Subject: [PATCH 2/2] Change to type detection

---
 tszip/compression.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/tszip/compression.py b/tszip/compression.py
index 1324e41..29e1f43 100644
--- a/tszip/compression.py
+++ b/tszip/compression.py
@@ -124,6 +124,14 @@ def __init__(self, name, array, delta_filter=False):
         self.delta_filter = delta_filter
 
     def compress(self, root, compressor):
+        array_type = "nparray"
+        if isinstance(self.array, str):
+            self.array = np.frombuffer(self.array.encode("utf-8"), np.int8)
+            array_type = "str"
+        elif isinstance(self.array, bytes):
+            self.array = np.frombuffer(self.array, np.int8)
+            array_type = "bytes"
+
         shape = self.array.shape
         chunks = shape
         if shape[0] == 0:
@@ -140,6 +148,7 @@ def compress(self, root, compressor):
             filters=filters,
             compressor=compressor,
         )
+        compressed_array.attrs["tszip_type"] = array_type
         compressed_array[:] = self.array
         ratio = 0
         if compressed_array.nbytes > 0:
@@ -192,13 +201,6 @@ def compress_zarr(ts, root, variants_only=False):
     # Sequence length is stored as an attr for compatibility with older versions of tszip
     del columns["sequence_length"]
 
-    # Schemas, metadata and units need to be converted to arrays
-    for name in columns:
-        if name.endswith("metadata_schema") or name == "time_units":
-            columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
-        if name.endswith("metadata"):
-            columns[name] = np.frombuffer(columns[name], np.int8)
-
     # Some columns benefit from being quantised
     coordinates = np.unique(
         np.hstack(
@@ -302,7 +304,12 @@ def decompress_zarr(root):
                     )
                 else:
                     dict_repr.setdefault(key, {})[sub_key] = sub_value
-        elif key.endswith("metadata_schema") or key == "time_units":
+        elif value.attrs.get("tszip_type") == "str":
+            dict_repr[key] = bytes(value).decode("utf-8")
+        elif value.attrs.get("tszip_type") == "bytes":
+            dict_repr[key] = bytes(value)
+        # We manually check these for legacy tszips that didn't store type info.
+        elif key.endswith("metadata_schema"):
             dict_repr[key] = bytes(value).decode("utf-8")
         elif key.endswith("metadata"):
             dict_repr[key] = bytes(value)