From b002f3827677cd7a46d8259a358250bc3db69630 Mon Sep 17 00:00:00 2001
From: Ben Jeffery <ben.jeffery@bdi.ox.ac.uk>
Date: Tue, 25 Jan 2022 15:07:06 +0000
Subject: [PATCH] Add support for reference sequences

---
 CHANGELOG.rst             |  2 ++
 tests/test_compression.py | 15 ++++++++++++++-
 tszip/compression.py      | 14 +++++++++++---
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 7700789..5c655e2 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -4,6 +4,8 @@
 
 - Fix for `time_units` in tskit 0.4.0 (benjeffery, #54, #55)
 
+- Add support for reference sequence (benjeffery, #59)
+
 --------------------
 [0.2.0] - 2021-11-08
 --------------------
diff --git a/tests/test_compression.py b/tests/test_compression.py
index 51c430f..8975042 100644
--- a/tests/test_compression.py
+++ b/tests/test_compression.py
@@ -188,6 +188,19 @@ def test_small_msprime_complex_mutations(self):
                 )
         self.verify(tables.tree_sequence())
 
+    def test_ref_seq(self):
+        ts = msprime.simulate(10, recombination_rate=1, mutation_rate=2, random_seed=2)
+        tables = ts.tables
+        tables.reference_sequence.metadata_schema = (
+            tskit.MetadataSchema.permissive_json()
+        )
+        tables.reference_sequence.metadata = {"some": "data"}
+        tables.reference_sequence.data = "ACTG"
+        # NOTE: it's unclear whether we'll want to have this set at the same time as
+        # 'data', but it's useful to have something in all columns for now.
+        tables.reference_sequence.url = "http://example.com/a_reference"
+        self.verify(tables.tree_sequence())
+
     def test_mutation_parent_example(self):
         tables = tskit.TableCollection(1)
         tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
@@ -263,7 +276,7 @@ def verify(self, ts):
             path = pathlib.Path(tmpdir) / "treeseq.tsz"
             tszip.compress(ts, path)
             other_ts = tszip.decompress(path)
-        self.assertEqual(ts.tables, other_ts.tables)
+        ts.tables.assert_equals(other_ts.tables)
 
 
 class TestMetadata(unittest.TestCase):
diff --git a/tszip/compression.py b/tszip/compression.py
index 1324e41..7326988 100644
--- a/tszip/compression.py
+++ b/tszip/compression.py
@@ -194,7 +194,11 @@ def compress_zarr(ts, root, variants_only=False):
 
     # Schemas, metadata and units need to be converted to arrays
     for name in columns:
-        if name.endswith("metadata_schema") or name == "time_units":
+        if name.endswith("metadata_schema") or name in [
+            "time_units",
+            "reference_sequence/data",
+            "reference_sequence/url",
+        ]:
             columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
         if name.endswith("metadata"):
             columns[name] = np.frombuffer(columns[name], np.int8)
@@ -296,10 +300,15 @@ def decompress_zarr(root):
             for sub_key, sub_value in value.items():
                 if f"{key}/{sub_key}" in quantised_arrays:
                     dict_repr.setdefault(key, {})[sub_key] = coordinates[sub_value]
-                elif sub_key.endswith("metadata_schema"):
+                elif sub_key.endswith("metadata_schema") or (key, sub_key) in [
+                    ("reference_sequence", "data"),
+                    ("reference_sequence", "url"),
+                ]:
                     dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value).decode(
                         "utf-8"
                     )
+                elif (key, sub_key) == ("reference_sequence", "metadata"):
+                    dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value)
                 else:
                     dict_repr.setdefault(key, {})[sub_key] = sub_value
         elif key.endswith("metadata_schema") or key == "time_units":
@@ -308,7 +317,6 @@ def decompress_zarr(root):
             dict_repr[key] = bytes(value)
         else:
             dict_repr[key] = value
-
     return tskit.TableCollection.fromdict(dict_repr).tree_sequence()