From b002f3827677cd7a46d8259a358250bc3db69630 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 25 Jan 2022 15:07:06 +0000 Subject: [PATCH] Add support for reference sequences --- CHANGELOG.rst | 2 ++ tests/test_compression.py | 15 ++++++++++++++- tszip/compression.py | 14 +++++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7700789..5c655e2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,8 @@ - Fix for `time_units` in tskit 0.4.0 (benjeffery, #54, #55) +- Add support for reference sequence (benjeffery, #59) + -------------------- [0.2.0] - 2021-11-08 -------------------- diff --git a/tests/test_compression.py b/tests/test_compression.py index 51c430f..8975042 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -188,6 +188,19 @@ def test_small_msprime_complex_mutations(self): ) self.verify(tables.tree_sequence()) + def test_ref_seq(self): + ts = msprime.simulate(10, recombination_rate=1, mutation_rate=2, random_seed=2) + tables = ts.tables + tables.reference_sequence.metadata_schema = ( + tskit.MetadataSchema.permissive_json() + ) + tables.reference_sequence.metadata = {"some": "data"} + tables.reference_sequence.data = "ACTG" + # NOTE: it's unclear whether we'll want to have this set at the same time as + # 'data', but it's useful to have something in all columns for now. + tables.reference_sequence.url = "http://example.com/a_reference" + self.verify(tables.tree_sequence()) + def test_mutation_parent_example(self): tables = tskit.TableCollection(1) tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) @@ -263,7 +276,7 @@ def verify(self, ts): path = pathlib.Path(tmpdir) / "treeseq.tsz" tszip.compress(ts, path) other_ts = tszip.decompress(path) - self.assertEqual(ts.tables, other_ts.tables) + ts.tables.assert_equals(other_ts.tables) class TestMetadata(unittest.TestCase): diff --git a/tszip/compression.py b/tszip/compression.py index 1324e41..7326988 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -194,7 +194,11 @@ def compress_zarr(ts, root, variants_only=False): # Schemas, metadata and units need to be converted to arrays for name in columns: - if name.endswith("metadata_schema") or name == "time_units": + if name.endswith("metadata_schema") or name in [ + "time_units", + "reference_sequence/data", + "reference_sequence/url", + ]: columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8) if name.endswith("metadata"): columns[name] = np.frombuffer(columns[name], np.int8) @@ -296,10 +300,15 @@ def decompress_zarr(root): for sub_key, sub_value in value.items(): if f"{key}/{sub_key}" in quantised_arrays: dict_repr.setdefault(key, {})[sub_key] = coordinates[sub_value] - elif sub_key.endswith("metadata_schema"): + elif sub_key.endswith("metadata_schema") or (key, sub_key) in [ + ("reference_sequence", "data"), + ("reference_sequence", "url"), + ]: dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value).decode( "utf-8" ) + elif (key, sub_key) == ("reference_sequence", "metadata"): + dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value) else: dict_repr.setdefault(key, {})[sub_key] = sub_value elif key.endswith("metadata_schema") or key == "time_units": @@ -308,7 +317,6 @@ def decompress_zarr(root): dict_repr[key] = bytes(value) else: dict_repr[key] = value - return tskit.TableCollection.fromdict(dict_repr).tree_sequence()