Skip to content

Add support for reference sequences #59

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

- Fix for `time_units` in tskit 0.4.0 (benjeffery, #54, #55)

- Add support for reference sequence (benjeffery, #59)

--------------------
[0.2.0] - 2021-11-08
--------------------
Expand Down
15 changes: 14 additions & 1 deletion tests/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,19 @@ def test_small_msprime_complex_mutations(self):
)
self.verify(tables.tree_sequence())

def test_ref_seq(self):
ts = msprime.simulate(10, recombination_rate=1, mutation_rate=2, random_seed=2)
tables = ts.tables
tables.reference_sequence.metadata_schema = (
tskit.MetadataSchema.permissive_json()
)
tables.reference_sequence.metadata = {"some": "data"}
tables.reference_sequence.data = "ACTG"
# NOTE: it's unclear whether we'll want to have this set at the same time as
# 'data', but it's useful to have something in all columns for now.
tables.reference_sequence.url = "http://example.com/a_reference"
self.verify(tables.tree_sequence())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume verify calls tables.assert_equals() somewhere along the line? Otherwise we won't actually be testing much here I think.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have verified that verify actually verifies. I changed to use assert_equals anyway as it gives better errors.


def test_mutation_parent_example(self):
tables = tskit.TableCollection(1)
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
Expand Down Expand Up @@ -263,7 +276,7 @@ def verify(self, ts):
path = pathlib.Path(tmpdir) / "treeseq.tsz"
tszip.compress(ts, path)
other_ts = tszip.decompress(path)
self.assertEqual(ts.tables, other_ts.tables)
ts.tables.assert_equals(other_ts.tables)


class TestMetadata(unittest.TestCase):
Expand Down
14 changes: 11 additions & 3 deletions tszip/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,11 @@ def compress_zarr(ts, root, variants_only=False):

# Schemas, metadata and units need to be converted to arrays
for name in columns:
if name.endswith("metadata_schema") or name == "time_units":
if name.endswith("metadata_schema") or name in [
"time_units",
"reference_sequence/data",
"reference_sequence/url",
]:
columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
if name.endswith("metadata"):
columns[name] = np.frombuffer(columns[name], np.int8)
Expand Down Expand Up @@ -296,10 +300,15 @@ def decompress_zarr(root):
for sub_key, sub_value in value.items():
if f"{key}/{sub_key}" in quantised_arrays:
dict_repr.setdefault(key, {})[sub_key] = coordinates[sub_value]
elif sub_key.endswith("metadata_schema"):
elif sub_key.endswith("metadata_schema") or (key, sub_key) in [
("reference_sequence", "data"),
("reference_sequence", "url"),
]:
dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value).decode(
"utf-8"
)
elif (key, sub_key) == ("reference_sequence", "metadata"):
dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value)
else:
dict_repr.setdefault(key, {})[sub_key] = sub_value
elif key.endswith("metadata_schema") or key == "time_units":
Expand All @@ -308,7 +317,6 @@ def decompress_zarr(root):
dict_repr[key] = bytes(value)
else:
dict_repr[key] = value

return tskit.TableCollection.fromdict(dict_repr).tree_sequence()


Expand Down