Skip to content

Add tskit.load #75

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
--------------------
[0.2.3] - 2023-XX-XX
--------------------

- Add `tszip.load` which loads both compressed and uncompressed tree sequences
(benjeffery, #75)


--------------------
[0.2.2] - 2022-02-22
--------------------
Expand Down
30 changes: 30 additions & 0 deletions tests/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import msprime
import numpy as np
import pytest
import tskit
import zarr

Expand Down Expand Up @@ -383,3 +384,32 @@ def test_save_dir(self):
ts = msprime.simulate(10, random_seed=1)
with self.assertRaises(OSError):
tszip.compress(ts, self.path.parent)


class TestLoad:
    """
    Tests for ``tszip.load``, which accepts both tszip-compressed and
    plain tskit files.
    """

    def test_missing_file(self):
        """A path that does not exist is reported as FileNotFoundError."""
        missing = "/no/such/file"
        with pytest.raises(FileNotFoundError):
            tszip.load(missing)

    def test_load_dir(self):
        """Pointing ``load`` at a directory is reported as an OSError."""
        this_dir = pathlib.Path(__file__).parent
        with pytest.raises(OSError):
            tszip.load(this_dir)

    def test_wrong_format(self, tmpdir):
        """Files that are neither zip nor tskit format raise on load."""
        target = pathlib.Path(tmpdir) / "treeseq.tsz"

        # A zero-length file is expected to surface as EOFError.
        target.write_text("")
        with pytest.raises(EOFError):
            tszip.load(target)

        # Non-empty junk (short and long) is expected to surface as a
        # tskit file format error.
        for junk in ("1234", "X" * 1024):
            target.write_text(junk)
            with pytest.raises(tskit.FileFormatError):
                tszip.load(target)

    def test_open_both(self):
        """The compressed and uncompressed fixtures load to equal objects."""
        fixtures = pathlib.Path(__file__).parent / "files"
        from_compressed = tszip.load(fixtures / "1.0.0.trees.tsz")
        from_plain = tszip.load(fixtures / "1.0.0.trees")
        assert from_compressed == from_plain
1 change: 1 addition & 0 deletions tszip/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@
# SOFTWARE.
from .compression import compress # NOQA
from .compression import decompress # NOQA
from .compression import load # NOQA
from .compression import print_summary # NOQA
from .provenance import __version__ # NOQA
31 changes: 31 additions & 0 deletions tszip/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,3 +388,34 @@ def visitor(array):
if verbosity > 0:
for line in str(array.info).splitlines():
print("\t", line)


def load(path):
    """
    Load a tree sequence from either a tszip compressed file or a
    standard tskit file. This is a convenience function: it inspects the
    file to decide whether decompression is needed and returns the tree
    sequence instance in both cases.

    Errors raised while opening the file (for example, a path that does
    not exist) propagate to the caller unchanged.

    :param path: The location of the tszip compressed file or standard
        tskit file to load. The value is converted with ``str`` before
        use, so path-like objects are accepted as well as strings.
    :rtype: tskit.TreeSequence
    :return: A :class:`tskit.TreeSequence` instance corresponding to
        the specified file.
    """
    filename = str(path)

    # Sniff the file contents to decide whether this is a zip file; this
    # seems more robust than trusting the file extension or relying on
    # exceptions. Note that `is_zipfile` not only checks the header but
    # also the EOCD record at the end of the file. This means the file is
    # read twice, but as tree sequences are usually less than a few GB
    # this should not be a problem.
    with open(filename, "rb") as stream:
        looks_like_zip = zipfile.is_zipfile(stream)

    if not looks_like_zip:
        # Hand everything else to tskit. A kastore header check would be
        # possible here, but delegating means callers get all the normal
        # tskit exceptions on error.
        return tskit.load(filename)

    return decompress(filename)