Use encode_array from sgkit.

tomwhite · tomwhite · commit 374861712e30 · 2020-07-21T10:31:58.000+01:00
Bring test coverage to 100%.
Add a GitHub Actions workflow to run the tests and build.
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,7 @@ dask[array]
 dask[dataframe]
 fsspec
 numpy
+scipy
 xarray
 bgen_reader
-git+https://github.com/tomwhite/sgkit@dosages
+git+https://github.com/pystatgen/sgkit
diff --git a/sgkit_bgen/bgen_reader.py b/sgkit_bgen/bgen_reader.py
@@ -12,6 +12,7 @@
 from xarray import Dataset
 
 from sgkit import create_genotype_dosage_dataset
+from sgkit.utils import encode_array
 
 PathType = Union[str, Path]
 
@@ -88,9 +89,11 @@ def split(allele_row):
 
     def __getitem__(self, idx):
         if not isinstance(idx, tuple):
-            raise IndexError(f"Indexer must be tuple (received {type(idx)})")
+            raise IndexError(  # pragma: no cover
+                f"Indexer must be tuple (received {type(idx)})"
+            )
         if len(idx) != self.ndim:
-            raise IndexError(
+            raise IndexError(  # pragma: no cover
                 f"Indexer must be two-item tuple (received {len(idx)} slices)"
             )
 
@@ -138,9 +141,9 @@ def read_bgen(
     path : PathType
         Path to BGEN file.
     chunks : Union[str, int, tuple], optional
-        Chunk size for genotype (i.e. `.bed`) data, by default "auto"
+        Chunk size for genotype data, by default "auto"
     lock : bool, optional
-        Whether or not to synchronize concurrent reads of `.bed`
+        Whether or not to synchronize concurrent reads of
         file blocks, by default False. This is passed through to
         [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
     persist : bool, optional
@@ -152,9 +155,7 @@ def read_bgen(
 
     bgen_reader = BgenReader(path, persist)
 
-    variant_contig_names, variant_contig = np.unique(
-        np.array(bgen_reader.contig, dtype=str), return_inverse=True
-    )
+    variant_contig, variant_contig_names = encode_array(bgen_reader.contig.compute())
     variant_contig_names = list(variant_contig_names)
     variant_contig = variant_contig.astype("int16")
 
diff --git a/sgkit_bgen/tests/data/complex.23bits.no.samples.bgen b/sgkit_bgen/tests/data/complex.23bits.no.samples.bgen
diff --git a/sgkit_bgen/tests/test_bgen_reader.py b/sgkit_bgen/tests/test_bgen_reader.py
@@ -16,3 +16,15 @@ def test_read_bgen_with_sample_file(shared_datadir):
     ds = read_bgen(path)
     # Check the sample IDs are the ones from the .sample file
     assert ds["sample/id"].values.tolist() == ["s0", "s1", "s2", "s3"]
+
+
+def test_read_bgen_with_no_samples(shared_datadir):
+    path = shared_datadir / "complex.23bits.no.samples.bgen"
+    ds = read_bgen(path)
+    # Check the sample IDs are generated
+    assert ds["sample/id"].values.tolist() == [
+        "sample_0",
+        "sample_1",
+        "sample_2",
+        "sample_3",
+    ]