Add PC Relate

ravwojdyla · ravwojdyla · commit 8ff5684da7f3 · 2020-09-12T21:23:08.000+02:00
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -8,3 +8,4 @@ statsmodels
 zarr
 sphinx
 sphinx_rtd_theme
+sklearn
diff --git a/setup.cfg b/setup.cfg
@@ -61,7 +61,7 @@ ignore =
 profile = black
 default_section = THIRDPARTY
 known_first_party = sgkit
-known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
+known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,sklearn,xarray,yaml,zarr
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0
@@ -86,6 +86,10 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 [mypy-setuptools]
 ignore_missing_imports = True
+[mypy-sgkit_plink.*]
+ignore_missing_imports = True
+[mypy-sklearn.*]
+ignore_missing_imports = True
 [mypy-sgkit.*]
 allow_redefinition = True
 [mypy-sgkit.tests.*]
diff --git a/sgkit/__init__.py b/sgkit/__init__.py
@@ -11,6 +11,7 @@
 from .stats.aggregation import count_call_alleles, count_variant_alleles
 from .stats.association import gwas_linear_regression
 from .stats.hwe import hardy_weinberg_test
+from .stats.pc_relate import pc_relate
 from .stats.regenie import regenie
 
 __all__ = [
@@ -27,4 +28,5 @@
     "read_vcfzarr",
     "regenie",
     "hardy_weinberg_test",
+    "pc_relate",
 ]
diff --git a/sgkit/stats/pc_relate.py b/sgkit/stats/pc_relate.py
@@ -0,0 +1,144 @@
+from typing import Tuple
+
+import dask.array as da
+import xarray as xr
+
+from sgkit.typing import ArrayLike
+
+
+def gramian(a: ArrayLike) -> ArrayLike:
+    """Returns gramian matrix of the given matrix"""
+    return a.T.dot(a)
+
+
+def _impute_genotype_call_with_variant_mean(
+    call_g: xr.DataArray, call_g_mask: xr.DataArray
+) -> xr.DataArray:
+    call_g_present = ~call_g_mask  # type: ignore[operator]
+    variant_mean = call_g.where(call_g_present).mean(dim="samples")
+    imputed_call_g: xr.DataArray = call_g.where(call_g_present, variant_mean)
+    return imputed_call_g
+
+
+def _collapse_ploidy(ds: xr.Dataset) -> Tuple[xr.DataArray, xr.DataArray]:
+    call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
+    call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]
+    return call_g, call_g_mask
+
+
+def pc_relate(ds: xr.Dataset, maf: float = 0.01) -> xr.Dataset:
+    """Compute PC-Relate as described in Conomos, et al. 2016 [1].
+
+    Parameters
+    ----------
+    ds : `xr.Dataset`
+        Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC):
+            genotype calls: "call_genotype" (VxSxD)
+            genotype calls mask: "call_genotype_mask" (VxSxD)
+            sample PCs: "sample_pcs" (PCxS)
+    maf : float
+        individual minor allele frequency filter. If an individual's estimated
+        individual-specific minor allele frequency at a SNP is less than this value,
+        that SNP will be excluded from the analysis for that individual.
+        The default value is 0.01. Must be in the open interval (0.0, 1.0).
+
+
+    This method computes the kinship coefficient matrix. The kinship coefficient for
+    a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
+    a random allele selected from ``i`` and a random allele selected from ``j`` at
+    a locus are IBD. Several of the most common family relationships and their
+    corresponding kinship coefficient:
+
+    +--------------------------------------------------+---------------------+
+    | Relationship                                     | Kinship coefficient |
+    +==================================================+=====================+
+    | Individual-self                                  | 1/2                 |
+    +--------------------------------------------------+---------------------+
+    | full sister/full brother                         | 1/4                 |
+    +--------------------------------------------------+---------------------+
+    | mother/father/daughter/son                       | 1/4                 |
+    +--------------------------------------------------+---------------------+
+    | grandmother/grandfather/granddaughter/grandson   | 1/8                 |
+    +--------------------------------------------------+---------------------+
+    | aunt/uncle/niece/nephew                          | 1/8                 |
+    +--------------------------------------------------+---------------------+
+    | first cousin                                     | 1/16                |
+    +--------------------------------------------------+---------------------+
+    | half-sister/half-brother                         | 1/8                 |
+    +--------------------------------------------------+---------------------+
+
+    Warnings
+    --------
+    This function is only applicable to diploid, biallelic datasets.
+
+    Returns
+    -------
+    Dataset
+        Dataset containing (S = num samples):
+        pc_relate_phi: (S,S) ArrayLike
+            pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].
+
+    References
+    ----------
+    - [1] Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
+        "Model-Free Estimation of Recent Genetic Relatedness."
+        American Journal of Human Genetics 98 (1): 127–48.
+
+    Raises
+    ------
+    ValueError
+        If ploidy of provided dataset != 2
+    ValueError
+        If maximum number of alleles in provided dataset != 2
+    ValueError
+        If the input dataset is missing any of the required variables
+    ValueError
+        If maf is not in (0.0, 1.0)
+    """
+    if maf <= 0.0 or maf >= 1.0:
+        raise ValueError("MAF must be between (0.0, 1.0)")
+    if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
+        raise ValueError("PC Relate only works for diploid genotypes")
+    if "alleles" in ds.dims and ds.dims["alleles"] != 2:
+        raise ValueError("PC Relate only works for biallelic genotypes")
+    if "call_genotype" not in ds:
+        raise ValueError("Input dataset must contain call_genotype")
+    if "call_genotype_mask" not in ds:
+        raise ValueError("Input dataset must contain call_genotype_mask")
+    if "sample_pcs" not in ds:
+        raise ValueError("Input dataset must contain sample_pcs variable")
+
+    call_g, call_g_mask = _collapse_ploidy(ds)
+    imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
+
+    # 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length _s_ vector of 1s, and β = (β1,...,βD)^T
+    # is a length D vector of regression coefficients for each of the PCs
+    pcs = ds["sample_pcs"]
+    pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
+    # Note: dask qr decomp requires no chunking in one dimension, and because number of
+    # components should be smaller than number of samples in most cases, we disable
+    # chunking on components
+    pcsi = pcsi.T.rechunk((None, -1))
+
+    q, r = da.linalg.qr(pcsi)
+    # mu, eq: 3
+    half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
+    mu = pcsi.dot(half_beta).T
+    # phi, eq: 4
+    mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
+    mu_mask = da.ma.masked_array(mu, mask=mask)
+    variance = mu_mask * (1.0 - mu_mask)
+    variance = da.ma.filled(variance, fill_value=0.0)
+    stddev = da.sqrt(variance)
+    centered_af = call_g / 2 - mu_mask
+    centered_af = da.ma.filled(centered_af, fill_value=0.0)
+    # NOTE: gramian could be a performance bottleneck, and we could explore
+    #       performance improvements like (or maybe something else):
+    #       * calculating only the pairs we are interested in
+    #       * using an optimized einsum.
+    assert centered_af.shape == call_g.shape
+    assert stddev.shape == call_g.shape
+    phi = gramian(centered_af) / gramian(stddev)
+    # NOTE: phi is of shape (S x S), S = num samples
+    assert phi.shape == (call_g.shape[1],) * 2
+    return xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})
diff --git a/sgkit/tests/test_pc_relate.py b/sgkit/tests/test_pc_relate.py
@@ -0,0 +1,188 @@
+import numpy as np
+import pandas as pd
+import pytest
+import xarray as xr
+from hypothesis import given, settings
+from hypothesis.extra.numpy import arrays
+from sklearn.decomposition import PCA
+
+from sgkit.stats.pc_relate import (
+    _collapse_ploidy,
+    _impute_genotype_call_with_variant_mean,
+    gramian,
+    pc_relate,
+)
+from sgkit.testing import simulate_genotype_call_dataset
+
+
+def test_pc_relate__genotype_inputs_checks() -> None:
+    g_wrong_ploidy = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
+    with pytest.raises(ValueError, match="PC Relate only works for diploid genotypes"):
+        pc_relate(g_wrong_ploidy)
+
+    g_non_biallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
+    with pytest.raises(
+        ValueError, match="PC Relate only works for biallelic genotypes"
+    ):
+        pc_relate(g_non_biallelic)
+
+    g_no_pcs = simulate_genotype_call_dataset(100, 10)
+    with pytest.raises(
+        ValueError, match="Input dataset must contain sample_pcs variable"
+    ):
+        pc_relate(g_no_pcs)
+
+    with pytest.raises(ValueError, match="Input dataset must contain call_genotype"):
+        pc_relate(g_no_pcs.drop_vars("call_genotype"))
+
+    with pytest.raises(
+        ValueError, match="Input dataset must contain call_genotype_mask"
+    ):
+        pc_relate(g_no_pcs.drop_vars("call_genotype_mask"))
+
+
+def test_pc_relate__maf_inputs_checks() -> None:
+    g = simulate_genotype_call_dataset(100, 10)
+    with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
+        pc_relate(g, maf=-1)
+    with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
+        pc_relate(g, maf=1.0)
+    with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
+        pc_relate(g, maf=0.0)
+
+
+@given(arrays(np.int8, (3, 5)))
+@settings(max_examples=10)
+def test_gramian_is_symmetric(a: np.ndarray) -> None:
+    b = gramian(a)
+    assert np.allclose(b, b.T)
+
+
+def test_collapse_ploidy() -> None:
+    g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
+    assert g.call_genotype.shape == (1000, 10, 2)
+    assert g.call_genotype_mask.shape == (1000, 10, 2)
+
+    # Test individual cases:
+    g.call_genotype.loc[dict(variants=1, samples=1, ploidy=0)] = 1
+    g.call_genotype.loc[dict(variants=1, samples=1, ploidy=1)] = 1
+    g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=0)] = 0
+    g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=1)] = 0
+
+    g.call_genotype.loc[dict(variants=2, samples=2, ploidy=0)] = 0
+    g.call_genotype.loc[dict(variants=2, samples=2, ploidy=1)] = 1
+    g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=0)] = 0
+    g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=1)] = 0
+
+    g.call_genotype.loc[dict(variants=3, samples=3, ploidy=0)] = -1
+    g.call_genotype.loc[dict(variants=3, samples=3, ploidy=1)] = 1
+    g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=0)] = 1
+    g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=1)] = 0
+
+    call_g, call_g_mask = _collapse_ploidy(g)
+    assert call_g.shape == (1000, 10)
+    assert call_g_mask.shape == (1000, 10)
+    assert call_g.isel(variants=1, samples=1) == 2
+    assert call_g.isel(variants=2, samples=2) == 1
+    assert call_g.isel(variants=3, samples=3) == -1
+    assert call_g_mask.isel(variants=1, samples=1) == 0
+    assert call_g_mask.isel(variants=3, samples=3) == 1
+
+
+def test_impute_genotype_call_with_variant_mean() -> None:
+    g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
+    call_g, call_g_mask = _collapse_ploidy(g)
+    # Test individual cases:
+    call_g.loc[dict(variants=2)] = 1
+    call_g.loc[dict(variants=2, samples=1)] = 2
+    call_g_mask.loc[dict(variants=2)] = False
+    call_g_mask.loc[dict(variants=2, samples=[0, 9])] = True
+    imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
+    assert imputed_call_g.isel(variants=2, samples=1) == 2
+    assert (imputed_call_g.isel(variants=2, samples=slice(2, 9)) == 1).all()
+    assert (imputed_call_g.isel(variants=2, samples=[0, 9]) == (7 + 2) / 8).all()
+
+
+def test_pc_relate__values_within_range() -> None:
+    n_samples = 100
+    g = simulate_genotype_call_dataset(1000, n_samples)
+    call_g, _ = _collapse_ploidy(g)
+    pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
+    g["sample_pcs"] = (("components", "samples"), pcs.T)
+    phi = pc_relate(g)
+    assert phi.pc_relate_phi.shape == (n_samples, n_samples)
+    data_np = phi.pc_relate_phi.data.compute()  # to be able to use fancy indexing below
+    upper_phi = data_np[np.triu_indices_from(data_np, 1)]
+    assert (upper_phi > -0.5).all() and (upper_phi < 0.5).all()
+
+
+def test_pc_relate__identical_sample_should_be_05() -> None:
+    n_samples = 100
+    g = simulate_genotype_call_dataset(1000, n_samples, missing_pct=0.1)
+    call_g, _ = _collapse_ploidy(g)
+    pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
+    g["sample_pcs"] = (("components", "samples"), pcs.T)
+    # Add identical sample
+    g.call_genotype.loc[dict(samples=8)] = g.call_genotype.isel(samples=0)
+    phi = pc_relate(g)
+    assert phi.pc_relate_phi.shape == (n_samples, n_samples)
+    assert np.allclose(phi.pc_relate_phi.isel(sample_x=8, sample_y=0), 0.5, atol=0.1)
+
+
+def test_pc_relate__parent_child_relationship() -> None:
+    # Eric's source: https://github.com/pystatgen/sgkit/pull/228#discussion_r487436876
+
+    # Create a dataset that is 2/3 founders and 1/3 progeny
+    seed = 1
+    rs = np.random.RandomState(seed)
+    ds = simulate_genotype_call_dataset(1000, 300, seed=seed)
+    ds["sample_type"] = xr.DataArray(
+        np.repeat(["mother", "father", "child"], 100), dims="samples"
+    )
+    sample_groups = ds.groupby("sample_type").groups
+
+    def simulate_new_generation(ds: xr.Dataset) -> xr.Dataset:
+        # Generate progeny genotypes as a combination of randomly
+        # selected haplotypes from each parent
+        idx = sample_groups["mother"] + sample_groups["father"]
+        gt = ds.call_genotype.isel(samples=idx).values
+        idx = rs.randint(0, 2, size=gt.shape[:2])
+        # Collapse to haplotype across ploidy dim using indexer
+        # * shape = (samples, variants)
+        ht = gt[np.ix_(*map(range, gt.shape[:2])) + (idx,)].T
+        gt_child = np.stack([ht[sample_groups[t]] for t in ["mother", "father"]]).T
+        ds["call_genotype"].values = np.concatenate((gt, gt_child), axis=1)
+        return ds
+
+    # Redefine the progeny genotypes
+    ds = simulate_new_generation(ds)
+
+    # Infer kinship
+    call_g, _ = _collapse_ploidy(ds)
+    pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
+    ds["sample_pcs"] = (("components", "samples"), pcs.T)
+    ds["pc_relate_phi"] = pc_relate(ds)["pc_relate_phi"].compute()
+
+    # Check that all coefficients are in expected ranges
+    cts = (
+        ds["pc_relate_phi"]
+        .to_series()
+        .reset_index()
+        .pipe(lambda df: df.loc[df.sample_x >= df.sample_y]["pc_relate_phi"])
+        .pipe(
+            pd.cut,
+            bins=[p for phi in [0, 0.25, 0.5] for p in [phi - 0.1, phi + 0.1]],
+            labels=[
+                "unrelated",
+                "unclassified",
+                "parent/child",
+                "unclassified",
+                "self",
+            ],
+            ordered=False,
+        )
+        .value_counts()
+    )
+    assert cts["parent/child"] == len(sample_groups["child"]) * 2
+    assert cts["self"] == ds.dims["samples"]
+    assert cts["unclassified"] == 0

-Original file line number
+Diff line change
 zarr
 sphinx
 sphinx_rtd_theme
 +sklearn