Add PC Relate

ravwojdyla · ravwojdyla · commit 070ce72eccb5 · 2020-09-08T17:11:42.000+02:00
diff --git a/setup.cfg b/setup.cfg
@@ -61,7 +61,7 @@ ignore =
 profile = black
 default_section = THIRDPARTY
 known_first_party = sgkit
-known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
+known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,statsmodels,xarray,yaml,zarr
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0
@@ -86,6 +86,8 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 [mypy-setuptools]
 ignore_missing_imports = True
+[mypy-sgkit_plink.*]
+ignore_missing_imports = True
 [mypy-sgkit.*]
 allow_redefinition = True
 [mypy-sgkit.tests.*]
diff --git a/sgkit/stats/pc_relate.py b/sgkit/stats/pc_relate.py
@@ -0,0 +1,144 @@
+from typing import Tuple
+
+import dask.array as da
+import xarray as xr
+
+from sgkit.typing import ArrayLike
+
+
+def gramian(a: ArrayLike) -> ArrayLike:
+    """Return the Gramian matrix (``a.T @ a``) of the given matrix."""
+    return a.T.dot(a)
+
+
+def _impute_genotype_call_with_variant_mean(
+    call_g: xr.DataArray, call_g_mask: xr.DataArray
+) -> xr.DataArray:
+    call_g_present = ~call_g_mask  # type: ignore[operator]
+    variant_mean = call_g.where(call_g_present).mean(dim="samples")
+    imputed_call_g: xr.DataArray = call_g.where(call_g_present, variant_mean)
+    return imputed_call_g
+
+
+def _collapse_ploidy(ds: xr.Dataset) -> Tuple[xr.DataArray, xr.DataArray]:
+    call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
+    call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]
+    return call_g, call_g_mask
+
+
+def pc_relate(ds: xr.Dataset, maf: float = 0.01) -> xr.Dataset:
+    """Compute PC-Relate as described in Conomos, et al. 2016 [1].
+
+    Parameters
+    ----------
+    ds : `xr.Dataset`
+        Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC):
+        * genotype calls: "call_genotype" (VxSxD)
+        * genotype calls mask: "call_genotype_mask" (VxSxD)
+        * sample PCs: "sample_pcs" (PCxS)
+    maf : float
+        individual minor allele frequency filter. If an individual's estimated
+        individual-specific minor allele frequency at a SNP is less than this value,
+        that SNP will be excluded from the analysis for that individual.
+        The default value is 0.01. Must be between (0.0, 1.0).
+
+
+    This method computes the kinship coefficient matrix. The kinship coefficient for
+    a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
+    a random allele selected from ``i`` and a random allele selected from ``j`` at
+    a locus are IBD. Several of the most common family relationships and their
+    corresponding kinship coefficient:
+
+    +--------------------------------------------------+---------------------+
+    | Relationship                                     | Kinship coefficient |
+    +==================================================+=====================+
+    | Individual-self                                  | 1/2                 |
+    +--------------------------------------------------+---------------------+
+    | full sister/full brother                         | 1/4                 |
+    +--------------------------------------------------+---------------------+
+    | mother/father/daughter/son                       | 1/4                 |
+    +--------------------------------------------------+---------------------+
+    | grandmother/grandfather/granddaughter/grandson   | 1/8                 |
+    +--------------------------------------------------+---------------------+
+    | aunt/uncle/niece/nephew                          | 1/8                 |
+    +--------------------------------------------------+---------------------+
+    | first cousin                                     | 1/16                |
+    +--------------------------------------------------+---------------------+
+    | half-sister/half-brother                         | 1/8                 |
+    +--------------------------------------------------+---------------------+
+
+    Warnings
+    --------
+    This function is only applicable to diploid, biallelic datasets.
+
+    Returns
+    -------
+    Dataset
+        Dataset containing (S = num samples):
+        pc_relate_phi: (S,S) ArrayLike
+            pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].
+
+    References
+    ----------
+    - [1] Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
+        "Model-Free Estimation of Recent Genetic Relatedness."
+        American Journal of Human Genetics 98 (1): 127–48.
+
+    Raises
+    ------
+    ValueError
+        If ploidy of provided dataset != 2
+    ValueError
+        If maximum number of alleles in provided dataset != 2
+    ValueError
+        If the input dataset is missing any of the required variables
+    ValueError
+        If maf is not in (0.0, 1.0)
+    """
+    if maf <= 0.0 or maf >= 1.0:
+        raise ValueError("MAF must be between (0.0, 1.0)")
+    if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
+        raise ValueError("PC Relate only works for diploid genotypes")
+    if "alleles" in ds.dims and ds.dims["alleles"] != 2:
+        raise ValueError("PC Relate only works for biallelic genotypes")
+    if "call_genotype" not in ds:
+        raise ValueError("Input dataset must contain call_genotype")
+    if "call_genotype_mask" not in ds:
+        raise ValueError("Input dataset must contain call_genotype_mask")
+    if "sample_pcs" not in ds:
+        raise ValueError("Input dataset must contain sample_pcs variable")
+
+    call_g, call_g_mask = _collapse_ploidy(ds)
+    imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
+
+    # 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length _s_ vector of 1s, and β = (β1,...,βD)^T
+    # is a length D vector of regression coefficients for each of the PCs
+    pcs = ds["sample_pcs"]
+    pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
+    # Note: dask qr decomp requires no chunking in one dimension, and because number of
+    # components should be smaller than number of samples in most cases, we disable
+    # chunking on components
+    pcsi = pcsi.T.rechunk((None, -1))
+
+    q, r = da.linalg.qr(pcsi)
+    # mu, eq: 3
+    half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
+    mu = pcsi.dot(half_beta).T
+    # phi, eq: 4
+    mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
+    mu_mask = da.ma.masked_array(mu, mask=mask)
+    variance = mu_mask * (1.0 - mu_mask)
+    variance = da.ma.filled(variance, fill_value=0.0)
+    stddev = da.sqrt(variance)
+    centered_af = call_g / 2 - mu_mask
+    centered_af = da.ma.filled(centered_af, fill_value=0.0)
+    # NOTE: gramian could be a performance bottleneck, and we could explore
+    #       performance improvements such as (among other options):
+    #       * calculating only the pairs we are interested in
+    #       * using an optimized einsum.
+    assert centered_af.shape == call_g.shape
+    assert stddev.shape == call_g.shape
+    phi = gramian(centered_af) / gramian(stddev)
+    # NOTE: phi is of shape (S x S), S = num samples
+    assert phi.shape == (call_g.shape[1],) * 2
+    return xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})
diff --git a/sgkit/tests/test_pc_relate.py b/sgkit/tests/test_pc_relate.py
@@ -0,0 +1,127 @@
+import numpy as np
+import pytest
+from hypothesis import given, settings
+from hypothesis.extra.numpy import arrays
+from statsmodels.multivariate.pca import PCA
+
+from sgkit.stats.pc_relate import (
+    _collapse_ploidy,
+    _impute_genotype_call_with_variant_mean,
+    gramian,
+    pc_relate,
+)
+from sgkit.testing import simulate_genotype_call_dataset
+
+
+def test_pc_relate__genotype_inputs_checks() -> None:
+    g_wrong_ploidy = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
+    with pytest.raises(ValueError, match="PC Relate only works for diploid genotypes"):
+        pc_relate(g_wrong_ploidy)
+
+    g_non_biallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
+    with pytest.raises(
+        ValueError, match="PC Relate only works for biallelic genotypes"
+    ):
+        pc_relate(g_non_biallelic)
+
+    g_no_pcs = simulate_genotype_call_dataset(100, 10)
+    with pytest.raises(
+        ValueError, match="Input dataset must contain sample_pcs variable"
+    ):
+        pc_relate(g_no_pcs)
+
+    with pytest.raises(ValueError, match="Input dataset must contain call_genotype"):
+        pc_relate(g_no_pcs.drop_vars("call_genotype"))
+
+    with pytest.raises(
+        ValueError, match="Input dataset must contain call_genotype_mask"
+    ):
+        pc_relate(g_no_pcs.drop_vars("call_genotype_mask"))
+
+
+def test_pc_relate__maf_inputs_checks() -> None:
+    g = simulate_genotype_call_dataset(100, 10)
+    with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
+        pc_relate(g, maf=-1)
+    with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
+        pc_relate(g, maf=1.0)
+    with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
+        pc_relate(g, maf=0.0)
+
+
+@given(arrays(np.int8, (3, 5)))
+@settings(max_examples=10)
+def test_gramian_is_symmetric(a: np.ndarray) -> None:
+    b = gramian(a)
+    assert np.allclose(b, b.T)
+
+
+def test_collapse_ploidy() -> None:
+    g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
+    assert g.call_genotype.shape == (1000, 10, 2)
+    assert g.call_genotype_mask.shape == (1000, 10, 2)
+
+    # Sprinkle in some test data; this is a bit verbose, but verbosity is fine in tests:
+    g.call_genotype.loc[dict(variants=1, samples=1, ploidy=0)] = 1
+    g.call_genotype.loc[dict(variants=1, samples=1, ploidy=1)] = 1
+    g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=0)] = 0
+    g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=1)] = 0
+
+    g.call_genotype.loc[dict(variants=2, samples=2, ploidy=0)] = 0
+    g.call_genotype.loc[dict(variants=2, samples=2, ploidy=1)] = 1
+    g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=0)] = 0
+    g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=1)] = 0
+
+    g.call_genotype.loc[dict(variants=3, samples=3, ploidy=0)] = -1
+    g.call_genotype.loc[dict(variants=3, samples=3, ploidy=1)] = 1
+    g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=0)] = 1
+    g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=1)] = 0
+
+    call_g, call_g_mask = _collapse_ploidy(g)
+    assert call_g.shape == (1000, 10)
+    assert call_g_mask.shape == (1000, 10)
+    assert call_g.isel(variants=1, samples=1) == 2
+    assert call_g.isel(variants=2, samples=2) == 1
+    assert call_g.isel(variants=3, samples=3) == -1
+    assert call_g_mask.isel(variants=1, samples=1) == 0
+    assert call_g_mask.isel(variants=3, samples=3) == 1
+
+
+def test_impute_genotype_call_with_variant_mean() -> None:
+    g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
+    call_g, call_g_mask = _collapse_ploidy(g)
+    # Sprinkle in some test data
+    call_g.loc[dict(variants=2)] = 1
+    call_g.loc[dict(variants=2, samples=1)] = 2
+    call_g_mask.loc[dict(variants=2)] = False
+    call_g_mask.loc[dict(variants=2, samples=[0, 9])] = True
+    imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
+    assert imputed_call_g.isel(variants=2, samples=1) == 2
+    assert (imputed_call_g.isel(variants=2, samples=slice(2, 9)) == 1).all()
+    assert (imputed_call_g.isel(variants=2, samples=[0, 9]) == (7 + 2) / 8).all()
+
+
+def test_pc_relate__values_within_range() -> None:
+    n_samples = 100
+    g = simulate_genotype_call_dataset(1000, n_samples)
+    call_g, _ = _collapse_ploidy(g)
+    pcs = PCA(call_g, ncomp=2).loadings
+    g["sample_pcs"] = (("components", "samples"), pcs.T)
+    phi = pc_relate(g)
+    assert phi.pc_relate_phi.shape == (n_samples, n_samples)
+    data_np = phi.pc_relate_phi.data.compute()  # to be able to use fancy indexing below
+    upper_phi = data_np[np.triu_indices_from(data_np, 1)]
+    assert (upper_phi > -0.5).all() and (upper_phi < 0.5).all()
+
+
+def test_pc_relate__identical_sample_should_be_05() -> None:
+    n_samples = 100
+    g = simulate_genotype_call_dataset(1000, n_samples, missing_pct=0.1)
+    call_g, _ = _collapse_ploidy(g)
+    pcs = PCA(call_g, ncomp=2).loadings
+    g["sample_pcs"] = (("components", "samples"), pcs.T)
+    # make sample 8 an identical copy of sample 0
+    g.call_genotype.loc[dict(samples=8)] = g.call_genotype.isel(samples=0)
+    phi = pc_relate(g)
+    assert phi.pc_relate_phi.shape == (n_samples, n_samples)
+    assert np.allclose(phi.pc_relate_phi.isel(sample_x=8, sample_y=0), 0.5, atol=0.1)