Skip to content

Commit 070ce72

Browse files
committed
Add PC Relate
1 parent 480761a commit 070ce72

File tree

3 files changed

+274
-1
lines changed

3 files changed

+274
-1
lines changed

setup.cfg

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ ignore =
6161
profile = black
6262
default_section = THIRDPARTY
6363
known_first_party = sgkit
64-
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
64+
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,statsmodels,xarray,yaml,zarr
6565
multi_line_output = 3
6666
include_trailing_comma = True
6767
force_grid_wrap = 0
@@ -86,6 +86,8 @@ ignore_missing_imports = True
8686
ignore_missing_imports = True
8787
[mypy-setuptools]
8888
ignore_missing_imports = True
89+
[mypy-sgkit_plink.*]
90+
ignore_missing_imports = True
8991
[mypy-sgkit.*]
9092
allow_redefinition = True
9193
[mypy-sgkit.tests.*]

sgkit/stats/pc_relate.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
from typing import Tuple
2+
3+
import dask.array as da
4+
import xarray as xr
5+
6+
from sgkit.typing import ArrayLike
7+
8+
9+
def gramian(a: ArrayLike) -> ArrayLike:
10+
"""Returns gramian matrix of the given matrix"""
11+
return a.T.dot(a)
12+
13+
14+
def _impute_genotype_call_with_variant_mean(
15+
call_g: xr.DataArray, call_g_mask: xr.DataArray
16+
) -> xr.DataArray:
17+
call_g_present = ~call_g_mask # type: ignore[operator]
18+
variant_mean = call_g.where(call_g_present).mean(dim="samples")
19+
imputed_call_g: xr.DataArray = call_g.where(call_g_present, variant_mean)
20+
return imputed_call_g
21+
22+
23+
def _collapse_ploidy(ds: xr.Dataset) -> Tuple[xr.DataArray, xr.DataArray]:
24+
call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
25+
call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy")) # type: ignore[no-untyped-call]
26+
return call_g, call_g_mask
27+
28+
29+
def pc_relate(ds: xr.Dataset, maf: float = 0.01) -> xr.Dataset:
30+
"""Compute PC-Relate as described in Conomos, et al. 2016 [1].
31+
32+
Parameters
33+
----------
34+
ds : `xr.Dataset`
35+
Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC):
36+
* genotype calls: "call_genotype" (SxVxD)
37+
* genotype calls mask: "call_genotype_mask" (SxVxD)
38+
* sample PCs: "sample_pcs" (PCxS)
39+
maf : float
40+
individual minor allele frequency filter. If an individual's estimated
41+
individual-specific minor allele frequency at a SNP is less than this value,
42+
that SNP will be excluded from the analysis for that individual.
43+
The default value is 0.01. Must be between (0.0, 0.1).
44+
45+
46+
This method computes the kinship coefficient matrix. The kinship coefficient for
47+
a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
48+
a random allele selected from ``i`` and a random allele selected from ``j`` at
49+
a locus are IBD. Several of the most common family relationships and their
50+
corresponding kinship coefficient:
51+
52+
+--------------------------------------------------+---------------------+
53+
| Relationship | Kinship coefficient |
54+
+==================================================+=====================+
55+
| Individual-self | 1/2 |
56+
+--------------------------------------------------+---------------------+
57+
| full sister/full brother | 1/4 |
58+
+--------------------------------------------------+---------------------+
59+
| mother/father/daughter/son | 1/4 |
60+
+--------------------------------------------------+---------------------+
61+
| grandmother/grandfather/granddaughter/grandson | 1/8 |
62+
+--------------------------------------------------+---------------------+
63+
| aunt/uncle/niece/nephew | 1/8 |
64+
+--------------------------------------------------+---------------------+
65+
| first cousin | 1/16 |
66+
+--------------------------------------------------+---------------------+
67+
| half-sister/half-brother | 1/8 |
68+
+--------------------------------------------------+---------------------+
69+
70+
Warnings
71+
--------
72+
This function is only applicable to diploid, biallelic datasets.
73+
74+
Returns
75+
-------
76+
Dataset
77+
Dataset containing (S = num samples):
78+
pc_relate_phi: (S,S) ArrayLike
79+
pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].
80+
81+
References
82+
----------
83+
- [1] Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
84+
"Model-Free Estimation of Recent Genetic Relatedness."
85+
American Journal of Human Genetics 98 (1): 127–48.
86+
87+
Raises
88+
------
89+
ValueError
90+
If ploidy of provided dataset != 2
91+
ValueError
92+
If maximum number of alleles in provided dataset != 2
93+
ValueError
94+
Input dataset is missing any of the required variables
95+
ValueError
96+
If maf is not in (0.0, 1.0)
97+
"""
98+
if maf <= 0.0 or maf >= 1.0:
99+
raise ValueError("MAF must be between (0.0, 1.0)")
100+
if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
101+
raise ValueError("PC Relate only works for diploid genotypes")
102+
if "alleles" in ds.dims and ds.dims["alleles"] != 2:
103+
raise ValueError("PC Relate only works for biallelic genotypes")
104+
if "call_genotype" not in ds:
105+
raise ValueError("Input dataset must contain call_genotype")
106+
if "call_genotype_mask" not in ds:
107+
raise ValueError("Input dataset must contain call_genotype_mask")
108+
if "sample_pcs" not in ds:
109+
raise ValueError("Input dataset must contain sample_pcs variable")
110+
111+
call_g, call_g_mask = _collapse_ploidy(ds)
112+
imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
113+
114+
# 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length _s_ vector of 1s, and β = (β1,...,βD)^T
115+
# is a length D vector of regression coefficients for each of the PCs
116+
pcs = ds["sample_pcs"]
117+
pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
118+
# Note: dask qr decomp requires no chunking in one dimension, and because number of
119+
# components should be smaller than number of samples in most cases, we disable
120+
# chunking on components
121+
pcsi = pcsi.T.rechunk((None, -1))
122+
123+
q, r = da.linalg.qr(pcsi)
124+
# mu, eq: 3
125+
half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
126+
mu = pcsi.dot(half_beta).T
127+
# phi, eq: 4
128+
mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
129+
mu_mask = da.ma.masked_array(mu, mask=mask)
130+
variance = mu_mask * (1.0 - mu_mask)
131+
variance = da.ma.filled(variance, fill_value=0.0)
132+
stddev = da.sqrt(variance)
133+
centered_af = call_g / 2 - mu_mask
134+
centered_af = da.ma.filled(centered_af, fill_value=0.0)
135+
# NOTE: gramian could be a performance bottleneck, and we could explore
136+
# performance improvements like (or maybe sth else):
137+
# * calculating only the pairs we are interested in
138+
# * using an optimized einsum.
139+
assert centered_af.shape == call_g.shape
140+
assert stddev.shape == call_g.shape
141+
phi = gramian(centered_af) / gramian(stddev)
142+
# NOTE: phi is of shape (S x S), S = num samples
143+
assert phi.shape == (call_g.shape[1],) * 2
144+
return xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})

sgkit/tests/test_pc_relate.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import numpy as np
2+
import pytest
3+
from hypothesis import given, settings
4+
from hypothesis.extra.numpy import arrays
5+
from statsmodels.multivariate.pca import PCA
6+
7+
from sgkit.stats.pc_relate import (
8+
_collapse_ploidy,
9+
_impute_genotype_call_with_variant_mean,
10+
gramian,
11+
pc_relate,
12+
)
13+
from sgkit.testing import simulate_genotype_call_dataset
14+
15+
16+
def test_pc_relate__genotype_inputs_checks() -> None:
17+
g_wrong_ploidy = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
18+
with pytest.raises(ValueError, match="PC Relate only works for diploid genotypes"):
19+
pc_relate(g_wrong_ploidy)
20+
21+
g_non_biallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
22+
with pytest.raises(
23+
ValueError, match="PC Relate only works for biallelic genotypes"
24+
):
25+
pc_relate(g_non_biallelic)
26+
27+
g_no_pcs = simulate_genotype_call_dataset(100, 10)
28+
with pytest.raises(
29+
ValueError, match="Input dataset must contain sample_pcs variable"
30+
):
31+
pc_relate(g_no_pcs)
32+
33+
with pytest.raises(ValueError, match="Input dataset must contain call_genotype"):
34+
pc_relate(g_no_pcs.drop_vars("call_genotype"))
35+
36+
with pytest.raises(
37+
ValueError, match="Input dataset must contain call_genotype_mask"
38+
):
39+
pc_relate(g_no_pcs.drop_vars("call_genotype_mask"))
40+
41+
42+
def test_pc_relate__maf_inputs_checks() -> None:
43+
g = simulate_genotype_call_dataset(100, 10)
44+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
45+
pc_relate(g, maf=-1)
46+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
47+
pc_relate(g, maf=1.0)
48+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
49+
pc_relate(g, maf=0.0)
50+
51+
52+
@given(arrays(np.int8, (3, 5)))
53+
@settings(max_examples=10)
54+
def test_gramian_is_symmetric(a: np.ndarray) -> None:
55+
b = gramian(a)
56+
assert np.allclose(b, b.T)
57+
58+
59+
def test_collapse_ploidy() -> None:
60+
g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
61+
assert g.call_genotype.shape == (1000, 10, 2)
62+
assert g.call_genotype_mask.shape == (1000, 10, 2)
63+
64+
# Sprinkle some tests data, this is a bit verbose, but in tests verbosity is not bad:
65+
g.call_genotype.loc[dict(variants=1, samples=1, ploidy=0)] = 1
66+
g.call_genotype.loc[dict(variants=1, samples=1, ploidy=1)] = 1
67+
g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=0)] = 0
68+
g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=1)] = 0
69+
70+
g.call_genotype.loc[dict(variants=2, samples=2, ploidy=0)] = 0
71+
g.call_genotype.loc[dict(variants=2, samples=2, ploidy=1)] = 1
72+
g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=0)] = 0
73+
g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=1)] = 0
74+
75+
g.call_genotype.loc[dict(variants=3, samples=3, ploidy=0)] = -1
76+
g.call_genotype.loc[dict(variants=3, samples=3, ploidy=1)] = 1
77+
g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=0)] = 1
78+
g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=1)] = 0
79+
80+
call_g, call_g_mask = _collapse_ploidy(g)
81+
assert call_g.shape == (1000, 10)
82+
assert call_g_mask.shape == (1000, 10)
83+
assert call_g.isel(variants=1, samples=1) == 2
84+
assert call_g.isel(variants=2, samples=2) == 1
85+
assert call_g.isel(variants=3, samples=3) == -1
86+
assert call_g_mask.isel(variants=1, samples=1) == 0
87+
assert call_g_mask.isel(variants=3, samples=3) == 1
88+
89+
90+
def test_impute_genotype_call_with_variant_mean() -> None:
91+
g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
92+
call_g, call_g_mask = _collapse_ploidy(g)
93+
# Sprinkle some tests data
94+
call_g.loc[dict(variants=2)] = 1
95+
call_g.loc[dict(variants=2, samples=1)] = 2
96+
call_g_mask.loc[dict(variants=2)] = False
97+
call_g_mask.loc[dict(variants=2, samples=[0, 9])] = True
98+
imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
99+
assert imputed_call_g.isel(variants=2, samples=1) == 2
100+
assert (imputed_call_g.isel(variants=2, samples=slice(2, 9)) == 1).all()
101+
assert (imputed_call_g.isel(variants=2, samples=[0, 9]) == (7 + 2) / 8).all()
102+
103+
104+
def test_pc_relate__values_within_range() -> None:
105+
n_samples = 100
106+
g = simulate_genotype_call_dataset(1000, n_samples)
107+
call_g, _ = _collapse_ploidy(g)
108+
pcs = PCA(call_g, ncomp=2).loadings
109+
g["sample_pcs"] = (("components", "samples"), pcs.T)
110+
phi = pc_relate(g)
111+
assert phi.pc_relate_phi.shape == (n_samples, n_samples)
112+
data_np = phi.pc_relate_phi.data.compute() # to be able to use fancy indexing below
113+
upper_phi = data_np[np.triu_indices_from(data_np, 1)]
114+
assert (upper_phi > -0.5).all() and (upper_phi < 0.5).all()
115+
116+
117+
def test_pc_relate__identical_sample_should_be_05() -> None:
118+
n_samples = 100
119+
g = simulate_genotype_call_dataset(1000, n_samples, missing_pct=0.1)
120+
call_g, _ = _collapse_ploidy(g)
121+
pcs = PCA(call_g, ncomp=2).loadings
122+
g["sample_pcs"] = (("components", "samples"), pcs.T)
123+
# add identical sample
124+
g.call_genotype.loc[dict(samples=8)] = g.call_genotype.isel(samples=0)
125+
phi = pc_relate(g)
126+
assert phi.pc_relate_phi.shape == (n_samples, n_samples)
127+
assert np.allclose(phi.pc_relate_phi.isel(sample_x=8, sample_y=0), 0.5, atol=0.1)

0 commit comments

Comments
 (0)