Skip to content

Commit 8ff5684

Browse files
committed
Add PC Relate
1 parent 480761a commit 8ff5684

File tree

5 files changed

+340
-1
lines changed

5 files changed

+340
-1
lines changed

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ statsmodels
88
zarr
99
sphinx
1010
sphinx_rtd_theme
11+
sklearn

setup.cfg

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ ignore =
6161
profile = black
6262
default_section = THIRDPARTY
6363
known_first_party = sgkit
64-
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
64+
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,sklearn,xarray,yaml,zarr
6565
multi_line_output = 3
6666
include_trailing_comma = True
6767
force_grid_wrap = 0
@@ -86,6 +86,10 @@ ignore_missing_imports = True
8686
ignore_missing_imports = True
8787
[mypy-setuptools]
8888
ignore_missing_imports = True
89+
[mypy-sgkit_plink.*]
90+
ignore_missing_imports = True
91+
[mypy-sklearn.*]
92+
ignore_missing_imports = True
8993
[mypy-sgkit.*]
9094
allow_redefinition = True
9195
[mypy-sgkit.tests.*]

sgkit/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .stats.aggregation import count_call_alleles, count_variant_alleles
1212
from .stats.association import gwas_linear_regression
1313
from .stats.hwe import hardy_weinberg_test
14+
from .stats.pc_relate import pc_relate
1415
from .stats.regenie import regenie
1516

1617
__all__ = [
@@ -27,4 +28,5 @@
2728
"read_vcfzarr",
2829
"regenie",
2930
"hardy_weinberg_test",
31+
"pc_relate",
3032
]

sgkit/stats/pc_relate.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
from typing import Tuple
2+
3+
import dask.array as da
4+
import xarray as xr
5+
6+
from sgkit.typing import ArrayLike
7+
8+
9+
def gramian(a: ArrayLike) -> ArrayLike:
10+
"""Returns gramian matrix of the given matrix"""
11+
return a.T.dot(a)
12+
13+
14+
def _impute_genotype_call_with_variant_mean(
15+
call_g: xr.DataArray, call_g_mask: xr.DataArray
16+
) -> xr.DataArray:
17+
call_g_present = ~call_g_mask # type: ignore[operator]
18+
variant_mean = call_g.where(call_g_present).mean(dim="samples")
19+
imputed_call_g: xr.DataArray = call_g.where(call_g_present, variant_mean)
20+
return imputed_call_g
21+
22+
23+
def _collapse_ploidy(ds: xr.Dataset) -> Tuple[xr.DataArray, xr.DataArray]:
24+
call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
25+
call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy")) # type: ignore[no-untyped-call]
26+
return call_g, call_g_mask
27+
28+
29+
def pc_relate(ds: xr.Dataset, maf: float = 0.01) -> xr.Dataset:
30+
"""Compute PC-Relate as described in Conomos, et al. 2016 [1].
31+
32+
Parameters
33+
----------
34+
ds : `xr.Dataset`
35+
Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC):
36+
genotype calls: "call_genotype" (SxVxD)
37+
genotype calls mask: "call_genotype_mask" (SxVxD)
38+
sample PCs: "sample_pcs" (PCxS)
39+
maf : float
40+
individual minor allele frequency filter. If an individual's estimated
41+
individual-specific minor allele frequency at a SNP is less than this value,
42+
that SNP will be excluded from the analysis for that individual.
43+
The default value is 0.01. Must be between (0.0, 0.1).
44+
45+
46+
This method computes the kinship coefficient matrix. The kinship coefficient for
47+
a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
48+
a random allele selected from ``i`` and a random allele selected from ``j`` at
49+
a locus are IBD. Several of the most common family relationships and their
50+
corresponding kinship coefficient:
51+
52+
+--------------------------------------------------+---------------------+
53+
| Relationship | Kinship coefficient |
54+
+==================================================+=====================+
55+
| Individual-self | 1/2 |
56+
+--------------------------------------------------+---------------------+
57+
| full sister/full brother | 1/4 |
58+
+--------------------------------------------------+---------------------+
59+
| mother/father/daughter/son | 1/4 |
60+
+--------------------------------------------------+---------------------+
61+
| grandmother/grandfather/granddaughter/grandson | 1/8 |
62+
+--------------------------------------------------+---------------------+
63+
| aunt/uncle/niece/nephew | 1/8 |
64+
+--------------------------------------------------+---------------------+
65+
| first cousin | 1/16 |
66+
+--------------------------------------------------+---------------------+
67+
| half-sister/half-brother | 1/8 |
68+
+--------------------------------------------------+---------------------+
69+
70+
Warnings
71+
--------
72+
This function is only applicable to diploid, biallelic datasets.
73+
74+
Returns
75+
-------
76+
Dataset
77+
Dataset containing (S = num samples):
78+
pc_relate_phi: (S,S) ArrayLike
79+
pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].
80+
81+
References
82+
----------
83+
- [1] Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
84+
"Model-Free Estimation of Recent Genetic Relatedness."
85+
American Journal of Human Genetics 98 (1): 127–48.
86+
87+
Raises
88+
------
89+
ValueError
90+
If ploidy of provided dataset != 2
91+
ValueError
92+
If maximum number of alleles in provided dataset != 2
93+
ValueError
94+
Input dataset is missing any of the required variables
95+
ValueError
96+
If maf is not in (0.0, 1.0)
97+
"""
98+
if maf <= 0.0 or maf >= 1.0:
99+
raise ValueError("MAF must be between (0.0, 1.0)")
100+
if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
101+
raise ValueError("PC Relate only works for diploid genotypes")
102+
if "alleles" in ds.dims and ds.dims["alleles"] != 2:
103+
raise ValueError("PC Relate only works for biallelic genotypes")
104+
if "call_genotype" not in ds:
105+
raise ValueError("Input dataset must contain call_genotype")
106+
if "call_genotype_mask" not in ds:
107+
raise ValueError("Input dataset must contain call_genotype_mask")
108+
if "sample_pcs" not in ds:
109+
raise ValueError("Input dataset must contain sample_pcs variable")
110+
111+
call_g, call_g_mask = _collapse_ploidy(ds)
112+
imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
113+
114+
# 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length _s_ vector of 1s, and β = (β1,...,βD)^T
115+
# is a length D vector of regression coefficients for each of the PCs
116+
pcs = ds["sample_pcs"]
117+
pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
118+
# Note: dask qr decomp requires no chunking in one dimension, and because number of
119+
# components should be smaller than number of samples in most cases, we disable
120+
# chunking on components
121+
pcsi = pcsi.T.rechunk((None, -1))
122+
123+
q, r = da.linalg.qr(pcsi)
124+
# mu, eq: 3
125+
half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
126+
mu = pcsi.dot(half_beta).T
127+
# phi, eq: 4
128+
mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
129+
mu_mask = da.ma.masked_array(mu, mask=mask)
130+
variance = mu_mask * (1.0 - mu_mask)
131+
variance = da.ma.filled(variance, fill_value=0.0)
132+
stddev = da.sqrt(variance)
133+
centered_af = call_g / 2 - mu_mask
134+
centered_af = da.ma.filled(centered_af, fill_value=0.0)
135+
# NOTE: gramian could be a performance bottleneck, and we could explore
136+
# performance improvements like (or maybe sth else):
137+
# * calculating only the pairs we are interested in
138+
# * using an optimized einsum.
139+
assert centered_af.shape == call_g.shape
140+
assert stddev.shape == call_g.shape
141+
phi = gramian(centered_af) / gramian(stddev)
142+
# NOTE: phi is of shape (S x S), S = num samples
143+
assert phi.shape == (call_g.shape[1],) * 2
144+
return xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})

sgkit/tests/test_pc_relate.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import numpy as np
2+
import pandas as pd
3+
import pytest
4+
import xarray as xr
5+
from hypothesis import given, settings
6+
from hypothesis.extra.numpy import arrays
7+
from sklearn.decomposition import PCA
8+
9+
from sgkit.stats.pc_relate import (
10+
_collapse_ploidy,
11+
_impute_genotype_call_with_variant_mean,
12+
gramian,
13+
pc_relate,
14+
)
15+
from sgkit.testing import simulate_genotype_call_dataset
16+
17+
18+
def test_pc_relate__genotype_inputs_checks() -> None:
19+
g_wrong_ploidy = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
20+
with pytest.raises(ValueError, match="PC Relate only works for diploid genotypes"):
21+
pc_relate(g_wrong_ploidy)
22+
23+
g_non_biallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
24+
with pytest.raises(
25+
ValueError, match="PC Relate only works for biallelic genotypes"
26+
):
27+
pc_relate(g_non_biallelic)
28+
29+
g_no_pcs = simulate_genotype_call_dataset(100, 10)
30+
with pytest.raises(
31+
ValueError, match="Input dataset must contain sample_pcs variable"
32+
):
33+
pc_relate(g_no_pcs)
34+
35+
with pytest.raises(ValueError, match="Input dataset must contain call_genotype"):
36+
pc_relate(g_no_pcs.drop_vars("call_genotype"))
37+
38+
with pytest.raises(
39+
ValueError, match="Input dataset must contain call_genotype_mask"
40+
):
41+
pc_relate(g_no_pcs.drop_vars("call_genotype_mask"))
42+
43+
44+
def test_pc_relate__maf_inputs_checks() -> None:
45+
g = simulate_genotype_call_dataset(100, 10)
46+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
47+
pc_relate(g, maf=-1)
48+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
49+
pc_relate(g, maf=1.0)
50+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
51+
pc_relate(g, maf=0.0)
52+
53+
54+
@given(arrays(np.int8, (3, 5)))
55+
@settings(max_examples=10)
56+
def test_gramian_is_symmetric(a: np.ndarray) -> None:
57+
b = gramian(a)
58+
assert np.allclose(b, b.T)
59+
60+
61+
def test_collapse_ploidy() -> None:
62+
g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
63+
assert g.call_genotype.shape == (1000, 10, 2)
64+
assert g.call_genotype_mask.shape == (1000, 10, 2)
65+
66+
# Test individual cases:
67+
g.call_genotype.loc[dict(variants=1, samples=1, ploidy=0)] = 1
68+
g.call_genotype.loc[dict(variants=1, samples=1, ploidy=1)] = 1
69+
g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=0)] = 0
70+
g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=1)] = 0
71+
72+
g.call_genotype.loc[dict(variants=2, samples=2, ploidy=0)] = 0
73+
g.call_genotype.loc[dict(variants=2, samples=2, ploidy=1)] = 1
74+
g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=0)] = 0
75+
g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=1)] = 0
76+
77+
g.call_genotype.loc[dict(variants=3, samples=3, ploidy=0)] = -1
78+
g.call_genotype.loc[dict(variants=3, samples=3, ploidy=1)] = 1
79+
g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=0)] = 1
80+
g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=1)] = 0
81+
82+
call_g, call_g_mask = _collapse_ploidy(g)
83+
assert call_g.shape == (1000, 10)
84+
assert call_g_mask.shape == (1000, 10)
85+
assert call_g.isel(variants=1, samples=1) == 2
86+
assert call_g.isel(variants=2, samples=2) == 1
87+
assert call_g.isel(variants=3, samples=3) == -1
88+
assert call_g_mask.isel(variants=1, samples=1) == 0
89+
assert call_g_mask.isel(variants=3, samples=3) == 1
90+
91+
92+
def test_impute_genotype_call_with_variant_mean() -> None:
93+
g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
94+
call_g, call_g_mask = _collapse_ploidy(g)
95+
# Test individual cases:
96+
call_g.loc[dict(variants=2)] = 1
97+
call_g.loc[dict(variants=2, samples=1)] = 2
98+
call_g_mask.loc[dict(variants=2)] = False
99+
call_g_mask.loc[dict(variants=2, samples=[0, 9])] = True
100+
imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
101+
assert imputed_call_g.isel(variants=2, samples=1) == 2
102+
assert (imputed_call_g.isel(variants=2, samples=slice(2, 9)) == 1).all()
103+
assert (imputed_call_g.isel(variants=2, samples=[0, 9]) == (7 + 2) / 8).all()
104+
105+
106+
def test_pc_relate__values_within_range() -> None:
107+
n_samples = 100
108+
g = simulate_genotype_call_dataset(1000, n_samples)
109+
call_g, _ = _collapse_ploidy(g)
110+
pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
111+
g["sample_pcs"] = (("components", "samples"), pcs.T)
112+
phi = pc_relate(g)
113+
assert phi.pc_relate_phi.shape == (n_samples, n_samples)
114+
data_np = phi.pc_relate_phi.data.compute() # to be able to use fancy indexing below
115+
upper_phi = data_np[np.triu_indices_from(data_np, 1)]
116+
assert (upper_phi > -0.5).all() and (upper_phi < 0.5).all()
117+
118+
119+
def test_pc_relate__identical_sample_should_be_05() -> None:
120+
n_samples = 100
121+
g = simulate_genotype_call_dataset(1000, n_samples, missing_pct=0.1)
122+
call_g, _ = _collapse_ploidy(g)
123+
pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
124+
g["sample_pcs"] = (("components", "samples"), pcs.T)
125+
# Add identical sample
126+
g.call_genotype.loc[dict(samples=8)] = g.call_genotype.isel(samples=0)
127+
phi = pc_relate(g)
128+
assert phi.pc_relate_phi.shape == (n_samples, n_samples)
129+
assert np.allclose(phi.pc_relate_phi.isel(sample_x=8, sample_y=0), 0.5, atol=0.1)
130+
131+
132+
def test_pc_relate__parent_child_relationship() -> None:
133+
# Eric's source: https://github.com/pystatgen/sgkit/pull/228#discussion_r487436876
134+
135+
# Create a dataset that is 2/3 founders and 1/3 progeny
136+
seed = 1
137+
rs = np.random.RandomState(seed)
138+
ds = simulate_genotype_call_dataset(1000, 300, seed=seed)
139+
ds["sample_type"] = xr.DataArray(
140+
np.repeat(["mother", "father", "child"], 100), dims="samples"
141+
)
142+
sample_groups = ds.groupby("sample_type").groups
143+
144+
def simulate_new_generation(ds: xr.Dataset) -> xr.Dataset:
145+
# Generate progeny genotypes as a combination of randomly
146+
# selected haplotypes from each parents
147+
idx = sample_groups["mother"] + sample_groups["father"]
148+
gt = ds.call_genotype.isel(samples=idx).values
149+
idx = rs.randint(0, 2, size=gt.shape[:2])
150+
# Collapse to haplotype across ploidy dim using indexer
151+
# * shape = (samples, variants)
152+
ht = gt[np.ix_(*map(range, gt.shape[:2])) + (idx,)].T
153+
gt_child = np.stack([ht[sample_groups[t]] for t in ["mother", "father"]]).T
154+
ds["call_genotype"].values = np.concatenate((gt, gt_child), axis=1)
155+
return ds
156+
157+
# Redefine the progeny genotypes
158+
ds = simulate_new_generation(ds)
159+
160+
# Infer kinship
161+
call_g, _ = _collapse_ploidy(ds)
162+
pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
163+
ds["sample_pcs"] = (("components", "samples"), pcs.T)
164+
ds["pc_relate_phi"] = pc_relate(ds)["pc_relate_phi"].compute()
165+
166+
# Check that all coefficients are in expected ranges
167+
cts = (
168+
ds["pc_relate_phi"]
169+
.to_series()
170+
.reset_index()
171+
.pipe(lambda df: df.loc[df.sample_x >= df.sample_y]["pc_relate_phi"])
172+
.pipe(
173+
pd.cut,
174+
bins=[p for phi in [0, 0.25, 0.5] for p in [phi - 0.1, phi + 0.1]],
175+
labels=[
176+
"unrelated",
177+
"unclassified",
178+
"parent/child",
179+
"unclassified",
180+
"self",
181+
],
182+
ordered=False,
183+
)
184+
.value_counts()
185+
)
186+
assert cts["parent/child"] == len(sample_groups["child"]) * 2
187+
assert cts["self"] == ds.dims["samples"]
188+
assert cts["unclassified"] == 0

0 commit comments

Comments
 (0)