Skip to content

Commit 6b75dcc

Browse files
committed
Add PC Relate
1 parent dbdd147 commit 6b75dcc

File tree

6 files changed

+350
-2
lines changed

6 files changed

+350
-2
lines changed

docs/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Methods
3232
regenie
3333
variant_stats
3434
Tajimas_D
35+
pc_relate
3536

3637
Utilities
3738
=========

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ zarr
99
msprime
1010
sphinx
1111
sphinx_rtd_theme
12+
sklearn

setup.cfg

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ ignore =
6262
profile = black
6363
default_section = THIRDPARTY
6464
known_first_party = sgkit
65-
known_third_party = dask,fire,glow,hail,hypothesis,invoke,msprime,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,typing_extensions,xarray,yaml,zarr
65+
known_third_party = dask,fire,glow,hail,hypothesis,invoke,msprime,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,sklearn,typing_extensions,xarray,yaml,zarr
6666
multi_line_output = 3
6767
include_trailing_comma = True
6868
force_grid_wrap = 0
@@ -87,6 +87,10 @@ ignore_missing_imports = True
8787
ignore_missing_imports = True
8888
[mypy-setuptools]
8989
ignore_missing_imports = True
90+
[mypy-sgkit_plink.*]
91+
ignore_missing_imports = True
92+
[mypy-sklearn.*]
93+
ignore_missing_imports = True
9094
[mypy-sgkit.*]
9195
allow_redefinition = True
9296
[mypy-sgkit.tests.*]

sgkit/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .api import ( # noqa: F401
1+
from .api import (
22
DIM_ALLELE,
33
DIM_PLOIDY,
44
DIM_SAMPLE,
@@ -11,6 +11,7 @@
1111
from .stats.aggregation import count_call_alleles, count_variant_alleles, variant_stats
1212
from .stats.association import gwas_linear_regression
1313
from .stats.hwe import hardy_weinberg_test
14+
from .stats.pc_relate import pc_relate
1415
from .stats.popgen import Fst, Tajimas_D, divergence, diversity
1516
from .stats.regenie import regenie
1617

@@ -33,4 +34,5 @@
3334
"divergence",
3435
"Fst",
3536
"Tajimas_D",
37+
"pc_relate",
3638
]

sgkit/stats/pc_relate.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
from typing import Tuple
2+
3+
import dask.array as da
4+
import xarray as xr
5+
6+
from sgkit.typing import ArrayLike
7+
from sgkit.utils import merge_datasets
8+
9+
10+
def gramian(a: ArrayLike) -> ArrayLike:
11+
"""Returns gramian matrix of the given matrix"""
12+
return a.T.dot(a)
13+
14+
15+
def _impute_genotype_call_with_variant_mean(
16+
call_g: xr.DataArray, call_g_mask: xr.DataArray
17+
) -> xr.DataArray:
18+
call_g_present = ~call_g_mask # type: ignore[operator]
19+
variant_mean = call_g.where(call_g_present).mean(dim="samples")
20+
imputed_call_g: xr.DataArray = call_g.where(call_g_present, variant_mean)
21+
return imputed_call_g
22+
23+
24+
def _collapse_ploidy(ds: xr.Dataset) -> Tuple[xr.DataArray, xr.DataArray]:
25+
call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
26+
call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy")) # type: ignore[no-untyped-call]
27+
return call_g, call_g_mask
28+
29+
30+
def pc_relate(ds: xr.Dataset, maf: float = 0.01, merge: bool = True) -> xr.Dataset:
31+
"""Compute PC-Relate as described in Conomos, et al. 2016 [1].
32+
33+
This method computes the kinship coefficient matrix. The kinship coefficient for
34+
a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
35+
a random allele selected from ``i`` and a random allele selected from ``j`` at
36+
a locus are IBD. Several of the most common family relationships and their
37+
corresponding kinship coefficient:
38+
39+
+--------------------------------------------------+---------------------+
40+
| Relationship | Kinship coefficient |
41+
+==================================================+=====================+
42+
| Individual-self | 1/2 |
43+
+--------------------------------------------------+---------------------+
44+
| full sister/full brother | 1/4 |
45+
+--------------------------------------------------+---------------------+
46+
| mother/father/daughter/son | 1/4 |
47+
+--------------------------------------------------+---------------------+
48+
| grandmother/grandfather/granddaughter/grandson | 1/8 |
49+
+--------------------------------------------------+---------------------+
50+
| aunt/uncle/niece/nephew | 1/8 |
51+
+--------------------------------------------------+---------------------+
52+
| first cousin | 1/16 |
53+
+--------------------------------------------------+---------------------+
54+
| half-sister/half-brother | 1/8 |
55+
+--------------------------------------------------+---------------------+
56+
57+
Parameters
58+
----------
59+
ds : Dataset
60+
Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC)
61+
62+
- genotype calls: "call_genotype" (SxVxD)
63+
- genotype calls mask: "call_genotype_mask" (SxVxD)
64+
- sample PCs: "sample_pcs" (PCxS)
65+
maf : float
66+
individual minor allele frequency filter. If an individual's estimated
67+
individual-specific minor allele frequency at a SNP is less than this value,
68+
that SNP will be excluded from the analysis for that individual.
69+
The default value is 0.01. Must be between (0.0, 0.1).
70+
merge : bool, optional
71+
If True (the default), merge the input dataset and the computed
72+
output variables into a single dataset. Output variables will
73+
overwrite any input variables with the same name, and a warning
74+
will be issued in this case.
75+
If False, return only the computed output variables.
76+
77+
Warnings
78+
--------
79+
This function is only applicable to diploid, biallelic datasets.
80+
81+
Returns
82+
-------
83+
Dataset
84+
Dataset containing (S = num samples):
85+
pc_relate_phi: (S,S) ArrayLike
86+
pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].
87+
88+
References
89+
----------
90+
[1] - Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
91+
"Model-Free Estimation of Recent Genetic Relatedness."
92+
American Journal of Human Genetics 98 (1): 127–48.
93+
94+
Raises
95+
------
96+
ValueError
97+
If ploidy of provided dataset != 2
98+
ValueError
99+
If maximum number of alleles in provided dataset != 2
100+
ValueError
101+
Input dataset is missing any of the required variables
102+
ValueError
103+
If maf is not in (0.0, 1.0)
104+
"""
105+
if maf <= 0.0 or maf >= 1.0:
106+
raise ValueError("MAF must be between (0.0, 1.0)")
107+
if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
108+
raise ValueError("PC Relate only works for diploid genotypes")
109+
if "alleles" in ds.dims and ds.dims["alleles"] != 2:
110+
raise ValueError("PC Relate only works for biallelic genotypes")
111+
if "call_genotype" not in ds:
112+
raise ValueError("Input dataset must contain call_genotype")
113+
if "call_genotype_mask" not in ds:
114+
raise ValueError("Input dataset must contain call_genotype_mask")
115+
if "sample_pcs" not in ds:
116+
raise ValueError("Input dataset must contain sample_pcs variable")
117+
118+
call_g, call_g_mask = _collapse_ploidy(ds)
119+
imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
120+
121+
# 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length _s_ vector of 1s, and β = (β1,...,βD)^T
122+
# is a length D vector of regression coefficients for each of the PCs
123+
pcs = ds["sample_pcs"]
124+
pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
125+
# Note: dask qr decomp requires no chunking in one dimension, and because number of
126+
# components should be smaller than number of samples in most cases, we disable
127+
# chunking on components
128+
pcsi = pcsi.T.rechunk((None, -1))
129+
130+
q, r = da.linalg.qr(pcsi)
131+
# mu, eq: 3
132+
half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
133+
mu = pcsi.dot(half_beta).T
134+
# phi, eq: 4
135+
mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
136+
mu_mask = da.ma.masked_array(mu, mask=mask)
137+
variance = mu_mask * (1.0 - mu_mask)
138+
variance = da.ma.filled(variance, fill_value=0.0)
139+
stddev = da.sqrt(variance)
140+
centered_af = call_g / 2 - mu_mask
141+
centered_af = da.ma.filled(centered_af, fill_value=0.0)
142+
# NOTE: gramian could be a performance bottleneck, and we could explore
143+
# performance improvements like (or maybe sth else):
144+
# * calculating only the pairs we are interested in
145+
# * using an optimized einsum.
146+
assert centered_af.shape == call_g.shape
147+
assert stddev.shape == call_g.shape
148+
phi = gramian(centered_af) / gramian(stddev)
149+
# NOTE: phi is of shape (S x S), S = num samples
150+
assert phi.shape == (call_g.shape[1],) * 2
151+
new_ds = xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})
152+
return merge_datasets(ds, new_ds) if merge else new_ds

sgkit/tests/test_pc_relate.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import numpy as np
2+
import pandas as pd
3+
import pytest
4+
import xarray as xr
5+
from hypothesis import given, settings
6+
from hypothesis.extra.numpy import arrays
7+
from sklearn.decomposition import PCA
8+
9+
from sgkit.stats.pc_relate import (
10+
_collapse_ploidy,
11+
_impute_genotype_call_with_variant_mean,
12+
gramian,
13+
pc_relate,
14+
)
15+
from sgkit.testing import simulate_genotype_call_dataset
16+
17+
18+
def test_pc_relate__genotype_inputs_checks() -> None:
19+
g_wrong_ploidy = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
20+
with pytest.raises(ValueError, match="PC Relate only works for diploid genotypes"):
21+
pc_relate(g_wrong_ploidy)
22+
23+
g_non_biallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
24+
with pytest.raises(
25+
ValueError, match="PC Relate only works for biallelic genotypes"
26+
):
27+
pc_relate(g_non_biallelic)
28+
29+
g_no_pcs = simulate_genotype_call_dataset(100, 10)
30+
with pytest.raises(
31+
ValueError, match="Input dataset must contain sample_pcs variable"
32+
):
33+
pc_relate(g_no_pcs)
34+
35+
with pytest.raises(ValueError, match="Input dataset must contain call_genotype"):
36+
pc_relate(g_no_pcs.drop_vars("call_genotype"))
37+
38+
with pytest.raises(
39+
ValueError, match="Input dataset must contain call_genotype_mask"
40+
):
41+
pc_relate(g_no_pcs.drop_vars("call_genotype_mask"))
42+
43+
44+
def test_pc_relate__maf_inputs_checks() -> None:
45+
g = simulate_genotype_call_dataset(100, 10)
46+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
47+
pc_relate(g, maf=-1)
48+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
49+
pc_relate(g, maf=1.0)
50+
with pytest.raises(ValueError, match=r"MAF must be between \(0.0, 1.0\)"):
51+
pc_relate(g, maf=0.0)
52+
53+
54+
@given(arrays(np.int8, (3, 5)))
55+
@settings(max_examples=10)
56+
def test_gramian_is_symmetric(a: np.ndarray) -> None:
57+
b = gramian(a)
58+
assert np.allclose(b, b.T)
59+
60+
61+
def test_collapse_ploidy() -> None:
62+
g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
63+
assert g.call_genotype.shape == (1000, 10, 2)
64+
assert g.call_genotype_mask.shape == (1000, 10, 2)
65+
66+
# Test individual cases:
67+
g.call_genotype.loc[dict(variants=1, samples=1, ploidy=0)] = 1
68+
g.call_genotype.loc[dict(variants=1, samples=1, ploidy=1)] = 1
69+
g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=0)] = 0
70+
g.call_genotype_mask.loc[dict(variants=1, samples=1, ploidy=1)] = 0
71+
72+
g.call_genotype.loc[dict(variants=2, samples=2, ploidy=0)] = 0
73+
g.call_genotype.loc[dict(variants=2, samples=2, ploidy=1)] = 1
74+
g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=0)] = 0
75+
g.call_genotype_mask.loc[dict(variants=2, samples=2, ploidy=1)] = 0
76+
77+
g.call_genotype.loc[dict(variants=3, samples=3, ploidy=0)] = -1
78+
g.call_genotype.loc[dict(variants=3, samples=3, ploidy=1)] = 1
79+
g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=0)] = 1
80+
g.call_genotype_mask.loc[dict(variants=3, samples=3, ploidy=1)] = 0
81+
82+
call_g, call_g_mask = _collapse_ploidy(g)
83+
assert call_g.shape == (1000, 10)
84+
assert call_g_mask.shape == (1000, 10)
85+
assert call_g.isel(variants=1, samples=1) == 2
86+
assert call_g.isel(variants=2, samples=2) == 1
87+
assert call_g.isel(variants=3, samples=3) == -1
88+
assert call_g_mask.isel(variants=1, samples=1) == 0
89+
assert call_g_mask.isel(variants=3, samples=3) == 1
90+
91+
92+
def test_impute_genotype_call_with_variant_mean() -> None:
93+
g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
94+
call_g, call_g_mask = _collapse_ploidy(g)
95+
# Test individual cases:
96+
call_g.loc[dict(variants=2)] = 1
97+
call_g.loc[dict(variants=2, samples=1)] = 2
98+
call_g_mask.loc[dict(variants=2)] = False
99+
call_g_mask.loc[dict(variants=2, samples=[0, 9])] = True
100+
imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
101+
assert imputed_call_g.isel(variants=2, samples=1) == 2
102+
assert (imputed_call_g.isel(variants=2, samples=slice(2, 9)) == 1).all()
103+
assert (imputed_call_g.isel(variants=2, samples=[0, 9]) == (7 + 2) / 8).all()
104+
105+
106+
def test_pc_relate__values_within_range() -> None:
107+
n_samples = 100
108+
g = simulate_genotype_call_dataset(1000, n_samples)
109+
call_g, _ = _collapse_ploidy(g)
110+
pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
111+
g["sample_pcs"] = (("components", "samples"), pcs.T)
112+
phi = pc_relate(g)
113+
assert phi.pc_relate_phi.shape == (n_samples, n_samples)
114+
data_np = phi.pc_relate_phi.data.compute() # to be able to use fancy indexing below
115+
upper_phi = data_np[np.triu_indices_from(data_np, 1)]
116+
assert (upper_phi > -0.5).all() and (upper_phi < 0.5).all()
117+
118+
119+
def test_pc_relate__identical_sample_should_be_05() -> None:
120+
n_samples = 100
121+
g = simulate_genotype_call_dataset(1000, n_samples, missing_pct=0.1)
122+
call_g, _ = _collapse_ploidy(g)
123+
pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
124+
g["sample_pcs"] = (("components", "samples"), pcs.T)
125+
# Add identical sample
126+
g.call_genotype.loc[dict(samples=8)] = g.call_genotype.isel(samples=0)
127+
phi = pc_relate(g)
128+
assert phi.pc_relate_phi.shape == (n_samples, n_samples)
129+
assert np.allclose(phi.pc_relate_phi.isel(sample_x=8, sample_y=0), 0.5, atol=0.1)
130+
131+
132+
def test_pc_relate__parent_child_relationship() -> None:
133+
# Eric's source: https://github.com/pystatgen/sgkit/pull/228#discussion_r487436876
134+
135+
# Create a dataset that is 2/3 founders and 1/3 progeny
136+
seed = 1
137+
rs = np.random.RandomState(seed)
138+
ds = simulate_genotype_call_dataset(1000, 300, seed=seed)
139+
ds["sample_type"] = xr.DataArray(
140+
np.repeat(["mother", "father", "child"], 100), dims="samples"
141+
)
142+
sample_groups = ds.groupby("sample_type").groups
143+
144+
def simulate_new_generation(ds: xr.Dataset) -> xr.Dataset:
145+
# Generate progeny genotypes as a combination of randomly
146+
# selected haplotypes from each parents
147+
idx = sample_groups["mother"] + sample_groups["father"]
148+
gt = ds.call_genotype.isel(samples=idx).values
149+
idx = rs.randint(0, 2, size=gt.shape[:2])
150+
# Collapse to haplotype across ploidy dim using indexer
151+
# * shape = (samples, variants)
152+
ht = gt[np.ix_(*map(range, gt.shape[:2])) + (idx,)].T
153+
gt_child = np.stack([ht[sample_groups[t]] for t in ["mother", "father"]]).T
154+
ds["call_genotype"].values = np.concatenate((gt, gt_child), axis=1)
155+
return ds
156+
157+
# Redefine the progeny genotypes
158+
ds = simulate_new_generation(ds)
159+
160+
# Infer kinship
161+
call_g, _ = _collapse_ploidy(ds)
162+
pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
163+
ds["sample_pcs"] = (("components", "samples"), pcs.T)
164+
ds["pc_relate_phi"] = pc_relate(ds)["pc_relate_phi"].compute()
165+
166+
# Check that all coefficients are in expected ranges
167+
cts = (
168+
ds["pc_relate_phi"]
169+
.to_series()
170+
.reset_index()
171+
.pipe(lambda df: df.loc[df.sample_x >= df.sample_y]["pc_relate_phi"])
172+
.pipe(
173+
pd.cut,
174+
bins=[p for phi in [0, 0.25, 0.5] for p in [phi - 0.1, phi + 0.1]],
175+
labels=[
176+
"unrelated",
177+
"unclassified",
178+
"parent/child",
179+
"unclassified",
180+
"self",
181+
],
182+
ordered=False,
183+
)
184+
.value_counts()
185+
)
186+
assert cts["parent/child"] == len(sample_groups["child"]) * 2
187+
assert cts["self"] == ds.dims["samples"]
188+
assert cts["unclassified"] == 0

0 commit comments

Comments
 (0)