[ENH] KCD and Bregman tests #15
@@ -1,3 +1,5 @@
-from . import fisherz, kci
+from . import discrepancy, independence
 from ._version import __version__  # noqa: F401
-from .independence import Methods, independence_test
+from .api import Methods, independence_test
+from .discrepancy import bregman, kcd  # noqa: F401
+from .independence import fisherz, kci  # noqa: F401
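For orientation, a minimal sketch of the import surface this hunk sets up (names taken from the diff above; nothing else about the API is assumed):

from pywhy_stats import Methods, independence_test
from pywhy_stats.discrepancy import bregman, kcd  # the new KCD and Bregman tests
from pywhy_stats.independence import fisherz, kci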
@@ -0,0 +1 @@
from . import bregman, kcd
@@ -0,0 +1,131 @@
from typing import Callable, Optional

import numpy as np
from joblib import Parallel, delayed
from numpy.typing import ArrayLike
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression

from pywhy_stats.kernel_utils import _default_regularization

Review comment (on the function name): Since it is not really preprocessing something but rather validates that the parameters/inputs are correctly specified, what about calling it …?

def _preprocess_propensity_data(
    group_ind: ArrayLike,
    propensity_model: Optional[BaseEstimator],
    propensity_weights: Optional[ArrayLike],
):
    if group_ind.ndim != 1:
        raise RuntimeError("group_ind must be a 1d array.")
    if len(np.unique(group_ind)) != 2:
        raise RuntimeError(
            f"There should only be two groups. Found {len(np.unique(group_ind))} groups."
        )
    if propensity_model is not None and propensity_weights is not None:
        raise ValueError(
            "Both propensity model and propensity estimates are specified. Only one is allowed."
        )
    if propensity_weights is not None:
        if propensity_weights.shape[0] != len(group_ind):
            raise ValueError(
                f"There are {propensity_weights.shape[0]} pre-defined estimates, while "
                f"there are {len(group_ind)} samples."
            )
        if propensity_weights.shape[1] != len(np.unique(group_ind.squeeze())):
            raise ValueError(
                f"There are {propensity_weights.shape[1]} group pre-defined estimates, while "
                f"there are {len(np.unique(group_ind))} unique groups."
            )
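To illustrate the checks above, a hypothetical snippet (not part of the PR) showing which inputs pass and which raise:

group_ind = np.array([0, 1, 0, 1])

# passes: 1d indicator, exactly two groups, weights of shape (n_samples, n_groups)
_preprocess_propensity_data(group_ind, None, np.full((4, 2), 0.5))

# raises ValueError: both a propensity model and pre-defined weights are given
_preprocess_propensity_data(group_ind, LogisticRegression(), np.full((4, 2), 0.5))

# raises RuntimeError: the group indicator must be 1d
_preprocess_propensity_data(group_ind.reshape(-1, 1), None, None)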


def _compute_propensity_scores(
    group_ind: ArrayLike,
    propensity_model: Optional[BaseEstimator] = None,
    propensity_weights: Optional[ArrayLike] = None,
    n_jobs: Optional[int] = None,
    random_state: Optional[int] = None,
    **kwargs,
):
    if propensity_model is None:
        K: ArrayLike = kwargs.get("K")

        # compute a default penalty term if using a kernel matrix
        # C is the inverse of the regularization parameter
        if K.shape[0] == K.shape[1]:
            # default regularization is 1 / (2 * K)
            propensity_penalty_ = _default_regularization(K)
            C = 1 / (2 * propensity_penalty_)
        else:
            # defaults to no regularization
            propensity_penalty_ = 0.0
            C = 1.0

        # default model is logistic regression
        propensity_model_ = LogisticRegression(
            penalty="l2",
            n_jobs=n_jobs,
            warm_start=True,
            solver="lbfgs",
            random_state=random_state,
            C=C,
        )
    else:
        propensity_model_ = propensity_model

    # either use pre-defined propensity weights, or estimate them
    if propensity_weights is None:
        K = kwargs.get("K")
        # fit and then obtain the probabilities of treatment
        # for each sample (i.e. the propensity scores)
        propensity_weights = propensity_model_.fit(K, group_ind.ravel()).predict_proba(K)[:, 1]
    else:
        propensity_weights = propensity_weights[:, 1]
    return propensity_weights
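As a quick sanity sketch of the default branch above (hypothetical data; `_default_regularization` is the internal helper imported at the top of this file, assumed here to return a positive penalty): a square kernel matrix triggers the L2-penalized logistic regression with C = 1 / (2 * lambda).

rng = np.random.default_rng(0)
Z = rng.normal(size=(100, 3))
# a square (n x n) RBF-style kernel matrix, so the penalized branch is taken
K = np.exp(-((Z[:, None, :] - Z[None, :, :]) ** 2).sum(-1))
group_ind = rng.integers(0, 2, size=100)

# fits the default logistic regression on K and returns P(group == 1 | sample)
e_hat = _compute_propensity_scores(group_ind, K=K)
assert e_hat.shape == (100,)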


def compute_null(
    func: Callable,
    e_hat: ArrayLike,
    X: ArrayLike,
    Y: ArrayLike,
    null_reps: int = 1000,
    n_jobs=None,
    seed=None,
    **kwargs,
) -> ArrayLike:
    """Estimate null distribution using propensity weights.

    Parameters
    ----------
    func : Callable
        The function to compute the test statistic.
    e_hat : array-like of shape (n_samples,)
        The predicted propensity score for ``group_ind == 1``.
    X : array-like of shape (n_samples, n_features_x)
        The X (covariates) array.
    Y : array-like of shape (n_samples, n_features_y)
        The Y (outcomes) array.
    null_reps : int, optional
        Number of times to sample the null, by default 1000.
    n_jobs : int, optional
        Number of jobs to run in parallel, by default None.
    seed : int, optional
        Random generator, or random seed, by default None.

    Returns
    -------
    null_dist : array-like of shape (null_reps,)
        The null distribution of test statistics.
    """
    rng = np.random.default_rng(seed)
    n_samps = X.shape[0]

    # compute the test statistic on the conditionally permuted
    # dataset, where each group label is resampled for each sample
    # according to its propensity score
    null_dist = Parallel(n_jobs=n_jobs)(
        [
            delayed(func)(X, Y, group_ind=rng.binomial(1, e_hat, size=n_samps), **kwargs)
            for _ in range(null_reps)
        ]
    )
    return np.asarray(null_dist)

Review discussion (on the Parallel call):

Note that using the parallel jobs here would ignore the previously set random seed. This can be fixed by doing something like here: the idea is to generate random seeds based on the current (seeded) random generator and provide these seeds to the parallel processes. In that way, the generated seeds are deterministic and, thus, so are the parallel processes.

I think this one is fine because it uses the …

Ah I see. Just to confirm, the only random thing here is the …
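For reference, a minimal sketch of the seeding pattern the reviewer describes (hypothetical code, not what this PR merges): draw one child seed per replication from the already-seeded parent generator and hand it to each worker, so the parallel runs stay deterministic. Note that in `compute_null` above, `rng.binomial` is evaluated in the parent process while the list of delayed calls is built, which may be what the author's truncated reply refers to; the pattern below matters once random draws happen inside the workers.

def compute_null_seeded(func, e_hat, X, Y, null_reps=1000, n_jobs=None, seed=None, **kwargs):
    rng = np.random.default_rng(seed)
    n_samps = X.shape[0]
    # deterministic child seeds drawn from the seeded parent generator
    child_seeds = rng.integers(0, 2**32 - 1, size=null_reps)

    def _one_rep(child_seed):
        # each worker gets its own generator, seeded deterministically
        child_rng = np.random.default_rng(child_seed)
        group_ind = child_rng.binomial(1, e_hat, size=n_samps)
        return func(X, Y, group_ind=group_ind, **kwargs)

    null_dist = Parallel(n_jobs=n_jobs)(delayed(_one_rep)(s) for s in child_seeds)
    return np.asarray(null_dist)

A typical (again illustrative) use of the returned null distribution is a permutation-style p-value: pvalue = (1 + np.sum(null_dist >= stat)) / (1 + null_reps).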
Review discussion (on naming the new module):

I like "conditional k-sample test", since this is, I think, quite a good understanding. We should then also rename the module accordingly; it is still "discrepancy".

Okay, perhaps conditional_ksample?

Sounds good!