Source code for dodiscover.ci.clf_test

from typing import Callable, Optional, Set, Tuple, Union

import numpy as np
import pandas as pd
import scipy.special
import sklearn
import sklearn.metrics
from sklearn.utils import check_random_state

from dodiscover.typing import Column

from .base import BaseConditionalIndependenceTest, ClassifierCIMixin
from .typing import Classifier


class ClassifierCITest(BaseConditionalIndependenceTest, ClassifierCIMixin):
    def __init__(
        self,
        clf: Classifier,
        metric: Callable = sklearn.metrics.accuracy_score,
        bootstrap: bool = False,
        n_iter: int = 20,
        correct_bias: bool = True,
        threshold: float = 0.03,
        test_size: Union[int, float] = 0.3,
        random_state: Optional[int] = None,
    ) -> None:
        """Classifier conditional independence test (CCIT).

        Implements the classifier conditional independence test in
        :footcite:`Sen2017model`. If no Z variable is passed in, then this
        runs a standard (unconditional) independence test using the
        classifier :footcite:`Lopez2016revisiting`.

        Parameters
        ----------
        clf : instance of sklearn.base.BaseEstimator or pytorch model
            An instance of a classification model. If a PyTorch model is
            used, the user must wrap it with ``skorch`` to expose an
            sklearn-compliant API.
        metric : Callable of sklearn metric
            A metric function to measure the performance of the
            classification model.
        bootstrap : bool, optional
            Whether or not to repeat runs on bootstrapped samples, by
            default False.
        n_iter : int, optional
            If ``bootstrap`` is True, the number of bootstrap iterations to
            run, by default 20.
        correct_bias : bool, optional
            Whether to correct the bias of the test statistic by comparing
            against a classifier trained without the X variables (CCITv2 in
            :footcite:`Sen2017model`), by default True.
        threshold : float
            The threshold to apply to reject the null hypothesis. See Notes.
        test_size : Union[int, float], optional
            The size of the test set, by default 0.3. If less than 1, it is
            interpreted as a fraction of ``n_samples``.
        random_state : int, optional
            The random seed, which is passed to
            ``sklearn.utils.check_random_state``.

        Notes
        -----
        A general problem with machine-learning, prediction-based approaches
        is that they only detect dependencies that affect the conditional
        expectation. For instance, a dependency through the variance alone
        would not be captured by the CCIT:

        .. code-block:: python

            import numpy as np
            import pandas as pd
            from sklearn.ensemble import RandomForestClassifier

            from dowhy.gcm import kernel_based, regression_based
            from dodiscover.ci import ClassifierCITest

            X = np.random.normal(0, 1, 1000)
            Y = np.array([np.random.normal(0, abs(x)) for x in X])
            Z = np.random.normal(0, 1, 1000)

            print("Correct result:", kernel_based(X, Y, Z))
            print("Wrong result:", regression_based(X, Y, Z))

            clf = RandomForestClassifier()
            ci_estimator = ClassifierCITest(clf)
            df = pd.DataFrame({"x": X, "y": Y, "z": Z})
            _, pvalue = ci_estimator.test(df, {"x"}, {"y"}, {"z"})
            print("Wrong result:", pvalue)

        References
        ----------
        .. footbibliography::
        """
        self.clf = clf
        self.metric = metric
        self.correct_bias = correct_bias
        self.bootstrap = bootstrap
        self.n_iter = n_iter
        self.threshold = threshold
        self.test_size = test_size

        # set the internal random state generator
        self.random_state = check_random_state(random_state)

    def _compute_test_statistic(
        self,
        df: pd.DataFrame,
        x_vars: Set[Column],
        y_vars: Set[Column],
        z_covariates: Optional[Set[Column]] = None,
    ) -> Tuple[float, float]:
        # generate training and testing data
        X_train, Y_train, X_test, Y_test = self.generate_train_test_data(
            df, x_vars, y_vars, z_covariates
        )
        Y_train = Y_train.ravel()
        Y_test = Y_test.ravel()

        # fit the classifier on training data
        self.clf.fit(X_train, Y_train)

        # evaluate on test data and compute metric
        Y_pred = self.clf.predict(X_test)
        metric = self.metric(Y_test, Y_pred)

        binary_pvalue = 1.0
        if not self.correct_bias:
            if metric < 0.5 - self.threshold:
                binary_pvalue = 0.0
            metric = metric - 0.5
        else:
            # now run CCITv2 in the paper and remove the X data
            n_dims_x = len(x_vars)
            X_train = X_train[:, n_dims_x:]
            X_test = X_test[:, n_dims_x:]

            # fit the classifier on training data
            self.clf.fit(X_train, Y_train)

            # evaluate on test data and compute metric
            Y_pred = self.clf.predict(X_test)
            biased_metric = self.metric(Y_test, Y_pred)

            if metric < biased_metric - self.threshold:
                binary_pvalue = 0.0
            metric = metric - biased_metric
        return metric, binary_pvalue
    def test(
        self,
        df: pd.DataFrame,
        x_vars: Set[Column],
        y_vars: Set[Column],
        z_covariates: Optional[Set[Column]] = None,
    ) -> Tuple[float, float]:
        self._check_test_input(df, x_vars, y_vars, z_covariates)
        n_samples = df.shape[0]

        if self.bootstrap:
            boot_metrics = []
            boot_pvalues = []

            # run the test statistic on a bootstrapped dataset ``n_iter`` times
            for _ in range(self.n_iter):
                sampled_df = df.sample(
                    n=n_samples, axis=0, replace=True, random_state=self.random_state
                )
                metric, pvalue = self._compute_test_statistic(
                    sampled_df, x_vars, y_vars, z_covariates
                )
                boot_metrics.append(metric)
                boot_pvalues.append(pvalue)

            # aggregate the bootstrapped statistics and their standard error
            metric = np.mean(boot_metrics)
            std_metric = np.std(boot_metrics) + 1e-8
            sigma = std_metric / np.sqrt(self.n_iter)
        else:
            metric, pvalue = self._compute_test_statistic(df, x_vars, y_vars, z_covariates)
            sigma = 1 / np.sqrt(n_samples)

        # normal distribution CDF: \Phi(x; \mu, \sigma) = (1 + erf((x - \mu) / (\sigma \sqrt{2}))) / 2.
        # ``metric`` estimates x - \mu under the null (0.5, or the bias-corrected
        # baseline) and ``sigma`` is the estimated standard deviation, so the
        # one-sided p-value is the upper-tail probability 1 - \Phi(metric / sigma).
        pvalue = 0.5 * scipy.special.erfc(metric / (np.sqrt(2) * sigma))

        self.stat_ = metric
        self.stat_sigma_ = sigma
        self.pvalue_ = pvalue
        return metric, pvalue
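

if __name__ == "__main__":
    # Minimal usage sketch under stated assumptions: the synthetic data below
    # is constructed so that x and y are conditionally independent given z, so
    # a large p-value is expected. RandomForestClassifier is an arbitrary,
    # illustrative choice; any sklearn-compatible classifier should work.
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.default_rng(0)
    z = rng.normal(size=1000)
    x = z + rng.normal(size=1000)  # x depends on z only
    y = z + rng.normal(size=1000)  # y depends on z only
    df = pd.DataFrame({"x": x, "y": y, "z": z})

    ci_estimator = ClassifierCITest(RandomForestClassifier(random_state=0))
    stat, pvalue = ci_estimator.test(df, {"x"}, {"y"}, {"z"})
    # the reported p-value is the upper-tail normal probability
    # 0.5 * erfc(stat / (sqrt(2) * sigma)); here it should not reject the null
    print(f"stat={stat:.3f}, pvalue={pvalue:.3f}")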