Source code for dowhy.gcm.unit_change

"""This module provides the APIs for attributing the change in the output value of a deterministic mechanism for a statistical unit.
"""

from abc import abstractmethod
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.linear_model._base import LinearModel
from sklearn.utils.validation import check_is_fitted

from dowhy.gcm.fcms import PredictionModel
from dowhy.gcm.ml.regression import SklearnRegressionModel
from dowhy.gcm.shapley import ShapleyConfig, estimate_shapley_values


[docs]class LinearPredictionModel:
    @property
    @abstractmethod
    def coefficients(self) -> np.ndarray:
        pass


[docs]class SklearnLinearRegressionModel(SklearnRegressionModel, LinearPredictionModel):
    def __init__(self, sklearn_mdl: LinearModel) -> None:
        super(SklearnLinearRegressionModel, self).__init__(sklearn_mdl)

    @property
    def coefficients(self) -> np.ndarray:
        check_is_fitted(self.sklearn_model)
        return self.sklearn_model.coef_


[docs]def unit_change(
    background_df: pd.DataFrame,
    foreground_df: pd.DataFrame,
    input_column_names: List[str],
    background_mechanism: PredictionModel,
    foreground_mechanism: Optional[PredictionModel] = None,
    shapley_config: Optional[ShapleyConfig] = None,
) -> pd.DataFrame:
    """
    This function attributes the change in the output value of a deterministic mechanism for a statistical unit to each input and optionally for the mechanism if `foreground_mechanism` is provided.
    The technical method is described in the following research paper:
    Kailash Budhathoki, George Michailidis, Dominik Janzing. *Explaining the root causes of unit-level changes*. arXiv, 2022.

    :param background_df: The background dataset.
    :param foreground_df: The foreground dataset.
    :param input_column_names: The names of the input columns.
    :param background_mechanism: The background mechanism. If the mechanism does not change, then this mechanism is used for attribution.
    :param foreground_mechanism: The foreground mechanism. If provided, the method also attributes the output change to the change in the mechanism.
    :param shapley_config: The configuration for calculating Shapley values.
    :return: A dataframe containing the contributions of each input and optionally the mechanism to the change in the output values of the deterministic mechanism(s) for given inputs.
    """
    if foreground_mechanism:
        if isinstance(background_mechanism, LinearPredictionModel):
            return unit_change_linear(
                background_mechanism, background_df, foreground_mechanism, foreground_df, input_column_names
            )
        else:
            return unit_change_nonlinear(
                background_mechanism,
                background_df,
                foreground_mechanism,
                foreground_df,
                input_column_names,
                shapley_config,
            )

    if isinstance(background_mechanism, LinearPredictionModel):
        return unit_change_linear_input_only(background_mechanism, background_df, foreground_df, input_column_names)
    else:
        return unit_change_nonlinear_input_only(
            background_mechanism, background_df, foreground_df, input_column_names, shapley_config
        )


[docs]def unit_change_nonlinear(
    background_mechanism: PredictionModel,
    background_df: pd.DataFrame,
    foreground_mechanism: PredictionModel,
    foreground_df: pd.DataFrame,
    input_column_names: List[str],
    shapley_config: Optional[ShapleyConfig] = None,
) -> pd.DataFrame:
    """
    Calculates the contributions of mechanism and each input to the change in the output values of a non-linear deterministic mechanism.
    The technical method is described in the following research paper:
    Kailash Budhathoki, George Michailidis, Dominik Janzing. *Explaining the root causes of unit-level changes*. arXiv, 2022.

    :param background_mechanism: The background mechanism.
    :param background_df: The background data.
    :param foreground_mechanism: The foreground mechanism.
    :param foreground_df: The foreground data.
    :param input_column_names: The names of the input (features) columns in both dataframes.
    :param shapley_config: The configuration for calculating Shapley values.
    :return: A pandas dataframe with attributions to each cause for the change in each output row of provided dataframes.
    """
    _check_if_input_columns_exist(background_df, foreground_df, input_column_names)

    def payoff(player_indicator: List[int]) -> np.ndarray:
        """The last cell in the binary vector represents the player 'mechanism'."""
        input_arrays = []
        for i, is_player_active in enumerate(player_indicator[:-1]):
            selected_df = foreground_df if is_player_active else background_df
            input_arrays.append(selected_df[input_column_names[i]].to_numpy())
        mechanism = foreground_mechanism if player_indicator[-1] else background_mechanism
        return mechanism.predict(np.column_stack(input_arrays)).flatten()

    contributions = estimate_shapley_values(payoff, len(input_column_names) + 1, shapley_config)
    root_causes = input_column_names + ["f"]
    return pd.DataFrame(contributions, columns=root_causes)


[docs]def unit_change_linear(
    background_mechanism: LinearPredictionModel,
    background_df: pd.DataFrame,
    foreground_mechanism: LinearPredictionModel,
    foreground_df: pd.DataFrame,
    input_column_names: List[str],
) -> pd.DataFrame:
    """
    Calculates the contributions of mechanism and each input to the change in the output values of a linear deterministic mechanism.

    :param background_mechanism: The linear background mechanism.
    :param background_df: The background data.
    :param foreground_mechanism: The linear foreground mechanism.
    :param foreground_df: The foreground data.
    :param input_column_names: The names of the input columns in both dataframes.
    :return: A pandas dataframe with attributions to each cause for the change in each output row of provided dataframes.
    """
    _check_if_input_columns_exist(background_df, foreground_df, input_column_names)

    coeffs_total = background_mechanism.coefficients + foreground_mechanism.coefficients  # p x 1
    coeffs_diff = foreground_mechanism.coefficients - background_mechanism.coefficients  # p x 1

    input_total = foreground_df[input_column_names].to_numpy() + background_df[input_column_names].to_numpy()  # n x p
    input_diff = foreground_df[input_column_names].to_numpy() - background_df[input_column_names].to_numpy()  # n x p

    contribution_input = 0.5 * np.einsum("ij,ki->ki", coeffs_total.reshape(-1, 1), input_diff)
    contribution_mechanism = 0.5 * np.einsum("ij,ki->k", coeffs_diff.reshape(-1, 1), input_total)
    contribution_df = pd.DataFrame(contribution_input, columns=input_column_names)
    contribution_df["f"] = contribution_mechanism  # TODO: Handle the case where 'f' is an input column name
    return contribution_df


[docs]def unit_change_nonlinear_input_only(
    mechanism: PredictionModel,
    background_df: pd.DataFrame,
    foreground_df: pd.DataFrame,
    input_column_names: List[str],
    shapley_config: Optional[ShapleyConfig] = None,
) -> pd.DataFrame:
    """
    Calculates the contributions of each input to the change in the output values of a non-linear deterministic mechanism.
    The technical method is a modification of the attribution method described in the following research paper, without mechanism as a player:
    Kailash Budhathoki, George Michailidis, Dominik Janzing. *Explaining the root causes of unit-level changes*. arXiv, 2022.

    :param mechanism: The mechanism.
    :param background_df: The background data.
    :param foreground_df: The foreground data.
    :param input_column_names: The names of the input (features) columns in both dataframes.
    :param shapley_config: The configuration for calculating Shapley values.
    :return: A pandas dataframe with attributions to each cause for the change in each output row of provided dataframes.
    """
    _check_if_input_columns_exist(background_df, foreground_df, input_column_names)

    def payoff(player_indicator: List[int]) -> np.ndarray:
        input_arrays = []
        for i, is_player_active in enumerate(player_indicator):
            selected_df = foreground_df if is_player_active else background_df
            input_arrays.append(selected_df[input_column_names[i]].to_numpy())
        return mechanism.predict(np.column_stack(input_arrays)).flatten()

    contributions = estimate_shapley_values(payoff, len(input_column_names), shapley_config)
    return pd.DataFrame(contributions, columns=input_column_names)


[docs]def unit_change_linear_input_only(
    mechanism: LinearPredictionModel,
    background_df: pd.DataFrame,
    foreground_df: pd.DataFrame,
    input_column_names: List[str],
) -> pd.DataFrame:
    """
    Calculates the contributions of each input to the change in the output values of a linear deterministic mechanism.

    :param mechanism: The linear mechanism.
    :param background_df: The background data.
    :param foreground_df: The foreground data.
    :param input_column_names: The names of the input (features) columns in both dataframes.
    :return: A pandas dataframe with attributions to each cause for the change in each output row of provided dataframes.
    """
    _check_if_input_columns_exist(background_df, foreground_df, input_column_names)

    input_diff = foreground_df[input_column_names].to_numpy() - background_df[input_column_names].to_numpy()  # n x p
    contribution_input = np.einsum("ij,ki->ki", mechanism.coefficients.reshape(-1, 1), input_diff)
    return pd.DataFrame(contribution_input, columns=input_column_names)


def _check_if_input_columns_exist(
    background_df: pd.DataFrame, foreground_df: pd.DataFrame, input_column_names: List[str]
) -> None:
    if not len(set(background_df.columns).intersection(input_column_names)) == len(input_column_names) or not len(
        set(foreground_df.columns).intersection(input_column_names)
    ) == len(input_column_names):
        raise ValueError("Input column names not found in either the background or the foreground data.")