"""This module provides functionality to answer what-if questions."""fromtypingimportAny,Callable,Dict,Iterable,List,Optional,Unionimportnetworkxasnximportnumpyasnpimportpandasaspdfromdowhy.gcm._noiseimportcompute_noise_from_datafromdowhy.gcm.causal_mechanismsimportClassifierFCMfromdowhy.gcm.causal_modelsimport(InvertibleStructuralCausalModel,ProbabilisticCausalModel,StructuralCausalModel,validate_causal_dag,)fromdowhy.gcm.fitting_samplingimportdraw_samplesfromdowhy.graphimport(DirectedGraph,get_ordered_predecessors,is_root_node,node_connected_subgraph_view,validate_node_in_graph,)
def interventional_samples(
    causal_model: ProbabilisticCausalModel,
    interventions: Dict[Any, Callable[[np.ndarray], Union[float, np.ndarray]]],
    observed_data: Optional[pd.DataFrame] = None,
    num_samples_to_draw: Optional[int] = None,
) -> pd.DataFrame:
    """Performs intervention on nodes in the causal graph.

    :param causal_model: The probabilistic causal model we perform this intervention on.
    :param interventions: Dictionary containing the interventions we want to perform, keyed by node name. An
                          intervention is a function that takes a value as input and returns another value. For
                          example, `{'X': lambda x: 2}` mimics the atomic intervention *do(X:=2)*. A soft intervention
                          can be formulated as `{'X': lambda x: 0.2 * x}`.
    :param observed_data: Optionally, data on which to perform interventions. If None are given, data is generated
                          based on the generative models.
    :param num_samples_to_draw: Sample size to draw from the interventional distribution.
    :return: Samples from the interventional distribution.
    """
    validate_causal_dag(causal_model.graph)
    for node in interventions:
        validate_node_in_graph(causal_model.graph, node)

    # Exactly one data source must be provided: either factual observations or a sample size to simulate.
    # NOTE: The messages previously referenced a nonexistent parameter name ('observed_samples'); they now
    # name the actual parameter 'observed_data'.
    if observed_data is None and num_samples_to_draw is None:
        raise ValueError("Either observed_data or num_samples_to_draw need to be set!")
    if observed_data is not None and num_samples_to_draw is not None:
        raise ValueError("Either observed_data or num_samples_to_draw need to be set, not both!")

    if num_samples_to_draw is not None:
        # No factual data given; simulate it from the generative models instead.
        observed_data = draw_samples(causal_model, num_samples_to_draw)

    return _interventional_samples(causal_model, observed_data, interventions)
def _interventional_samples(
    pcm: ProbabilisticCausalModel,
    observed_data: pd.DataFrame,
    interventions: Dict[Any, Callable[[np.ndarray], np.ndarray]],
) -> pd.DataFrame:
    """Applies the given interventions to (a copy of) the observed data.

    :param pcm: Causal model providing the graph and the per-node causal mechanisms.
    :param observed_data: Factual samples; columns for unaffected nodes are returned unchanged.
    :param interventions: Per-node intervention functions applied element-wise.
    :return: A new DataFrame with the interventional samples.
    """
    samples = observed_data.copy()
    # Set membership is O(1); this is queried once per node of the graph below.
    affected_nodes = set(_get_nodes_affected_by_intervention(pcm.graph, interventions.keys()))

    # Simulating interventions by propagating the effects through the graph. For this, we iterate over the nodes
    # based on their topological order.
    for node in nx.topological_sort(pcm.graph):
        if node not in affected_nodes:
            continue

        if is_root_node(pcm.graph, node):
            # Root nodes keep their observed values (an intervention may still overwrite them below).
            node_data = samples[node].to_numpy()
        else:
            # Re-sample the node from its mechanism using the (possibly already intervened) parent values.
            node_data = pcm.causal_mechanism(node).draw_samples(_parent_samples_of(node, pcm, samples))

        # After drawing samples of the node based on the data generation process, we apply the corresponding
        # intervention. The inputs of downstream nodes are therefore based on the outcome of the intervention in
        # this node.
        samples[node] = _evaluate_intervention(node, interventions, node_data.reshape(-1))

    return samples


def _get_nodes_affected_by_intervention(causal_graph: DirectedGraph, target_nodes: Iterable[Any]) -> List[Any]:
    """Returns the target nodes and all of their descendants, sorted in topological order.

    :param causal_graph: The causal graph to traverse.
    :param target_nodes: Nodes on which interventions are performed.
    :return: Topologically ordered list of all nodes whose values can change under the interventions.
    """
    # Materialize first: target_nodes is only required to be an Iterable and could otherwise be consumed.
    target_nodes = list(target_nodes)

    # Collect the targets and everything downstream of them with one traversal per target, instead of the
    # previous per-(node, target) ancestor query, which was O(|V| * |targets|) graph traversals.
    affected = set(target_nodes)
    for target_node in target_nodes:
        affected.update(nx.descendants(causal_graph, source=target_node))

    return [node for node in nx.topological_sort(causal_graph) if node in affected]
def counterfactual_samples(
    causal_model: Union[StructuralCausalModel, InvertibleStructuralCausalModel],
    interventions: Dict[Any, Callable[[np.ndarray], Union[float, np.ndarray]]],
    observed_data: Optional[pd.DataFrame] = None,
    noise_data: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Estimates counterfactual data for observed data if we were to perform specified interventions.

    This function implements the 3-step process for computing counterfactuals by Pearl
    (see https://ftp.cs.ucla.edu/pub/stat_ser/r485.pdf).

    :param causal_model: The (invertible) structural causal model we perform this intervention on. If noise_data is
                         None and observed_data is provided, this must be an invertible structural model, otherwise,
                         this can be either a structural causal model or an invertible one.
    :param interventions: Dictionary containing the interventions we want to perform keyed by node name. An
                          intervention is a function that takes a value as input and returns another value. For
                          example, `{'X': lambda x: 2}` mimics the atomic intervention *do(X:=2)*.
    :param observed_data: Factual data that we observe for the nodes in the causal graph.
    :param noise_data: Data of noise terms corresponding to nodes in the causal graph. If not provided, these have to
                       be estimated from observed data. Then we require causal models of nodes to be invertible.
    :return: Estimated counterfactual data.
    """
    for node in interventions:
        validate_node_in_graph(causal_model.graph, node)
    validate_causal_dag(causal_model.graph)

    # Exactly one of the two data inputs must be supplied.
    if observed_data is None and noise_data is None:
        raise ValueError("Either observed_data or noise_data need to be given!")
    if observed_data is not None and noise_data is not None:
        raise ValueError("Either observed_data or noise_data can be given, not both!")

    if noise_data is None and observed_data is not None:
        if not isinstance(causal_model, InvertibleStructuralCausalModel):
            raise ValueError(
                "Since no noise_data is given, this has to be estimated from the given "
                "observed_data. This can only be done with InvertibleStructuralCausalModel."
            )
        # Abduction: For invertible SCMs, we recover exact noise values from data.
        noise_data = compute_noise_from_data(causal_model, observed_data)

    # Action + Prediction: Propagate the intervention downstream using recovered noise values.
    return _counterfactual_samples(causal_model, interventions, noise_data)
def_counterfactual_samples(scm:StructuralCausalModel,interventions:Dict[Any,Callable[[np.ndarray],Union[float,np.ndarray]]],noise_data:pd.DataFrame,)->pd.DataFrame:topologically_sorted_nodes=list(nx.topological_sort(scm.graph))samples=pd.DataFrame(np.empty((noise_data.shape[0],len(topologically_sorted_nodes))),columns=topologically_sorted_nodes)fornodeintopologically_sorted_nodes:ifis_root_node(scm.graph,node):node_data=noise_data[node].to_numpy()else:node_data=scm.causal_mechanism(node).evaluate(_parent_samples_of(node,scm,samples),noise_data[node].to_numpy())samples[node]=_evaluate_intervention(node,interventions,node_data.reshape(-1))returnsamplesdef_evaluate_intervention(node:Any,interventions:Dict[Any,Callable[[np.ndarray],np.ndarray]],pre_intervention_data:np.ndarray)->np.ndarray:# Check if we need to apply an intervention on the given node.ifnodeininterventions:# Apply intervention function to the data of the node.post_intervention_data=np.array(list(map(interventions[node],pre_intervention_data)))# Check if the intervention function changes the shape of the data.ifpre_intervention_data.shape!=post_intervention_data.shape:raiseRuntimeError("Dimension of data corresponding to the node `%s` after intervention is different than before ""intervention."%node)returnpost_intervention_dataelse:returnpre_intervention_data
def average_causal_effect(
    causal_model: ProbabilisticCausalModel,
    target_node: Any,
    interventions_alternative: Dict[Any, Callable[[np.ndarray], Union[float, np.ndarray]]],
    interventions_reference: Dict[Any, Callable[[np.ndarray], Union[float, np.ndarray]]],
    observed_data: Optional[pd.DataFrame] = None,
    num_samples_to_draw: Optional[int] = None,
) -> float:
    """Estimates the average causal effect (ACE) on the target of two different sets of interventions.

    The interventions can be specified through the parameters `interventions_alternative` and
    `interventions_reference`. For example, if the alternative intervention is do(T := 1) and the reference
    intervention is do(T := 0), then the average causal effect is given by
    ACE = E[Y | do(T := 1)] - E[Y | do(T := 0)]:

    >>> average_causal_effect(causal_model, 'Y', {'T': lambda _ : 1}, {'T': lambda _ : 0})

    We can also specify more complex interventions on multiple nodes:

    >>> average_causal_effect(causal_model,
    >>>                       'Y',
    >>>                       {'T': lambda _ : 1, 'X0': lambda x : x + 1},
    >>>                       {'T': lambda _ : 0, 'X0': lambda x : x * 2})

    In the above, we would estimate
    ACE = E[Y | do(T := 1), do(X0 := X0 + 1)] - E[Y | do(T := 0), do(X0 := X0 * 2)].

    Note: The target node can be a continuous real-valued variable or a categorical variable with at most two
    classes (i.e. binary).

    :param causal_model: The probabilistic causal model we perform this intervention on.
    :param target_node: Target node for which the ACE is estimated.
    :param interventions_alternative: Dictionary defining the interventions for the alternative values.
    :param interventions_reference: Dictionary defining the interventions for the reference values.
    :param observed_data: Factual data that we observe for the nodes in the causal graph. By default, new data is
                          sampled using the causal model. If observational data is available, providing them might
                          improve the accuracy by mitigating issues due to a misspecified graph and/or causal models.
    :param num_samples_to_draw: Number of samples drawn from the causal model for estimating ACE if no observed data
                                is given.
    :return: The estimated average causal effect (ACE).
    """
    # For estimating the effect, we only need to consider the nodes that have a directed path to the target node,
    # i.e. all ancestors of the target.
    causal_model = ProbabilisticCausalModel(node_connected_subgraph_view(causal_model.graph, target_node))

    validate_causal_dag(causal_model.graph)
    for node in interventions_alternative:
        validate_node_in_graph(causal_model.graph, node)
    for node in interventions_reference:
        validate_node_in_graph(causal_model.graph, node)

    # Exactly one data source must be provided.
    # NOTE: The messages previously referenced a nonexistent parameter name ('observed_samples'); they now
    # name the actual parameter 'observed_data'.
    if observed_data is None and num_samples_to_draw is None:
        raise ValueError("Either observed_data or num_samples_to_draw need to be set!")
    if observed_data is not None and num_samples_to_draw is not None:
        raise ValueError("Either observed_data or num_samples_to_draw need to be set, not both!")

    if num_samples_to_draw is not None:
        observed_data = draw_samples(causal_model, num_samples_to_draw)

    # Both interventional distributions are simulated on the same (factual or drawn) data.
    samples_from_target_alt = _interventional_samples(causal_model, observed_data, interventions_alternative)[
        target_node
    ].to_numpy()
    samples_from_target_ref = _interventional_samples(causal_model, observed_data, interventions_reference)[
        target_node
    ].to_numpy()

    target_causal_model = causal_model.causal_mechanism(target_node)
    if isinstance(target_causal_model, ClassifierFCM):
        # The target node can be a continuous real-valued variable or a categorical variable with at most two
        # classes (i.e. binary).
        if observed_data[target_node].nunique() > 2:
            raise ValueError("Cannot estimate average treatment effect of categorical data with more than 2 categories!")

        # Map the two class labels onto 0/1 so that the difference of means below is well-defined.
        class_names = target_causal_model.get_class_names(np.array([0, 1]))
        samples_from_target_alt[samples_from_target_alt == class_names[0]] = 0
        samples_from_target_alt[samples_from_target_alt == class_names[1]] = 1
        samples_from_target_ref[samples_from_target_ref == class_names[0]] = 0
        samples_from_target_ref[samples_from_target_ref == class_names[1]] = 1

    return np.mean(samples_from_target_alt) - np.mean(samples_from_target_ref)