Source code for pywhy_graphs.functional.multidomain

from typing import Callable, List, Optional, Tuple, Union

import networkx as nx
import numpy as np

from pywhy_graphs.algorithms import (
    add_all_snode_combinations,
    compute_invariant_domains_per_node,
    find_connected_pairs,
)
from pywhy_graphs.classes import AugmentedGraph
from pywhy_graphs.functional.utils import _preprocess_parameter_inputs
from pywhy_graphs.typing import Node

from .additive import generate_edge_functions_for_node
from .linear import generate_noise_for_node


[docs] def make_graph_multidomain( G: nx.DiGraph, n_domains: int = 2, n_nodes_with_s_nodes: Union[int, Tuple[int]] = 1, n_invariances_to_try: int = 1, node_mean_lims: Optional[List[float]] = None, node_std_lims: Optional[List[float]] = None, edge_functions: Optional[List[Callable[[float], float]]] = None, edge_weight_lims: Optional[List[float]] = None, random_state=None, ) -> nx.DiGraph: r"""Convert an existing linear Gaussian DAG to a multi-domain selection diagram model. The multi-domain selection diagram model is a generalization of the regular causal diagram in that S-nodes represent possible changes in mechanisms for the underlying node. In particular, missing S-node edges to a specific node implies invariances in the distribution of that node across domain. For example, if you have a graph :math:`X \rightarrow Y`, then the S-node :math:`S^{1,2} \rightarrow Y` represents the change in the distribution of :math:`Y` given a change in domain. If there is no S-node :math:`S^{1,2} \rightarrow Y`, then the distribution of :math:`Y` is invariant across domain 1 and 2. Parameters ---------- G : NetworkX DiGraph The graph to sample data from. The graph will be modified in-place to get the weights and functions of the edges. n_domains : int The number of domains to split the graph into. By default 2. n_nodes_with_s_nodes : int | tuple[int] The number of nodes to have S-node edges. By default 1. If a tuple, then will sample uniformly a number between the two values. n_invariances_to_try : int The number of invariances to try to set by deleting S-nodes. By default 1. More S-nodes than what is specified by this parameter may be deleted if there are inconsistencies in the S-nodes. See Notes for details. node_mean_lims : Optional[List[float]], optional The lower and upper bounds of the mean of the Gaussian random variable, by default None, which defaults to a mean of 0. node_std_lims : Optional[List[float]], optional The lower and upper bounds of the std of the Gaussian random variable, by default None, which defaults to a std of 1. edge_functions : List[Callable[float]], optional The set of edge functions that take in an iid sample from the parent and computes a transformation (possibly nonlinear), such as ``(lambda x: x**2, lambda x: x)``, by default None, which defaults to the identity function ``lambda x: x``. edge_weight_lims : Optional[List[float]], optional The lower and upper bounds of the edge weight, by default None, which defaults to a weight of 1. random_state : int, optional Random seed, by default None. Returns ------- G : NetworkX DiGraph NetworkX graph with the edge weights and functions set with node attributes set with ``'parent_functions'``, and ``'gaussian_noise_function'``. Moreover the graph attribute ``'linear_gaussian'`` is set to ``True``. See Also -------- make_graph_linear_gaussian : Create a linear Gaussian graph Notes ----- To determine the missing S-node structure, we first construct all possible S-nodes given the number of domains, ``n_domains``. The total number of S-nodes will then be :math:`\binom{n_{domains}}{2}`. Then, we randomly sample a subset of nodes in the graph with S-node edges. The remaining nodes will be missing S-node edges. Then among the nodes with S-node edges, we will randomly sample a subset of S-nodes to be missing edges. At this stage, there may be inconsistency in the S-nodes connected still. For example, if we have the S-nodes :math:`S^{1,2} \rightarrow Y` among 3 domains, then we must have either one of the other S-nodes, or none at all. This is because the missing :math:`S^{2,3} \rightarrow Y` and :math:`S^{1,3} \rightarrow Y` implies that the distribution of :math:`Y` is invariant across domains 1 and 3 and 2 and 3, which also implies they are invariant between domain 1 and 3. To fix this, for each node with S-node connections, we will delete random set of S-nodes and construct a connected component of the S-nodes domains to then remove any remaining S-nodes to keep the graph consistent. """ G = G.copy() if hasattr(G, "get_graphs"): directed_G = G.get_graphs("directed") else: directed_G = G G = AugmentedGraph(incoming_directed_edges=G) if not nx.is_directed_acyclic_graph(directed_G): raise ValueError("The input graph must be a DAG.") if not G.graph.get("linear_gaussian", True): raise ValueError("The input graph must be a linear Gaussian graph.") if not isinstance(n_nodes_with_s_nodes, tuple): n_nodes_with_s_nodes_ = (n_nodes_with_s_nodes, n_nodes_with_s_nodes) rng = np.random.default_rng(random_state) s_node_domains = dict() # choose nodes with S-nodes n_nodes = rng.integers(n_nodes_with_s_nodes_[0], n_nodes_with_s_nodes_[1] + 1) # choose the nodes to have S-nodes at random node_idx = rng.integers(0, G.number_of_nodes(), size=n_nodes) nodes_with_s_nodes = [G.nodes(data=False)[idx] for idx in node_idx] # compute all possible S-nodes given the number of domains G, s_node_domains = add_all_snode_combinations(G, n_domains) all_poss_snodes = set(G.s_nodes) for node in nodes_with_s_nodes: for s_node in all_poss_snodes: # XXX: maybe use directed_G? G.add_edge(s_node, node) s_nodes = G.s_nodes # loop through each node with S-nodes for node in nodes_with_s_nodes: if n_invariances_to_try == 0 or len(s_nodes) == 0: indices = [] remove_s_node = [] else: indices = rng.integers(len(s_nodes), size=n_invariances_to_try) remove_s_node = [s_nodes[idx] for idx in indices] # find all connected pairs tuples = [] for s_node in remove_s_node: source_domain, target_domain = G.nodes(data=True)[s_node]["domain_ids"] tuples.append((source_domain, target_domain)) connected_pairs = find_connected_pairs(tuples, n_domains) invariant_domains = set() for domain_pair in connected_pairs: # remove all the S-nodes that are not in the connected component s_node = s_node_domains[domain_pair] G.remove_edge(s_node, node) # for each removed S-node, there are invariances in the SCM for this node invariant_domains.add(domain_pair[0]) invariant_domains.add(domain_pair[1]) # now set the functional relationships based on the invariances G.nodes()[node]["invariant_domains"] = invariant_domains # now set a random function for each domain that is not invariant generate_multidomain_noise_for_node( G, node, n_domains=n_domains, node_mean_lims=node_mean_lims, node_std_lims=node_std_lims, random_state=random_state, check_s_node_consistency=False, ) # sample edge functions and weights as a function of the parents generate_edge_functions_for_node( G, node=node, edge_weight_lims=edge_weight_lims, edge_functions=edge_functions, random_state=random_state, ) G.graph["linear_gaussian"] = True G.graph["S-nodes"] = s_nodes G.graph["n_domains"] = n_domains return G
def generate_multidomain_noise_for_node( G, node: Node, n_domains: int, node_mean_lims, node_std_lims, check_s_node_consistency: bool = True, random_state=None, ): """Sample a linear function for the exogenous noise of a node with S-nodes. Parameters ---------- G : AugmentedGraph The selection diagram to sample from. node : Node The node to sample exogenous noise for. n_domains : int The number of domains to sample from. node_mean_lims : Optional[List[float]], optional The lower and upper bounds of the mean of the Gaussian random variable, by default None, which defaults to a mean of 0. node_std_lims : Optional[List[float]], optional The lower and upper bounds of the std of the Gaussian random variable, by default None, which defaults to a std of 1. check_s_node_consistency : bool, optional Whether to check that the S-nodes are consistent with the invariant domains, by default True. random_state : int, optional Random seed, by default None. Returns ------- _type_ _description_ """ rng = np.random.default_rng(random_state) if check_s_node_consistency: # compute all possible S-nodes given the number of domains G, _ = add_all_snode_combinations(G, n_domains) all_poss_snodes = set(G.s_nodes) # for each node with S-nodes and compute the invariant domains G = compute_invariant_domains_per_node(G, node, all_poss_snodes, n_domains=n_domains) else: if "invariant_domains" not in G.nodes()[node]: raise ValueError("Must specify invariant domains for node {}.".format(node)) # compute the invariant domains invariant_domains = G.nodes()[node]["invariant_domains"] # now set a random function for each domain that is not invariant domain_noise_params = dict() for idx, domain_id in enumerate(range(1, n_domains + 1)): if domain_id in invariant_domains: continue domain_mean_lims = node_mean_lims[idx] domain_std_lims = node_std_lims[idx] # for domains that are not invariant, we need to set the noise function to a # new random function mean = rng.uniform(low=domain_mean_lims[0], high=domain_mean_lims[1]) std = rng.uniform(low=domain_std_lims[0], high=domain_std_lims[1]) # set the node attribute "functions" to hold the weight and function wrt each parent domain_noise_params[domain_id] = {"mean": mean, "std": std} G.nodes()[node]["domain_gaussian_noise_function"] = domain_noise_params return G def sample_multidomain_lin_functions( G: AugmentedGraph, node_mean_lims: Optional[List[float]] = None, node_std_lims: Optional[List[float]] = None, edge_functions: Optional[List[Callable[[float], float]]] = None, edge_weight_lims: Optional[List[float]] = None, random_state=None, ): """Given a selection diagram, sample linear functions for each node. Parameters ---------- G : AugmentedGraph The selection diagram to sample from. Should have S-nodes to indicate different domains. node_mean_lims : Optional[List[float]], optional The lower and upper bounds of the mean of the Gaussian random variable, by default None, which defaults to a mean of 0. If there is a list, then it should be a list of length ``n_domains`` meaning that each domain has a different mean range. node_std_lims : Optional[List[float]], optional The lower and upper bounds of the std of the Gaussian random variable, by default None, which defaults to a std of 1. If there is a list, then it should be a list of length ``n_domains`` meaning that each domain has a different std range. edge_functions : List[Callable[float]], optional The set of edge functions that take in an iid sample from the parent and computes a transformation (possibly nonlinear), such as ``(lambda x: x**2, lambda x: x)``, by default None, which defaults to the identity function ``lambda x: x``. edge_weight_lims : Optional[List[float]], optional The lower and upper bounds of the edge weight, by default None, which defaults to a weight of 1. random_state : int, optional Random seed, by default None. Returns ------- G : AugmentedGraph The selection diagram with the sampled functions and weights. """ s_node_domains = dict() n_domains = len(G.domains) s_nodes = set(G.s_nodes) if len(s_nodes) == 0: return G ( node_mean_lims_, node_std_lims_, edge_functions_, edge_weight_lims_, ) = _preprocess_parameter_inputs( node_mean_lims, node_std_lims, edge_functions, edge_weight_lims, multi_domain=True, n_domains=n_domains, ) # compute all nodes that have S-node connections nodes_with_s_nodes = [] for s_node in s_nodes: if "domain_ids" not in G.nodes(data=True)[s_node]: raise ValueError("Must specify domain_ids for S-node {}.".format(s_node)) domain_pair = G.nodes(data=True)[s_node]["domain_ids"] s_node_domains[domain_pair] = s_node nodes_with_s_nodes.extend( [node for node in G.successors(s_node) if G.has_edge(s_node, node)] ) # compute all possible S-nodes given the number of domains G, s_node_domains = add_all_snode_combinations(G, n_domains) all_poss_snodes = set(G.s_nodes) for node in G.nodes: if node in nodes_with_s_nodes: # for each node with S-nodes and compute the invariant domains G = compute_invariant_domains_per_node(G, node, all_poss_snodes, n_domains=n_domains) # now set a random function for each domain that is not invariant G = generate_multidomain_noise_for_node( G, node, n_domains=n_domains, node_mean_lims=node_mean_lims_, node_std_lims=node_std_lims_, random_state=random_state, check_s_node_consistency=False, ) else: # sample single-domain noise G = generate_noise_for_node( G, node, node_mean_lims_[0], node_std_lims_[0], random_state=random_state ) # sample edge functions and weights as a function of the parents generate_edge_functions_for_node( G, node=node, edge_weight_lims=edge_weight_lims_, edge_functions=edge_functions_, random_state=random_state, ) G.graph["linear_gaussian"] = True G.graph["n_domains"] = n_domains return G