Source code for humancompatible.explain.glance.counterfactual_tree.counterfactual_tree

from typing import Union, Any, List, Optional, Dict, Tuple, Callable
from ..base import GlobalCounterfactualMethod, LocalCounterfactualMethod
from ..iterative_merges.iterative_merges import C_GLANCE, _select_action_max_eff
import pandas as pd
from ..utils.metadata_requests import _decide_local_cf_method
from ..utils.centroid import centroid_pandas
from ..utils.action import extract_actions_pandas, apply_action_pandas
from sklearn.inspection import permutation_importance
from ..iterative_merges.iterative_merges import cumulative
from ..counterfactual_costs import build_dist_func_dataframe
from .node import Node
import numpy as np
from tqdm import tqdm


class T_GLANCE:
    """
    Generate counterfactual explanations using a decision-tree-like structure.

    The tree is grown by repeatedly splitting a group of instances on one of
    the ``split_features``; at every node a set of counterfactual actions is
    computed together with its effectiveness and cost. Both local and global
    counterfactual generation methods are supported.

    Attributes
    ----------
    model : Any
        The predictive model used for generating counterfactuals.
    split_features : Union[List, int]
        Features to split the tree on. Either a list of feature names or an
        integer giving the number of top features to select by permutation
        importance. After ``fit`` it is always a list of feature names.
    partition_counterfactuals : int
        The number of partitions to create for counterfactuals.
    child_count : int
        The number of children each node can have (``-1`` creates one child
        per distinct feature value).
    global_method : Union[GlobalCounterfactualMethod, str]
        The global counterfactual generation method to use.
    local_method : Union[LocalCounterfactualMethod, str]
        The local counterfactual generation method to use.
    num_local_counterfactuals : int
        The number of local counterfactuals to generate.
    node : Node
        The root node of the counterfactual tree (set by ``partition_group``).
    node_instances : pd.DataFrame
        The instances that were used to build the counterfactual tree
        (set by ``partition_group``).
    dist_func_dataframe : Callable
        A distance function for calculating distances between instances
        (set by ``fit``).

    Methods
    -------
    fit(X, y, train_dataset=None, feat_to_vary="all", random_seed=13,
        numeric_features_names=None, categorical_features_names=None)
        Fits the counterfactual tree to the provided data.
    partition_group(instances)
        Partitions the group of instances into a tree structure based on the
        split features.
    cumulative_leaf_actions()
        Computes the total effectiveness and cost of the actions stored in the
        leaf nodes of the tree.
    """

    def __init__(
        self,
        model: Any,
        split_features: Optional[Union[List, int]] = None,
        partition_counterfactuals: Optional[int] = None,
        child_count: int = 2,
        global_method: Optional[Union["GlobalCounterfactualMethod", str]] = None,
        local_method: Optional[Union["LocalCounterfactualMethod", str]] = None,
        num_local_counterfactuals: int = 100,
    ):
        """
        Store the configuration of the counterfactual tree.

        Parameters
        ----------
        model : Any
            The predictive model to use for generating counterfactuals.
        split_features : Union[List, int], optional
            Features to split the tree on. If None, the top 2 features by
            permutation importance are selected during ``fit``. If an integer,
            the top N features are selected during ``fit``.
        partition_counterfactuals : int, optional
            Number of partitions for counterfactual generation. If None, a
            default is chosen during ``fit`` (3 for the global methods, 1 for
            the local method).
        child_count : int, optional
            Number of children for each node in the tree. Default is 2.
        global_method : Union[GlobalCounterfactualMethod, str], optional
            The global counterfactual generation method to use.
        local_method : Union[LocalCounterfactualMethod, str], optional
            The local counterfactual generation method to use.
        num_local_counterfactuals : int, optional
            Number of local counterfactuals to generate. Default is 100.
        """
        self.model = model
        self.split_features = split_features
        self.partition_counterfactuals = partition_counterfactuals
        self.child_count = child_count
        self.global_method = global_method
        self.local_method = local_method
        self.num_local_counterfactuals = num_local_counterfactuals

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        train_dataset: Optional[pd.DataFrame] = None,
        feat_to_vary: Optional[Union[List[str], str]] = "all",
        random_seed: int = 13,
        numeric_features_names: Optional[List[str]] = None,
        categorical_features_names: Optional[List[str]] = None,
    ):
        """
        Fit the counterfactual tree to the provided data.

        Resolves the split features, precomputes the split values of every
        split feature, builds the distance function and sets up the
        counterfactual generators.

        Parameters
        ----------
        X : pd.DataFrame
            Features of the dataset.
        y : pd.Series
            Target variable.
        train_dataset : Optional[pd.DataFrame], optional
            The training dataset to use for local counterfactual generation
            methods.
        feat_to_vary : Optional[Union[List[str], str]], optional
            Features to vary in counterfactual generation. Default is "all".
        random_seed : int, optional
            Random seed for reproducibility. Default is 13.
        numeric_features_names : Optional[List[str]], optional
            List of numeric feature names. If None, they are inferred from X.
        categorical_features_names : Optional[List[str]], optional
            List of categorical feature names. If None, they are inferred
            from X.

        Raises
        ------
        ValueError
            If neither a global nor a local method was configured and
            ``train_dataset`` is None.
        """
        if self.split_features is None or isinstance(self.split_features, int):
            # No explicit feature list: rank features by permutation
            # importance and keep the top N (N defaults to 2).
            top_n = 2 if self.split_features is None else self.split_features
            perm_importance = permutation_importance(
                self.model, X, y, n_repeats=30, random_state=42
            )
            mean_importance = perm_importance.importances_mean
            top_indices = mean_importance.argsort()[-top_n:][::-1]
            self.split_features = list(X.columns[top_indices])

        self.split_values = _get_split_values(
            X, self.split_features, self.child_count
        )

        # Infer whichever of the two feature-name lists was not supplied.
        if numeric_features_names is None:
            if categorical_features_names is None:
                numeric_features_names = X.select_dtypes(
                    include=["number"]
                ).columns.tolist()
            else:
                numeric_features_names = X.columns.difference(
                    categorical_features_names
                ).tolist()
        if categorical_features_names is None:
            categorical_features_names = X.columns.difference(
                numeric_features_names
            ).tolist()

        self.numerical_features_names = numeric_features_names
        self.categorical_features_names = categorical_features_names
        self.X = X
        self.y = y
        self.train_dataset = train_dataset
        self.random_seed = random_seed
        self.feat_to_vary = feat_to_vary
        self.dist_func_dataframe = build_dist_func_dataframe(
            self.X, self.numerical_features_names, self.categorical_features_names
        )

        # The local generator is always built: it is the primary generator in
        # "Local" mode and the fallback for small groups in "Global-IM" mode.
        backup = "Dice" if self.local_method is None else self.local_method
        self.cf_generator_backup = _decide_local_cf_method(
            method=backup,
            model=self.model,
            train_dataset=self.train_dataset,
            numeric_features_names=self.numerical_features_names,
            categorical_features_names=self.categorical_features_names,
            feat_to_vary=self.feat_to_vary,
            random_seed=random_seed,
        )

        if self.global_method is None and self.local_method is None:
            self.generation_method = "Global-IM"
            if self.train_dataset is None:
                raise ValueError(
                    "You need to pass train_dataset for Dice if you want default Iterative merges."
                )
            if self.partition_counterfactuals is None:
                self.partition_counterfactuals = 3
            self.cf_generator = C_GLANCE(
                self.model,
                final_clusters=self.partition_counterfactuals,
                verbose=False,
            )
            self.cf_generator.fit(X, y, self.train_dataset)
        elif self.global_method is not None:
            self.generation_method = "Global"
            if self.partition_counterfactuals is None:
                self.partition_counterfactuals = 3
            # NOTE(review): the object is used as-is; a string value for
            # global_method is not resolved here — confirm intended usage.
            self.cf_generator = self.global_method
        else:
            self.generation_method = "Local"
            if self.partition_counterfactuals is None:
                self.partition_counterfactuals = 1

    def _local_group_eff_cost(self, instances):
        """
        Compute effectiveness, cost and actions for a group via the local method.

        Counterfactuals are generated for the centroid of the group, turned
        into actions, and the best actions are selected with
        ``_select_action_max_eff`` before being evaluated on the whole group.

        Parameters
        ----------
        instances : pd.DataFrame
            The group of instances to analyze.

        Returns
        -------
        Tuple[float, float, List[Any]]
            The effectiveness, the cost and the list of actions. ``(0, 0, [])``
            is returned when no counterfactual could be generated.
        """
        centroid = centroid_pandas(
            instances,
            self.numerical_features_names,
            self.categorical_features_names,
        )
        cfs = self.cf_generator_backup.explain_instances(
            centroid,
            self.num_local_counterfactuals,
        )
        if cfs.shape[0] == 0:
            return 0, 0, []

        # Repeat the centroid once per counterfactual so both frames align.
        repeated_centroid = pd.concat([centroid] * cfs.shape[0]).set_index(cfs.index)
        candidate_actions = extract_actions_pandas(
            X=repeated_centroid,
            cfs=cfs,
            categorical_features=self.categorical_features_names,
            numerical_features=self.numerical_features_names,
            categorical_no_action_token="-",
        )
        actions_info = _select_action_max_eff(
            self.model,
            instances,
            candidate_actions,
            self.dist_func_dataframe,
            self.numerical_features_names,
            self.categorical_features_names,
            self.partition_counterfactuals,
        )
        # A single selected action may come back unwrapped.
        if not isinstance(actions_info, list):
            actions_info = [actions_info]
        actions = [action for _, _, action in actions_info]

        eff, cost = cumulative(
            self.model,
            instances,
            actions,
            self.dist_func_dataframe,
            self.numerical_features_names,
            self.categorical_features_names,
            "-",
        )
        return eff, cost, actions

    def _group_eff_cost(
        self,
        instances,
    ):
        """
        Compute effectiveness, cost and actions for a group of instances.

        Dispatches on ``self.generation_method`` (set by ``fit``) to the local
        or the global counterfactual generator.

        Parameters
        ----------
        instances : pd.DataFrame
            The group of instances to analyze.

        Returns
        -------
        Tuple[float, float, List[Any]]
            The effectiveness, the cost and the list of actions.

        Raises
        ------
        ValueError
            If ``self.generation_method`` is not a known method.
        """
        if self.generation_method == "Local":
            return self._local_group_eff_cost(instances)

        if self.generation_method == "Global-IM":
            clusters = min(100, len(instances))
            if clusters < self.partition_counterfactuals:
                # Group too small for the requested number of clusters:
                # fall back to the local method.
                return self._local_group_eff_cost(instances)
            self.cf_generator.initial_clusters = clusters
        elif self.generation_method != "Global":
            raise ValueError("Generation method does not exist")

        eff, cost = self.cf_generator.explain_group(instances)
        actions = self.cf_generator.global_actions()
        return eff, cost, actions

    def partition_group(self, instances: pd.DataFrame) -> "Node":
        """
        Partition the group of instances into a tree based on the split features.

        At every node each remaining split feature is evaluated and the one
        whose children reach the highest summed effectiveness is chosen. A
        feature is used at most once on any root-to-leaf path.

        ``self.split_features`` is not modified, so the method can be called
        repeatedly, and every child receives its own list of remaining
        features, so all siblings can be split further.

        Parameters
        ----------
        instances : pd.DataFrame
            The group of instances to partition.

        Returns
        -------
        Node
            The root node of the partitioned tree.
        """

        def _partition_group(
            group, split_features, eff_prec=None, cost_prec=None, actions_prec=None
        ):
            # Reuse the values computed while evaluating the parent's split.
            if eff_prec is None:
                eff_node, cost_node, actions = self._group_eff_cost(group)
            else:
                eff_node, cost_node, actions = eff_prec, cost_prec, actions_prec
            node = Node(
                effectiveness=eff_node, cost=cost_node, actions=actions, size=len(group)
            )

            possible_splits = []
            for feature in split_features:
                eff_children, cost_children = 0, 0
                children_info = []
                for feature_split_values in self.split_values[feature]:
                    split_df = group[group[feature].isin(feature_split_values)]
                    if split_df.empty:
                        continue
                    eff_child, cost_child, child_actions = self._group_eff_cost(
                        split_df
                    )
                    eff_children += eff_child
                    cost_children += cost_child
                    children_info.append(
                        (
                            feature_split_values,
                            split_df,
                            eff_child,
                            cost_child,
                            child_actions,
                        )
                    )
                possible_splits.append(
                    (feature, eff_children, cost_children, children_info)
                )

            if not possible_splits:
                return node

            # Highest summed child effectiveness wins; ties go to the feature
            # listed first.
            best_feature, _, _, best_children = max(
                possible_splits, key=lambda split: split[1]
            )
            node.split_feature = best_feature

            # Work on a copy: removing from the shared list would both alter
            # self.split_features and starve sibling subtrees of features.
            remaining_features = list(split_features)
            remaining_features.remove(best_feature)

            for values, child_df, eff_child, cost_child, child_actions in best_children:
                child_node = _partition_group(
                    child_df,
                    remaining_features,
                    eff_child,
                    cost_child,
                    child_actions,
                )
                node.add_child(values, child_node)

            return node

        self.node = _partition_group(instances, self.split_features)
        self.node_instances = instances
        return self.node

    def cumulative_leaf_actions(self):
        """
        Compute the total effectiveness and cost of the leaf-node actions.

        The actions of all leaves are evaluated together on the instances the
        tree was built from, and a short summary is printed.

        Returns
        -------
        Tuple[float, float, int]
            The total effectiveness, the total cost and the number of leaf
            actions.
        """
        eff, cost = cumulative(
            self.model,
            self.node_instances,
            self.node.return_leafs_actions(),
            self.dist_func_dataframe,
            self.numerical_features_names,
            self.categorical_features_names,
            categorical_no_action_token="-",
        )

        # Guard the ratio so an empty instance set does not divide by zero.
        instance_count = self.node_instances.shape[0]
        effectiveness_rate = eff / instance_count if instance_count else 0
        print(f"\nTOTAL EFFECTIVENESS: {effectiveness_rate:.2%}")
        print(f"\nTOTAL COST: {(cost / eff if eff > 0 else 0):.2f}")

        return eff, cost, len(self.node.return_leafs_actions())
def _split_list(lst, n):
    """
    Split a list into n approximately equal, contiguous parts.

    The first ``len(lst) % n`` parts receive one extra element. When ``n`` is
    larger than ``len(lst)`` the trailing parts are empty.

    Parameters
    ----------
    lst : list
        The list to split.
    n : int
        The number of parts to split the list into. Values of zero or less
        produce no parts.

    Returns
    -------
    Iterator
        An iterator yielding the parts of the list in order.
    """
    if n <= 0:
        # divmod would raise ZeroDivisionError for n == 0; negative n already
        # produced no parts, so treat both the same way.
        return iter(())
    k, m = divmod(len(lst), n)
    return (lst[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))


def _get_split_values(X, split_features, child_count):
    """
    Generate the groups of feature values that define the children of a split.

    For every feature its distinct values in ``X`` are sorted and divided into
    ``child_count`` contiguous groups.

    Parameters
    ----------
    X : pd.DataFrame
        The dataset containing the features.
    split_features : List[str]
        The features to create split values for.
    child_count : int
        The number of groups to create for each feature. ``-1`` creates one
        group per distinct value.

    Returns
    -------
    Dict[str, List[List[Any]]]
        A dictionary mapping each feature to its list of value groups. A
        feature without any observed value maps to an empty list when
        ``child_count`` is ``-1``.
    """
    split_values = {}
    for feature in split_features:
        unique_values = sorted(X[feature].unique())
        split_count = len(unique_values) if child_count == -1 else child_count
        split_values[feature] = list(_split_list(unique_values, split_count))

    return split_values