import pandas as pd
from ..base import LocalCounterfactualMethod
import numpy as np
from sklearn.inspection import permutation_importance
[docs]
class RandomSampling(LocalCounterfactualMethod):
"""
RandomSampling is a local counterfactual method that generates counterfactual instances
through random sampling based on the distribution of features in the unaffected training data.
This method identifies the most important features and the most frequent categories within the
unaffected training data to generate counterfactuals by sampling from these distributions.
Methods:
--------
__init__(model, n_most_important, n_categorical_most_frequent, numerical_features, categorical_features, random_state=None):
Initializes the RandomSampling instance with the specified parameters.
fit(X, y):
Fits the RandomSampling method to the provided training data by calculating feature importances and identifying unaffected instances.
_sample_instances(n_samples, fixed_feature_values, random_state=None):
Samples instances based on the specified feature distributions, fixing certain feature values while sampling others.
explain(instance, num_counterfactuals, n_samples=1000, random_state=None):
Generates counterfactual explanations for a given instance by sampling and modifying feature values.
explain_instances(instances, num_counterfactuals, n_samples=1000, random_state=None):
Generates counterfactuals for multiple instances by calling the explain method for each instance.
"""
def __init__(self, model, n_most_important, n_categorical_most_frequent, numerical_features, categorical_features, random_state=None):
"""
Initializes a new instance of the RandomSampling class.
Parameters:
----------
model : object
A machine learning model used for predictions and feature importance evaluation.
n_most_important : int
The number of most important features to consider when generating counterfactuals.
n_categorical_most_frequent : int
The number of most frequent categories to consider for categorical features.
numerical_features : List[str]
A list of continuous (numerical) feature names.
categorical_features : List[str]
A list of categorical feature names.
random_state : int, optional
Seed for random number generation to ensure reproducibility, by default None.
"""
self.model = model
self.n_most_important = n_most_important
self.n_categorical_most_frequent = n_categorical_most_frequent
self.numerical_features = numerical_features
self.categorical_features = categorical_features
self.random_state = random_state
[docs]
def fit(self, X: pd.DataFrame, y: pd.Series):
"""
Fits the RandomSampling method to the provided training data by calculating feature importances and identifying unaffected instances.
Parameters:
----------
X : pd.DataFrame
The training dataset containing feature columns.
y : pd.Series
The target variable corresponding to the training dataset.
Returns:
-------
self : RandomSampling
Returns the fitted instance of RandomSampling.
"""
self.X_ = X
self.feature_names_ = X.columns.tolist()
# Permutation feature importance
result = permutation_importance(self.model, X, y, random_state=self.random_state)
self.feature_importances_ = result.importances_mean
top_k_indices = np.argsort(self.feature_importances_)[::-1][:self.n_most_important]
self.top_k_features_ = X.columns[top_k_indices]
train_preds = self.model.predict(X)
unaffected = X[train_preds == 1]
# Store min and max values for numerical features
self.numeric_min_ = unaffected[self.numerical_features].min()
self.numeric_max_ = unaffected[self.numerical_features].max()
for f in self.numerical_features:
if np.isnan(self.numeric_min_[f]):
self.numeric_min_[f] = X[f].min()
if np.isnan(self.numeric_max_[f]):
self.numeric_max_[f] = X[f].max()
# Get the top m most frequent categories for categorical features
self.categorical_top_m_ = {}
for col in self.categorical_features:
top_categories = unaffected[col].value_counts().index[:self.n_categorical_most_frequent]
if top_categories.empty:
top_categories = X[col].value_counts().index[:self.n_categorical_most_frequent]
self.categorical_top_m_[col] = top_categories
return self
def _sample_instances(self, n_samples: int, fixed_feature_values, random_state=None):
"""
Samples instances based on the specified feature distributions, fixing certain feature values while sampling others.
Parameters:
----------
n_samples : int
The number of instances to sample.
fixed_feature_values : dict
A dictionary of feature names and their fixed values during sampling.
random_state : int, optional
Seed for random number generation, by default None.
Returns:
-------
pd.DataFrame
A DataFrame containing the sampled instances with the same feature structure as the original data.
"""
if random_state is not None:
np.random.seed(random_state)
samples_columns = []
for col in self.X_.columns:
if col in fixed_feature_values:
column = [fixed_feature_values[col]] * n_samples
elif col in self.numerical_features:
column = np.random.uniform(self.numeric_min_[col], self.numeric_max_[col], n_samples)
else:
column = np.random.choice(self.categorical_top_m_[col], n_samples)
samples_columns.append(column)
return pd.DataFrame({col_name: column for col_name, column in zip(self.X_.columns, samples_columns)})
[docs]
def explain(self, instance, num_counterfactuals, n_samples=1000, random_state=None):
"""
Generates counterfactual explanations for a given instance by sampling and modifying feature values.
Parameters:
----------
instance : pd.DataFrame
A single row DataFrame representing the instance for which counterfactuals are generated.
num_counterfactuals : int
The number of counterfactuals to generate.
n_samples : int, optional
The number of samples to draw for generating counterfactuals, by default 1000.
random_state : int, optional
Seed for random number generation, by default None.
Returns:
-------
pd.DataFrame
A DataFrame containing the generated counterfactuals for the provided instance.
Raises:
-------
ValueError
If the input instance is not a single-row DataFrame or if its columns do not match the training dataset's columns.
"""
# Check if instance is a single row DataFrame
if not isinstance(instance, pd.DataFrame) or instance.shape[0] != 1:
raise ValueError("Input must be a single row DataFrame.")
# Check if the DataFrame columns match the features provided during initialization
if set(instance.columns) != set(self.X_.columns):
raise ValueError("Columns of the input instance do not match the columns used during fitting.")
fixed_feature_values = {}
for col in self.feature_names_:
if col not in self.top_k_features_:
fixed_feature_values[col] = instance[col].item()
random_instances = self._sample_instances(n_samples, fixed_feature_values, random_state)
# Generate copies of the query instance that will be changed one feature
# at a time to encourage sparsity.
cfs_df = None
candidate_cfs = instance.apply(lambda col: col.repeat(n_samples)).reset_index(drop=True)
# Loop to change one feature at a time, then two features, and so on.
for num_features_to_vary in range(1, len(self.top_k_features_)+1):
selected_features = np.random.choice(self.top_k_features_, (n_samples, 1), replace=True)
for k in range(n_samples):
candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
preds = self.model.predict(candidate_cfs)
if sum(preds) > 0:
rows_to_add = candidate_cfs[preds == 1]
if cfs_df is None:
cfs_df = rows_to_add.copy()
else:
cfs_df = pd.concat([cfs_df, rows_to_add])
cfs_df.drop_duplicates(inplace=True)
# Always change at least 2 features before stopping
if num_features_to_vary >= 2 and len(cfs_df) >= num_counterfactuals:
break
if cfs_df is None:
return None
assert isinstance(cfs_df, pd.DataFrame)
if len(cfs_df) > num_counterfactuals:
cfs_df = cfs_df.sample(num_counterfactuals)
cfs_df.reset_index(inplace=True, drop=True)
return cfs_df
[docs]
def explain_instances(
self, instances: pd.DataFrame, num_counterfactuals: int, n_samples=1000, random_state=None
) -> pd.DataFrame:
"""
Generates counterfactuals for multiple instances by calling the explain method for each instance.
Parameters:
----------
instances : pd.DataFrame
DataFrame containing instances for which counterfactual explanations are needed.
num_counterfactuals : int
The number of counterfactuals to generate for each instance.
n_samples : int, optional
The number of samples to draw for generating counterfactuals, by default 1000.
random_state : int, optional
Seed for random number generation, by default None.
Returns:
-------
pd.DataFrame
A DataFrame containing the generated counterfactuals for all provided instances.
"""
cfs = []
for i in range(instances.shape[0]):
cfs_instance = self.explain(instances.iloc[i:i+1], num_counterfactuals=num_counterfactuals, n_samples=n_samples, random_state=random_state)
if cfs_instance is not None:
cfs.append(cfs_instance)
ret = pd.concat(cfs, ignore_index=False) if cfs != [] else pd.DataFrame(columns=instances.columns).astype(instances.dtypes)
return ret