Source code for humancompatible.explain.glance.utils.action

from typing import List, Any, Optional

import numpy as np
import numpy.typing as npt
import pandas as pd



[docs]
def apply_action_pandas(
    X: pd.DataFrame,
    action: pd.Series,
    numerical_columns: List[str],
    categorical_columns: List[str],
    categorical_no_action_token: Any,
    numerical_no_action_token: Optional[Any] = None,
) -> pd.DataFrame:
    """Apply a single `action` to every row of `X`.

    Numerical columns are shifted by the matching component of `action`;
    categorical columns are overwritten with the matching component of
    `action`. A component equal to the relevant no-action token leaves its
    column untouched. The result is cast back to the dtypes of `X`.

    Args:
        X (pd.DataFrame): matrix of observations
        action (pd.Series): one entry per column of `X` (same labels, same order)
        numerical_columns (List[str]): numerical column names
        categorical_columns (List[str]): categorical column names
        categorical_no_action_token (Any): value meaning "do not change" for categorical columns
        numerical_no_action_token (Optional[Any]): value meaning "do not change" for
            numerical columns; when None, `categorical_no_action_token` is used instead

    Returns:
        pd.DataFrame: a new frame with the action applied; `X` itself is not modified.
    """
    assert (X.columns == action.index).all()

    # Fall back to the categorical token when no numerical one was supplied.
    numeric_skip = (
        categorical_no_action_token
        if numerical_no_action_token is None
        else numerical_no_action_token
    )

    updated = X.copy(deep=True)

    for name in numerical_columns:
        delta = action[name]
        if delta != numeric_skip:
            updated[name] = X[name] + delta

    for name in categorical_columns:
        new_value = action[name]
        if new_value != categorical_no_action_token:
            updated[name] = new_value

    # Restore the original column dtypes after the updates above.
    return updated.astype(X.dtypes)




[docs]
def apply_action_numpy(
    X: npt.NDArray[np.number],
    action: npt.NDArray[np.number],
    numerical_columns: List[int],
    categorical_columns: List[int],
    categorical_no_action_token: np.number,
) -> npt.NDArray[np.number]:
    """Apply `action` to all rows of `X`. For numerical columns, add the
    respective component from `action`. For categorical columns, set the
    component of all rows to the value of `action`, unless it is equal to
    the `categorical_no_action_token`, in which case do nothing for this
    column.

    Note: input array should have a numeric dtype. Thus, categorical columns
    should be encoded by numbers (e.g. Ordinal Encoding).

    Args:
        X (npt.NDArray[np.number]): 2-D matrix of observations
        action (npt.NDArray[np.number]): 1-D array with one entry per column of `X`
        numerical_columns (List[int]): numerical column indices
        categorical_columns (List[int]): categorical column indices (may be empty)
        categorical_no_action_token (np.number): special value signifying no-action (i.e. equivalent to 0 for numerical columns)

    Returns:
        npt.NDArray[np.number]: new observations resulting from the action application.
        `X` itself is not modified.
    """
    assert len(X.shape) == 2
    assert len(action.shape) == 1
    assert (
        X.shape[1] == action.shape[0]
    ), "action should have length equal to the number of columns"

    ret = X.copy()
    ret[:, numerical_columns] += action[numerical_columns]

    # Normalise to an integer index array first: an empty list would otherwise
    # become an empty float64 array inside `intersect1d`, and a float array
    # cannot be used as an index.
    categorical_idx = np.asarray(categorical_columns, dtype=np.intp)
    # Categorical columns whose action component is a real action (not the token).
    categorical_columns_masked = np.intersect1d(
        np.where(action != categorical_no_action_token)[0], categorical_idx
    )
    ret[:, categorical_columns_masked] = action[categorical_columns_masked]

    return ret




[docs]
def extract_actions_pandas(
    X: pd.DataFrame,
    cfs: pd.DataFrame,
    categorical_features: List[str],
    numerical_features: List[str],
    categorical_no_action_token: Any,
):
    """
    Derive, row by row, the action that turns `X` into the counterfactuals `cfs`.

    Categorical features record the counterfactual value wherever it differs from the
    original, and `categorical_no_action_token` wherever the two are equal.
    Numerical features record the difference `cfs - X`.

    Parameters:
    ----------
    X : pd.DataFrame
        The original dataset; one row per instance, one column per feature.
    cfs : pd.DataFrame
        The counterfactual dataset, compared column by column against `X`.
    categorical_features : List[str]
        Names of the categorical columns present in both `X` and `cfs`.
    numerical_features : List[str]
        Names of the numerical columns present in both `X` and `cfs`.
    categorical_no_action_token : Any
        Value stored for a categorical feature whose value is the same in `X` and `cfs`.

    Returns:
    -------
    pd.DataFrame
        A frame that starts as a deep copy of `X` and has the listed columns replaced:
        - categorical features: the `cfs` value where it differs from `X`, otherwise the token.
        - numerical features: `cfs` minus `X`.
        Columns that appear in neither list keep the values of `X`.

    NOTE(review): the token is written into a copy of the original column, so it
    presumably has to be storable in that column's dtype -- confirm for categorical
    columns that are not plain strings.
    """
    result = X.copy(deep=True)

    for feature in categorical_features:
        unchanged = X[feature] == cfs[feature]
        changed = ~unchanged
        # Same value on both sides -> no action; otherwise take the counterfactual value.
        result.loc[unchanged, feature] = categorical_no_action_token
        result.loc[changed, feature] = cfs.loc[changed, feature]

    for feature in numerical_features:
        result[feature] = cfs[feature] - X[feature]

    return result



[docs]
def apply_actions_pandas_rows(
    X: pd.DataFrame,
    actions: pd.DataFrame,
    numerical_columns: List[str],
    categorical_columns: List[str],
    categorical_no_action_token: object,
) -> pd.DataFrame:
    """
    Apply a per-row action (one row of `actions` for each row of `X`) and return the result.

    Numerical columns are incremented by the matching entry of `actions`.
    Categorical columns take the entry of `actions` unless that entry equals
    `categorical_no_action_token`, in which case the value from `X` is kept.

    Parameters:
    ----------
    X : pd.DataFrame
        The original dataset; one row per instance, one column per feature.
    actions : pd.DataFrame
        The actions to apply, one row per row of `X`.
        - Numerical columns hold the amount to add to the corresponding feature of `X`.
        - Categorical columns hold the replacement value or the `categorical_no_action_token`.
    numerical_columns : List[str]
        Names of the numerical columns in `X` and `actions`.
    categorical_columns : List[str]
        Names of the categorical columns in `X` and `actions`.
    categorical_no_action_token : object
        Value meaning "leave this categorical feature unchanged".

    Returns:
    -------
    pd.DataFrame
        A modified deep copy of `X` (the input frame is left untouched):
        - numerical columns: `X` plus the corresponding action.
        - categorical columns: the action value where one is given, the value of `X` elsewhere.
    """
    transformed = X.copy(deep=True)

    for name in numerical_columns:
        transformed[name] = X[name] + actions[name]

    for name in categorical_columns:
        keep_original = actions[name] == categorical_no_action_token
        take_action = ~keep_original
        # `.values` drops the index, so the selected entries are written positionally.
        transformed.loc[take_action, name] = actions.loc[take_action, name].values
        # Rows carrying the token get the value of X written back.
        transformed.loc[keep_original, name] = X.loc[keep_original, name].values

    return transformed



[docs]
def actions_mean_pandas(
    actions: pd.DataFrame,
    numerical_features: List[str],
    categorical_features: List[str],
    categorical_no_action_token: Any,
) -> pd.Series:
    """
    Computes the mean action for numerical features and the most frequent action for categorical features from a given actions DataFrame.

    For numerical features, the function calculates the mean of the actions across all instances.
    For categorical features, it determines the most frequent value in the `actions` DataFrame that is
    different from the `categorical_no_action_token`. If the column holds no such value
    (e.g. every entry is the token), the token itself is returned.

    Parameters:
    ----------
    actions : pd.DataFrame
        A DataFrame where each row represents an instance, and each column represents an action for a feature (either numerical or categorical).
    numerical_features : List[str]
        List of columns in `actions` that are numerical features.
    categorical_features : List[str]
        List of columns in `actions` that are categorical features.
    categorical_no_action_token : Any
        A token or value that indicates no action is needed for categorical features.

    Returns:
    -------
    pd.Series
        An object-dtype Series indexed by the columns of `actions` where:
        - For numerical features, the values are the mean of the actions for each numerical column.
        - For categorical features, the values are the most frequent action other than the
          `categorical_no_action_token`, or the token when no other value is present.
    """
    ret = pd.Series(index=actions.columns, dtype="object")
    ret[numerical_features] = actions[numerical_features].mean()
    for col in categorical_features:
        # `value_counts` lists values by decreasing frequency; take the first
        # one that is an actual action. The token is only the answer when
        # nothing else is present in the column.
        value_cnts = actions[col].value_counts()
        ret[col] = next(
            (
                value
                for value in value_cnts.index
                if value != categorical_no_action_token
            ),
            categorical_no_action_token,
        )

    return ret