# Source code for humancompatible.explain.lice.data.DataHandler

from __future__ import annotations

from typing import Optional

import numpy as np
import pandas as pd

from .Features import (
    Binary,
    Categorical,
    Contiguous,
    Feature,
    Mixed,
    Monotonicity,
)
from .Types import CategValue, DataLike, FeatureID, OneDimData


class DataHandler:
    """
    Handles all data processing, transforming raw pandas DataFrames or NumPy
    arrays into a normalized and encoded format.

    This class is designed to be initialized with training data and then used
    to consistently encode all subsequent data. It supports mixed data types,
    where some values are categorical, and normalizes contiguous data to a
    [0, 1] range. The output can be either one-hot encoded or direct data with
    mapped categorical values to negative integers.
    """

    def __init__(
        self,
        X: DataLike,
        y: OneDimData | None = None,
        categ_map: Optional[dict[FeatureID, list[CategValue]]] = None,
        ordered: Optional[list[FeatureID]] = None,
        bounds_map: Optional[dict[FeatureID, tuple[int, int]]] = None,
        discrete: Optional[list[FeatureID]] = None,
        immutable: Optional[list[FeatureID]] = None,
        monotonicity: Optional[dict[FeatureID, Monotonicity]] = None,
        # TODO more general causality
        causal_inc: Optional[list[tuple[FeatureID, FeatureID]]] = None,
        greater_than: Optional[list[tuple[FeatureID, FeatureID]]] = None,
        regression: bool = False,
        feature_names: Optional[list[str]] = None,
        target_name: Optional[str] = None,
    ):
        """
        Initializes a DataHandler instance for data processing and encoding.

        Feature identifiers (``FeatureID``) are matched against feature names
        first. A positional index is accepted as a fallback as long as that
        index is not itself the name of some feature, which makes index-based
        configuration work for unnamed NumPy input.

        Parameters:
        -----------
        X : DataLike
            Input features. Can be a pandas DataFrame or a NumPy array.
            Expected shape: (num_samples, num_features).
        y : OneDimData | None, optional
            Target feature (e.g., labels for classification or regression
            targets). Expected shape: (num_samples,). Defaults to None.
        categ_map : dict[FeatureID, list[CategValue]], optional
            Keys are feature identifiers (indices or names) and values are
            lists of categorical values for that feature. If a list is empty,
            all unique values of the feature are considered categorical. If a
            list is non-empty but doesn't cover all values, the feature is
            treated as mixed. Defaults to None (no categorical features).
        ordered : list[FeatureID], optional
            Feature identifiers that should be treated as ordered categorical.
            Defaults to None (none).
        bounds_map : dict[FeatureID, tuple[int, int]], optional
            Keys are feature identifiers and values are tuples (min, max)
            defining the real bounds for contiguous features.
            Defaults to None (no explicit bounds).
        discrete : list[FeatureID], optional
            Feature identifiers that should be treated as discrete contiguous.
            Defaults to None (none).
        immutable : list[FeatureID], optional
            Feature identifiers that represent immutable features (cannot be
            changed). Defaults to None (none).
        monotonicity : dict[FeatureID, Monotonicity], optional
            Keys are feature identifiers and values specify the monotonicity
            constraint for that feature (can only decrease or only increase).
            Defaults to None (no constraints).
        causal_inc : list[tuple[FeatureID, FeatureID]], optional
            Tuples (cause, effect) indicating that an increase in 'cause' must
            lead to an increase in 'effect'. Defaults to None (none).
        greater_than : list[tuple[FeatureID, FeatureID]], optional
            Tuples (greater, smaller) indicating that 'greater' must be greater
            than 'smaller'. Defaults to None (none).
        regression : bool, optional
            If True, the task is treated as regression; otherwise, it's
            classification. Defaults to False.
        feature_names : Optional[list[str]], optional
            Names for the input features. If None and `X` is a DataFrame,
            column names from `X` will be used. Defaults to None.
        target_name : Optional[str], optional
            The name of the target feature. If None and `y` is a pandas Series,
            its name will be used. If `X` is a DataFrame and `target_name` is
            provided, the target column will be extracted from `X` (overriding
            any `y` that was passed). Defaults to None.

        Raises:
        -------
        ValueError
            If the length of `feature_names` does not match the number of
            features in `X`, or if `causal_inc` / `greater_than` reference an
            unknown feature.
        """
        # None stands in for "empty" so that no mutable object is shared
        # between calls through the default arguments.
        categ_map = {} if categ_map is None else categ_map
        ordered = [] if ordered is None else ordered
        bounds_map = {} if bounds_map is None else bounds_map
        discrete = [] if discrete is None else discrete
        immutable = [] if immutable is None else immutable
        monotonicity = {} if monotonicity is None else monotonicity
        causal_inc = [] if causal_inc is None else causal_inc
        greater_than = [] if greater_than is None else greater_than

        if isinstance(X, pd.DataFrame):
            if target_name is not None:
                print("Taking target values from the X matrix")
                y = X[target_name]
                X = X.drop(columns=target_name)
            if feature_names is None:
                feature_names = X.columns
            X = X.to_numpy()

        if y is not None:
            if target_name is None:
                target_name = y.name if isinstance(y, pd.Series) else "target"
            if regression:
                self.__target_feature = Contiguous(y, target_name)
            elif len(np.unique(y)) > 2:
                self.__target_feature = Categorical(y, name=target_name)
            else:
                self.__target_feature = Binary(y, name=target_name)
            # TODO make the target values specifiable
        else:
            self.__target_feature = None

        n_features = X.shape[1]
        if feature_names is None:
            feature_names = [None] * n_features
        if len(feature_names) != n_features:
            raise ValueError("Incorrect length of list of feature names.")
        feature_names = list(feature_names)

        self.__input_features: list[Feature] = []
        for feat_i, feat_name in enumerate(feature_names):
            ident = (feat_name, feat_i, feature_names)
            self.__input_features.append(
                self.__make_feature(
                    X[:, feat_i],
                    feat_name,
                    self.__lookup(categ_map, *ident),
                    self.__lookup(bounds_map, *ident),
                    self.__is_listed(ordered, *ident),
                    self.__is_listed(discrete, *ident),
                    monotone=self.__lookup(
                        monotonicity, *ident, default=Monotonicity.NONE
                    ),
                    modifiable=not self.__is_listed(immutable, *ident),
                )
            )

        self.__causal_inc = [
            (self.__feature_by_id(cause), self.__feature_by_id(effect))
            for cause, effect in causal_inc
        ]
        self.__greater_than = [
            (self.__feature_by_id(greater), self.__feature_by_id(smaller))
            for greater, smaller in greater_than
        ]

    @staticmethod
    def __lookup(mapping, feat_name, feat_i, names, default=None):
        """Return the entry of `mapping` for a feature, by name or by index.

        The name takes precedence. The positional index is only used when it
        cannot be confused with the name of a feature.
        """
        if feat_name in mapping:
            return mapping[feat_name]
        if feat_i in mapping and feat_i not in names:
            return mapping[feat_i]
        return default

    @staticmethod
    def __is_listed(ids, feat_name, feat_i, names) -> bool:
        """Tell whether a feature is referenced in `ids` by name or by index."""
        if feat_name in ids:
            return True
        return feat_i in ids and feat_i not in names

    def __feature_by_id(self, feature_id) -> Feature:
        """Resolve a feature identifier (name or positional index) to a Feature.

        Raises ValueError when the identifier matches no feature.
        """
        names = self.feature_names
        if feature_id in names:
            return self.__input_features[names.index(feature_id)]
        is_index = isinstance(feature_id, (int, np.integer)) and not isinstance(
            feature_id, bool
        )
        if is_index and 0 <= feature_id < len(names):
            return self.__input_features[feature_id]
        raise ValueError(f"Unknown feature identifier: {feature_id!r}")

    def __require_target(self) -> Feature:
        """Return the target feature or raise ValueError if none was given."""
        if self.__target_feature is None:
            raise ValueError(
                "No target feature is available; pass `y` (or `target_name`) "
                "when constructing the DataHandler."
            )
        return self.__target_feature

    @property
    def causal_inc(self) -> list[tuple[Feature, Feature]]:
        """Pairs (cause, effect) of features resolved from `causal_inc`."""
        return self.__causal_inc

    @property
    def greater_than(self) -> list[tuple[Feature, Feature]]:
        """Pairs (greater, smaller) of features resolved from `greater_than`."""
        return self.__greater_than

    def __make_feature(
        self,
        data: OneDimData,
        feat_name: Optional[str],
        categ_vals: Optional[list[CategValue]],
        real_bounds: Optional[tuple[int, int]],
        ordered: bool,
        discrete: bool,
        monotone: Monotonicity,
        modifiable: bool,
    ) -> Feature:
        """
        Internal helper method to create a Feature object based on provided
        metadata.

        Parameters:
        -----------
        data : OneDimData
            The 1-dimensional array-like data for the feature.
        feat_name : Optional[str]
            The name of the feature.
        categ_vals : Optional[list[CategValue]]
            Categorical values of the feature. If None, the feature is treated
            as contiguous. If empty, the values are taken from `data`.
        real_bounds : Optional[tuple[int, int]]
            A tuple (min, max) specifying the real bounds. Only passed on to
            Contiguous and Mixed features.
        ordered : bool
            True if the categorical feature is ordered. Only used for a
            Categorical feature with pre-specified values, whose order then
            defines the ordering.
        discrete : bool
            True if the contiguous feature is discrete. Only used for
            Contiguous features.
        monotone : Monotonicity
            The monotonicity constraint for the feature.
        modifiable : bool
            True if the feature is modifiable.

        Returns:
        --------
        Feature
            An instance of Binary, Categorical, Contiguous, or Mixed feature.
        """
        if categ_vals is None:
            return Contiguous(
                data,
                feat_name,
                bounds=real_bounds,
                discrete=discrete,
                monotone=monotone,
                modifiable=modifiable,
            )

        if len(categ_vals) == 0:
            # Fully categorical without pre-specified values.
            if len(np.unique(data)) > 2:
                return Categorical(
                    data, name=feat_name, monotone=monotone, modifiable=modifiable
                )
            return Binary(
                data, name=feat_name, monotone=monotone, modifiable=modifiable
            )

        # A predefined list of categorical values exists.
        if np.any(~np.isin(data, categ_vals)):
            # Some values are not categorical -> mixed feature.
            return Mixed(
                data,
                categ_vals,
                name=feat_name,
                bounds=real_bounds,
                monotone=monotone,
                modifiable=modifiable,
            )
        if len(categ_vals) > 2:
            return Categorical(
                data,
                categ_vals,
                name=feat_name,
                monotone=monotone,
                modifiable=modifiable,
                ordering=categ_vals if ordered else None,
            )
        return Binary(
            data,
            categ_vals,
            name=feat_name,
            monotone=monotone,
            modifiable=modifiable,
        )

    @property
    def n_features(self) -> int:
        """
        The number of input features.

        Returns:
        --------
        int
            The total count of features in the input space.
        """
        return len(self.__input_features)

    @property
    def features(self) -> list[Feature]:
        """
        A list of Feature objects representing the input features.

        Returns:
        --------
        list[Feature]
            A list containing instances of Feature (e.g., Contiguous,
            Categorical, etc.).
        """
        return self.__input_features

    @property
    def target_feature(self) -> Optional[Feature]:
        """
        The Feature object representing the target variable.

        Returns:
        --------
        Optional[Feature]
            An instance of Feature (Contiguous, Categorical, or Binary)
            representing the target feature, or None if no target was given.
        """
        return self.__target_feature

    @property
    def feature_names(self) -> list[str]:
        """
        A list of names for all input features.

        Returns:
        --------
        list[str]
            The `name` attribute of every input feature, in column order.
        """
        return [f.name for f in self.__input_features]

    def encode(
        self, X: DataLike, normalize: bool = True, one_hot: bool = True
    ) -> np.ndarray[np.float64]:
        """
        Encodes the input features according to the DataHandler's
        configuration.

        Parameters:
        -----------
        X : DataLike
            Input features, which can be a pandas DataFrame, pandas Series, or
            a NumPy array. Expected shape: (num_samples, num_features) for
            DataFrame/2D array, or (num_features,) for a single sample
            Series/1D array. Columns are taken by position.
        normalize : bool, optional
            Passed to every feature's `encode`; if True, contiguous features
            are normalized to the [0, 1] range. Defaults to True.
        one_hot : bool, optional
            If True, categorical features will be one-hot encoded. If False,
            categorical values will be mapped to negative integers.
            Defaults to True.

        Returns:
        --------
        np.ndarray[np.float64]
            The encoded input features. For 2D input the result has one row
            per sample; for 1D input a single encoded row is returned.

        Raises:
        -------
        ValueError
            If `X` is neither 1- nor 2-dimensional, or has fewer columns than
            there are features.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        X = np.asarray(X)

        if X.ndim == 1:
            # Single sample: encode as a one-row matrix and unwrap the row.
            single = X.reshape(1, -1)
            return self.encode(single, normalize=normalize, one_hot=one_hot)[0]
        if X.ndim != 2:
            raise ValueError(
                f"Expected 1- or 2-dimensional input, got {X.ndim} dimensions."
            )
        if X.shape[1] < self.n_features:
            raise ValueError(
                f"Expected {self.n_features} feature columns, got {X.shape[1]}."
            )

        n_rows = X.shape[0]
        encoded = [
            feature.encode(X[:, feat_i], normalize, one_hot).reshape(n_rows, -1)
            for feat_i, feature in enumerate(self.__input_features)
        ]
        return np.concatenate(encoded, axis=1).astype(np.float64)

    def encode_y(
        self, y: OneDimData, normalize: bool = True, one_hot: bool = True
    ) -> np.ndarray[np.float64]:
        """
        Encodes the target feature (`y`) according to the DataHandler's
        configuration.

        Parameters:
        -----------
        y : OneDimData
            The target feature data. Can be a pandas Series or a NumPy array.
            Expected shape: (num_samples,).
        normalize : bool, optional
            If True, the target feature will be normalized (if it's
            contiguous). Defaults to True.
        one_hot : bool, optional
            If True, categorical target feature will be one-hot encoded. If
            False, categorical values will be mapped to negative integers.
            Defaults to True.

        Returns:
        --------
        np.ndarray[np.float64]
            The encoded target feature, as produced by the target feature's
            `encode` method.

        Raises:
        -------
        ValueError
            If the DataHandler was created without a target.
        """
        return self.__require_target().encode(y, normalize, one_hot)

    def encode_all(self, X_all: np.ndarray, normalize: bool, one_hot: bool):
        """
        Encodes both input features and the target feature when they are
        concatenated into a single NumPy array. Assumes the last column of
        `X_all` is the target feature.

        Parameters:
        -----------
        X_all : np.ndarray
            Input features in all columns except the last one, and the target
            feature in the last column.
            Expected shape: (num_samples, num_features + 1).
        normalize : bool
            Whether to normalize contiguous features (both input and target).
        one_hot : bool
            Whether to perform one-hot encoding for categorical values (both
            input and target).

        Returns:
        --------
        np.ndarray[np.float64]
            The encoded features followed by the encoded target column(s).

        Raises:
        -------
        ValueError
            If the DataHandler was created without a target.
        """
        n_rows = X_all.shape[0]
        encoded_X = self.encode(X_all[:, :-1], normalize, one_hot)
        # Keep one row per sample: a one-hot encoded target is already 2D and
        # must not be flattened into a single column.
        encoded_y = np.asarray(
            self.encode_y(X_all[:, -1], normalize, one_hot)
        ).reshape(n_rows, -1)
        return np.concatenate([encoded_X, encoded_y], axis=1)

    def decode(
        self,
        X: np.ndarray[np.float64],
        denormalize: bool = True,
        encoded_one_hot: bool = True,
        as_dataframe: bool = True,
    ) -> np.ndarray | pd.DataFrame:
        """
        Decodes the encoded input features back to their original format.

        Parameters:
        -----------
        X : np.ndarray[np.float64]
            The encoded input data matrix.
            Expected shape: (num_samples, num_encoded_features).
        denormalize : bool, optional
            Passed to every feature's `decode`; if True, contiguous features
            are denormalized. Defaults to True.
        encoded_one_hot : bool, optional
            If True, it is assumed that the input `X` is one-hot encoded.
            Defaults to True.
        as_dataframe : bool, optional
            If True, the decoded features are returned as a pandas DataFrame.
            If False, a NumPy array is returned. Defaults to True.

        Returns:
        --------
        np.ndarray | pd.DataFrame
            The decoded features, one column per original feature.
        """
        if X.shape[0] == 0:
            if as_dataframe:
                return pd.DataFrame([], columns=self.feature_names)
            return np.empty((0, self.n_features))

        decoded = []
        start = 0
        for feature in self.__input_features:
            # Each feature owns a contiguous slice of encoded columns.
            width = feature.encoding_width(encoded_one_hot)
            decoded.append(
                feature.decode(X[:, start : start + width], denormalize, as_dataframe)
            )
            start += width

        if as_dataframe:
            return pd.concat(decoded, axis=1)
        return np.concatenate(
            [part.reshape(X.shape[0], -1) for part in decoded], axis=1
        )

    def decode_y(
        self,
        y: np.ndarray[np.float64],
        denormalize: bool = True,
        as_series: bool = True,
    ) -> np.ndarray | pd.Series:
        """
        Decodes the encoded target feature (`y`) back to its original format.

        Parameters:
        -----------
        y : np.ndarray[np.float64]
            The encoded target feature data. Expected shape: (num_samples,)
            for non-one-hot encoded targets, or
            (num_samples, num_categorical_values) for one-hot encoded
            categorical targets.
        denormalize : bool, optional
            If True, denormalization will be applied to the target feature (if
            it's contiguous). Defaults to True.
        as_series : bool, optional
            If True, the decoded target is returned as a pandas Series. If
            False, a NumPy array is returned. Defaults to True.

        Returns:
        --------
        np.ndarray | pd.Series
            The decoded target feature data, as produced by the target
            feature's `decode` method.

        Raises:
        -------
        ValueError
            If the DataHandler was created without a target.
        """
        return self.__require_target().decode(y, denormalize, as_series)

    def encoding_width(self, one_hot: bool) -> int:
        """
        Calculates the total width of the encoded input features.

        Parameters:
        -----------
        one_hot : bool
            If True, the width for one-hot encoding is considered. If False,
            the width for direct mapping is used.

        Returns:
        --------
        int
            The sum of the encoding widths of all input features.
        """
        return sum(f.encoding_width(one_hot) for f in self.__input_features)

    def allowed_changes(self, pre_vals, post_vals):
        """
        Checks if a proposed change from `pre_vals` to `post_vals` is allowed
        based on per-feature constraints and the defined causal / greater-than
        relationships.

        Parameters:
        -----------
        pre_vals : np.ndarray
            The original feature values for a single instance.
            Expected shape: (num_features,).
        post_vals : np.ndarray
            The proposed new feature values for the same instance.
            Expected shape: (num_features,).

        Returns:
        --------
        bool
            True if all changes are allowed according to the defined
            constraints, False otherwise.

        Raises:
        -------
        ValueError
            If a feature in a causal relationship is neither Categorical nor
            Contiguous.
        """
        features = self.features

        # Per-feature constraints, delegated to each feature.
        for feature, pre, post in zip(features, pre_vals, post_vals):
            if not feature.allowed_change(pre, post):
                return False

        # Causal constraints: if the cause increased, the effect must too.
        for cause, effect in self.__causal_inc:
            cause_i = features.index(cause)
            pre_cause = cause.encode(pre_vals[cause_i], normalize=False, one_hot=False)
            post_cause = cause.encode(
                post_vals[cause_i], normalize=False, one_hot=False
            )
            if isinstance(cause, Categorical):
                cause_increased = post_cause in cause.greater_than(pre_cause)
            elif isinstance(cause, Contiguous):
                cause_increased = post_cause > pre_cause
            else:
                raise ValueError("invalid feature type")
            if not cause_increased:
                continue

            effect_i = features.index(effect)
            pre_effect = effect.encode(
                pre_vals[effect_i], normalize=False, one_hot=False
            )
            post_effect = effect.encode(
                post_vals[effect_i], normalize=False, one_hot=False
            )
            if isinstance(effect, Categorical):
                if post_effect not in effect.greater_than(pre_effect):
                    return False
            elif isinstance(effect, Contiguous):
                if post_effect <= pre_effect:
                    return False
            else:
                raise ValueError("invalid feature type")

        # Ordering constraints are checked on the raw proposed values.
        for greater, smaller in self.__greater_than:
            if post_vals[features.index(smaller)] > post_vals[features.index(greater)]:
                return False

        return True