from __future__ import annotations
from typing import Optional
import numpy as np
import pandas as pd
from .Features import (
Binary,
Categorical,
Contiguous,
Feature,
Mixed,
Monotonicity,
)
from .Types import CategValue, DataLike, FeatureID, OneDimData
class DataHandler:
    """
    Handles all data processing, transforming raw pandas DataFrames or NumPy arrays
    into a normalized and encoded format.

    This class is designed to be initialized with training data and then used to
    consistently encode all subsequent data. Every column is wrapped in a Feature
    object (Binary, Categorical, Contiguous or Mixed) and the actual encoding and
    decoding is delegated to these objects.

    Feature identifiers (``FeatureID``) used by the constructor arguments are the
    feature names. When the features are unnamed (NumPy input without
    ``feature_names``), zero-based column indices are used as identifiers instead.
    """

    def __init__(
        self,
        X: DataLike,
        y: OneDimData | None = None,
        # NOTE: the mutable defaults below are only ever read, never mutated,
        # which is why the lint rule is deliberately silenced.
        # trunk-ignore(ruff/B006)
        categ_map: dict[FeatureID, list[CategValue]] = {},
        # trunk-ignore(ruff/B006)
        ordered: list[FeatureID] = [],
        # trunk-ignore(ruff/B006)
        bounds_map: dict[FeatureID, tuple[int, int]] = {},
        # trunk-ignore(ruff/B006)
        discrete: list[FeatureID] = [],
        # trunk-ignore(ruff/B006)
        immutable: list[FeatureID] = [],
        # trunk-ignore(ruff/B006)
        monotonicity: dict[FeatureID, Monotonicity] = {},
        # TODO more general causality
        # trunk-ignore(ruff/B006)
        causal_inc: list[tuple[FeatureID, FeatureID]] = [],
        # trunk-ignore(ruff/B006)
        greater_than: list[tuple[FeatureID, FeatureID]] = [],
        regression: bool = False,
        feature_names: Optional[list[str]] = None,
        target_name: Optional[str] = None,
    ):
        """
        Initializes a DataHandler instance for data processing and encoding.

        Parameters:
        -----------
        X : DataLike
            Input features. Can be a pandas DataFrame or a NumPy array.
            Expected shape: (num_samples, num_features).
        y : OneDimData | None, optional
            Target feature (e.g., labels for classification or regression targets).
            Expected shape: (num_samples,). Defaults to None. It is replaced by the
            column `target_name` of `X` when `X` is a DataFrame and `target_name`
            is given.
        categ_map : dict[FeatureID, list[CategValue]], optional
            Maps feature identifiers to lists of categorical values of that feature.
            If a list is empty, all unique values of the feature are considered
            categorical. If a list is non-empty but doesn't cover all values,
            the feature is treated as mixed. Features without an entry are
            contiguous. Defaults to an empty dictionary.
        ordered : list[FeatureID], optional
            Feature identifiers that should be treated as ordered categorical.
            Defaults to an empty list.
        bounds_map : dict[FeatureID, tuple[int, int]], optional
            Maps feature identifiers to (min, max) bounds of contiguous features.
            Defaults to an empty dictionary.
        discrete : list[FeatureID], optional
            Feature identifiers that should be treated as discrete contiguous.
            Defaults to an empty list.
        immutable : list[FeatureID], optional
            Feature identifiers of immutable features (cannot be changed).
            Defaults to an empty list.
        monotonicity : dict[FeatureID, Monotonicity], optional
            Maps feature identifiers to their monotonicity constraint. Features
            without an entry get `Monotonicity.NONE`. Defaults to an empty dictionary.
        causal_inc : list[tuple[FeatureID, FeatureID]], optional
            A list of (cause, effect) tuples indicating that an increase in
            'cause' must lead to an increase in 'effect'. Defaults to an empty list.
        greater_than : list[tuple[FeatureID, FeatureID]], optional
            A list of (greater, smaller) tuples relating the values of two
            features. Defaults to an empty list.
        regression : bool, optional
            If True, the target is wrapped as a contiguous feature; otherwise as a
            categorical (more than 2 unique values) or binary one. Defaults to False.
        feature_names : Optional[list[str]], optional
            A list of names for the input features. If None and `X` is a DataFrame,
            column names from `X` will be used. Defaults to None.
        target_name : Optional[str], optional
            The name of the target feature. If None and `y` is a pandas Series,
            its name will be used, otherwise "target". If `X` is a DataFrame and
            `target_name` is provided, the target column is extracted from `X`.
            Defaults to None.

        Raises:
        -------
        ValueError
            If the length of `feature_names` does not match the number of features
            in `X`, or if `causal_inc` / `greater_than` reference an unknown feature.
        """
        if isinstance(X, pd.DataFrame):
            if target_name is not None:
                print("Taking target values from the X matrix")
                y = X[target_name]
                X = X.drop(columns=target_name)
            if feature_names is None:
                feature_names = X.columns
            X = X.to_numpy()

        if y is None:
            self.__target_feature = None
        else:
            if target_name is None:
                target_name = y.name if isinstance(y, pd.Series) else "target"
            if regression:
                self.__target_feature = Contiguous(y, target_name)
            elif len(np.unique(y)) > 2:
                self.__target_feature = Categorical(y, name=target_name)
            else:
                self.__target_feature = Binary(y, name=target_name)
            # TODO make the target values specifiable

        n_features = X.shape[1]
        # Unnamed features can only be addressed by their column position, so
        # the position doubles as the identifier in all the metadata lookups.
        positional_ids = feature_names is None
        if positional_ids:
            feature_names = [None] * n_features
        if len(feature_names) != n_features:
            raise ValueError("Incorrect length of list of feature names.")

        self.__input_features: list[Feature] = []
        for feat_i, feat_name in enumerate(feature_names):
            feat_id = feat_i if positional_ids else feat_name
            self.__input_features.append(
                self.__make_feature(
                    X[:, feat_i],
                    feat_name,
                    categ_map.get(feat_id, None),
                    bounds_map.get(feat_id, None),
                    feat_id in ordered,
                    feat_id in discrete,
                    monotone=monotonicity.get(feat_id, Monotonicity.NONE),
                    modifiable=feat_id not in immutable,
                )
            )

        features = self.__input_features
        # Built once; the property would otherwise rebuild the list per lookup.
        names = self.feature_names

        def _resolve(feature_id):
            """Return the Feature object addressed by `feature_id`."""
            try:
                return features[names.index(feature_id)]
            except ValueError:
                is_position = (
                    positional_ids
                    and isinstance(feature_id, (int, np.integer))
                    and not isinstance(feature_id, bool)
                    and 0 <= feature_id < n_features
                )
                if is_position:
                    return features[feature_id]
                raise ValueError(
                    f"Unknown feature identifier: {feature_id!r}"
                ) from None

        self.__causal_inc = [
            (_resolve(cause), _resolve(effect)) for cause, effect in causal_inc
        ]
        self.__greater_than = [
            (_resolve(greater), _resolve(smaller)) for greater, smaller in greater_than
        ]

    @property
    def causal_inc(self) -> list[tuple[Feature, Feature]]:
        """
        The causal-increase relations as (cause, effect) pairs of Feature objects.
        """
        return self.__causal_inc

    @property
    def greater_than(self) -> list[tuple[Feature, Feature]]:
        """
        The greater-than relations as (greater, smaller) pairs of Feature objects.
        """
        return self.__greater_than

    def __make_feature(
        self,
        data: OneDimData,
        feat_name: Optional[str],
        categ_vals: Optional[list[CategValue]],
        real_bounds: Optional[tuple[int, int]],
        ordered: bool,
        discrete: bool,
        monotone: Monotonicity,
        modifiable: bool,
    ) -> Feature:
        """
        Internal helper method to create a Feature object based on provided metadata.

        Parameters:
        -----------
        data : OneDimData
            The 1-dimensional array-like data for the feature.
        feat_name : Optional[str]
            The name of the feature.
        categ_vals : Optional[list[CategValue]]
            Categorical values of the feature. If None, the feature is contiguous.
            If empty, the feature is fully categorical with values taken from `data`.
        real_bounds : Optional[tuple[int, int]]
            A tuple (min, max) of bounds; passed to contiguous and mixed features.
        ordered : bool
            True if the categorical feature is ordered. Only used when `categ_vals`
            lists more than 2 values that cover all of `data`; the order of
            `categ_vals` is then used as the ordering.
        discrete : bool
            True if the contiguous feature is discrete. Only used for contiguous
            features.
        monotone : Monotonicity
            The monotonicity constraint for the feature.
        modifiable : bool
            True if the feature is modifiable.

        Returns:
        --------
        Feature
            An instance of Binary, Categorical, Contiguous, or Mixed feature.
        """
        if categ_vals is None:
            return Contiguous(
                data,
                feat_name,
                bounds=real_bounds,
                discrete=discrete,
                monotone=monotone,
                modifiable=modifiable,
            )

        if len(categ_vals) == 0:
            # fully categorical without pre-specified values
            if len(np.unique(data)) > 2:
                return Categorical(
                    data, name=feat_name, monotone=monotone, modifiable=modifiable
                )
            return Binary(
                data, name=feat_name, monotone=monotone, modifiable=modifiable
            )

        # a predefined mapping exists
        if np.any(~np.isin(data, categ_vals)):
            # some values are not categorical -> mixed feature
            return Mixed(
                data,
                categ_vals,
                name=feat_name,
                bounds=real_bounds,
                monotone=monotone,
                modifiable=modifiable,
            )
        if len(categ_vals) > 2:
            return Categorical(
                data,
                categ_vals,
                name=feat_name,
                monotone=monotone,
                modifiable=modifiable,
                ordering=categ_vals if ordered else None,
            )
        return Binary(
            data,
            categ_vals,
            name=feat_name,
            monotone=monotone,
            modifiable=modifiable,
        )

    @property
    def n_features(self) -> int:
        """
        The number of input features.

        Returns:
        --------
        int
            The total count of features in the input space.
        """
        return len(self.__input_features)

    @property
    def features(self) -> list[Feature]:
        """
        A list of Feature objects representing the input features.

        Returns:
        --------
        list[Feature]
            The internal list of Feature objects, in column order.
        """
        return self.__input_features

    @property
    def target_feature(self) -> Feature | None:
        """
        The Feature object representing the target variable.

        Returns:
        --------
        Feature | None
            The target feature, or None when no target was provided at
            initialization.
        """
        return self.__target_feature

    @property
    def feature_names(self) -> list[str]:
        """
        A list of names for all input features.

        Returns:
        --------
        list[str]
            The `name` attribute of every input feature, in column order.
        """
        return [f.name for f in self.__input_features]

    def encode(
        self, X: DataLike, normalize: bool = True, one_hot: bool = True
    ) -> np.ndarray[np.float64]:
        """
        Encodes the input features according to the DataHandler's configuration.

        Each column is encoded by its Feature object and the results are
        concatenated column-wise.

        Parameters:
        -----------
        X : DataLike
            Input features, which can be a pandas DataFrame, pandas Series,
            or a NumPy array.
            Expected shape: (num_samples, num_features) for DataFrame/2D array,
            or (num_features,) for a single sample Series/1D array.
        normalize : bool, optional
            Passed to every feature's `encode`; requests normalization of
            contiguous features. Defaults to True.
        one_hot : bool, optional
            Passed to every feature's `encode`; requests one-hot encoding of
            categorical features. Defaults to True.

        Returns:
        --------
        np.ndarray[np.float64]
            The encoded input features with one row per sample. For a
            1-dimensional input a single 1-dimensional row is returned.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        if len(X.shape) == 1:
            # a single sample: encode it as a one-row matrix and unwrap the row
            return self.encode(X.reshape(1, -1), normalize=normalize, one_hot=one_hot)[0]

        n_samples = X.shape[0]
        encoded_columns = [
            feature.encode(X[:, feat_i], normalize, one_hot).reshape(n_samples, -1)
            for feat_i, feature in enumerate(self.__input_features)
        ]
        return np.concatenate(encoded_columns, axis=1).astype(np.float64)

    def encode_y(
        self, y: OneDimData, normalize: bool = True, one_hot: bool = True
    ) -> np.ndarray[np.float64]:
        """
        Encodes the target feature (`y`) according to the DataHandler's configuration.

        Parameters:
        -----------
        y : OneDimData
            The target feature data. Can be a pandas Series or a NumPy array.
            Expected shape: (num_samples,).
        normalize : bool, optional
            Passed to the target feature's `encode`. Defaults to True.
        one_hot : bool, optional
            Passed to the target feature's `encode`. Defaults to True.

        Returns:
        --------
        np.ndarray[np.float64]
            Whatever the target feature's `encode` returns for `y`.
        """
        return self.__target_feature.encode(y, normalize, one_hot)

    def encode_all(self, X_all: np.ndarray, normalize: bool, one_hot: bool):
        """
        Encodes both input features and the target feature when they are
        concatenated into a single NumPy array.

        Assumes the last column of `X_all` is the target feature.

        Parameters:
        -----------
        X_all : np.ndarray
            A NumPy array where input features are in all columns except the last one,
            and the target feature is in the last column.
            Expected shape: (num_samples, num_features + 1).
        normalize : bool
            Whether to normalize contiguous features (both input and target).
        one_hot : bool
            Whether to perform one-hot encoding for categorical values (both input and target).

        Returns:
        --------
        np.ndarray
            The encoded features followed by the encoded target column(s).
        """
        encoded_X = self.encode(X_all[:, :-1], normalize, one_hot)
        encoded_y = np.asarray(self.encode_y(X_all[:, -1], normalize, one_hot))
        if encoded_y.ndim < 2:
            # Only a flat target needs the column axis; an already 2-dimensional
            # (e.g. one-hot) target must keep its row count to be concatenable.
            encoded_y = encoded_y.reshape(-1, 1)
        return np.concatenate([encoded_X, encoded_y], axis=1)

    def decode(
        self,
        X: np.ndarray[np.float64],
        denormalize: bool = True,
        encoded_one_hot: bool = True,
        as_dataframe: bool = True,
    ) -> np.ndarray[np.float64] | pd.DataFrame:
        """
        Decodes the encoded input features back to their original format.

        The encoded matrix is split into consecutive column blocks, one per
        feature (as wide as the feature's `encoding_width`), and each block is
        decoded by its Feature object.

        Parameters:
        -----------
        X : np.ndarray[np.float64]
            The encoded input data matrix.
            Expected shape: (num_samples, num_encoded_features).
        denormalize : bool, optional
            Passed to every feature's `decode`. Defaults to True.
        encoded_one_hot : bool, optional
            If True, it is assumed that the input `X` is one-hot encoded.
            Defaults to True.
        as_dataframe : bool, optional
            If True, the decoded features will be returned as a pandas DataFrame.
            If False, a NumPy array will be returned. Defaults to True.

        Returns:
        --------
        np.ndarray[np.float64] | pd.DataFrame
            The decoded features, one column per original feature. An input
            without rows yields an empty DataFrame with the feature names as
            columns, or an empty array of shape (0, num_features).
        """
        if X.shape[0] == 0:
            if as_dataframe:
                return pd.DataFrame([], columns=self.feature_names)
            return np.empty((0, self.n_features))

        decoded = []
        start = 0
        for feature in self.__input_features:
            width = feature.encoding_width(encoded_one_hot)
            decoded.append(
                feature.decode(X[:, start : start + width], denormalize, as_dataframe)
            )
            start += width

        if as_dataframe:
            return pd.concat(decoded, axis=1)
        n_samples = X.shape[0]
        return np.concatenate([col.reshape(n_samples, -1) for col in decoded], axis=1)

    def decode_y(
        self,
        y: np.ndarray[np.float64],
        denormalize: bool = True,
        as_series: bool = True,
    ) -> np.ndarray[np.float64] | pd.Series:
        """
        Decodes the encoded target feature (`y`) back to its original format.

        Parameters:
        -----------
        y : np.ndarray[np.float64]
            The encoded target feature data.
        denormalize : bool, optional
            Passed to the target feature's `decode`. Defaults to True.
        as_series : bool, optional
            Passed to the target feature's `decode`; requests a pandas Series
            instead of a NumPy array. Defaults to True.

        Returns:
        --------
        np.ndarray[np.float64] | pd.Series
            Whatever the target feature's `decode` returns for `y`.
        """
        return self.__target_feature.decode(y, denormalize, as_series)

    def encoding_width(self, one_hot: bool) -> int:
        """
        Calculates the total width of the encoded input features.

        Parameters:
        -----------
        one_hot : bool
            Whether the width of the one-hot encoding should be considered.

        Returns:
        --------
        int
            The sum of the encoding widths of all input features.
        """
        return sum(f.encoding_width(one_hot) for f in self.__input_features)

    def allowed_changes(self, pre_vals, post_vals):
        """
        Checks if a proposed change from `pre_vals` to `post_vals` is allowed
        based on the per-feature constraints and the defined causal/greater-than
        relationships.

        Parameters:
        -----------
        pre_vals : np.ndarray
            The original feature values for a single instance.
            Expected shape: (num_features,).
        post_vals : np.ndarray
            The proposed new feature values for the same instance.
            Expected shape: (num_features,).

        Returns:
        --------
        bool
            False if any feature rejects its change (`allowed_change`), if a cause
            increased without its effect increasing, or if the new value of a
            'smaller' feature exceeds the new value of its 'greater' feature
            (equal values pass). True otherwise.

        Raises:
        -------
        ValueError
            If a feature in a causal relationship is neither Categorical nor
            Contiguous.
        """
        features = self.features

        for f, pre, pos in zip(features, pre_vals, post_vals):
            if not f.allowed_change(pre, pos):
                return False

        for cause, effect in self.__causal_inc:
            cause_i = features.index(cause)
            pre_cause = cause.encode(pre_vals[cause_i], normalize=False, one_hot=False)
            pos_cause = cause.encode(post_vals[cause_i], normalize=False, one_hot=False)
            if isinstance(cause, Categorical):
                applied = pos_cause in cause.greater_than(pre_cause)
            elif isinstance(cause, Contiguous):
                applied = pos_cause > pre_cause
            else:
                raise ValueError("invalid feature type")
            if not applied:
                # the cause did not increase, so nothing is required of the effect
                continue

            effect_i = features.index(effect)
            pre_effect = effect.encode(
                pre_vals[effect_i], normalize=False, one_hot=False
            )
            pos_effect = effect.encode(
                post_vals[effect_i], normalize=False, one_hot=False
            )
            if isinstance(effect, Categorical):
                if pos_effect not in effect.greater_than(pre_effect):
                    return False
            elif isinstance(effect, Contiguous):
                if pos_effect <= pre_effect:
                    return False
            else:
                raise ValueError("invalid feature type")

        # the raw (not encoded) new values are compared here
        for greater, smaller in self.__greater_than:
            if post_vals[features.index(smaller)] > post_vals[features.index(greater)]:
                return False
        return True