Source code for humancompatible.explain.glance.utils.centroid

from typing import List
import pandas as pd
import numpy as np
import numpy.typing as npt
from statistics import multimode
from IPython.display import display


def centroid_pandas(
    X: pd.DataFrame,
    numerical_columns: List[str],
    categorical_columns: List[str],
) -> pd.DataFrame:
    """Calculate the centroid of the rows of a pandas DataFrame.

    For the `numerical_columns` the centroid holds the mean over all rows;
    for the `categorical_columns` it holds the mode over all rows. When
    several values are tied for most frequent, the one that appears first
    in the column is used (the first element of `statistics.multimode`).

    Args:
        X (pd.DataFrame): matrix of observations
        numerical_columns (List[str]): numerical column names
        categorical_columns (List[str]): categorical column names; may be
            empty, in which case only the means are filled in

    Returns:
        pd.DataFrame: DataFrame with the same columns as `X` whose single
            row (label 0) is the centroid
    """
    # Start from an empty frame carrying X's columns and dtypes; row 0 is
    # created by the first `.loc` assignment below.
    centroid = pd.DataFrame(columns=X.columns).astype(X.dtypes)
    centroid.loc[0, numerical_columns] = X[numerical_columns].mean(axis="index")

    # Length check instead of `!= []` so that any empty sequence (list,
    # tuple, ...) skips the categorical step, not only a literal empty list.
    if len(categorical_columns) > 0:
        centroid.loc[0, categorical_columns] = X[categorical_columns].apply(
            lambda col: multimode(col)[0]
        )

    return centroid
def centroid_numpy(
    X: npt.NDArray[np.number],
    numerical_columns: List[int],
    categorical_columns: List[int],
) -> npt.NDArray[np.number]:
    """Calculate the centroid of the rows of a 2d numpy array.

    For the `numerical_columns` the centroid holds the mean over all rows;
    for the `categorical_columns` it holds the mode over all rows. When
    several values are tied for most frequent, the smallest one is used
    (`np.unique` returns sorted values and `argmax` picks the first
    maximum). Columns listed in neither argument are left at 0.

    Args:
        X (npt.NDArray[np.number]): matrix of observations
        numerical_columns (List[int]): numerical column indices
        categorical_columns (List[int]): categorical column indices

    Returns:
        npt.NDArray[np.number]: float array of shape (1, X.shape[1]) whose
            single row is the centroid
    """
    assert X.ndim == 2

    n_features = X.shape[1]
    result = np.zeros((1, n_features))

    # Numerical features: column-wise mean over all observations.
    result[0, numerical_columns] = np.mean(X[:, numerical_columns], axis=0)

    # Categorical features: most frequent value of each column.
    for col in categorical_columns:
        values, frequencies = np.unique(X[:, col], return_counts=True)
        result[0, col] = values[frequencies.argmax()]

    return result