# Source code for humancompatible.explain.glance.utils.centroid

from typing import List
import pandas as pd
import numpy as np
import numpy.typing as npt
from statistics import multimode
from IPython.display import display



# [docs]
def centroid_pandas(
    X: pd.DataFrame,
    numerical_columns: List[str],
    categorical_columns: List[str],
) -> pd.DataFrame:
    """Computes a single-row DataFrame that summarises the rows of `X`.

    Each column listed in `numerical_columns` is summarised by its mean over
    all rows, and each column listed in `categorical_columns` by its mode.
    When several values tie for the mode, the first entry returned by
    `statistics.multimode` is kept.

    Args:
        X (pd.DataFrame): matrix of observations
        numerical_columns (List[str]): names of the columns to average
        categorical_columns (List[str]): names of the columns to take the
            mode of

    Returns:
        pd.DataFrame: frame with the columns of `X` whose row labelled 0
            holds the centroid
    """

    def first_mode(column):
        # multimode returns every most-frequent value; keep only the first.
        return multimode(column)[0]

    # Start from an empty frame mirroring the columns and dtypes of X.
    empty_frame = pd.DataFrame(columns=X.columns)
    centroid = empty_frame.astype(X.dtypes)

    numerical_means = X[numerical_columns].mean(axis=0)
    centroid.loc[0, numerical_columns] = numerical_means

    # Nothing more to fill in when no categorical columns were requested.
    if categorical_columns == []:
        return centroid

    categorical_modes = X[categorical_columns].apply(first_mode)
    centroid.loc[0, categorical_columns] = categorical_modes

    return centroid




# [docs]
def centroid_numpy(
    X: npt.NDArray[np.number],
    numerical_columns: List[int],
    categorical_columns: List[int],
) -> npt.NDArray[np.number]:
    """Calculates the centroid of the rows of a 2d numpy array. Specifically,
    for the `numerical_columns` columns, the centroid has value the mean of all
    rows, while for the `categorical_columns` columns, the centroid has value
    the mode of all rows.

    The result is allocated with `np.zeros`, so it is a float array and any
    column listed in neither argument is left at 0. When several values tie
    for the mode of a column, the smallest of them is returned.

    Args:
        X (npt.NDArray[np.number]): matrix of observations
        numerical_columns (List[int]): numerical column indices
        categorical_columns (List[int]): categorical column indices

    Returns:
        npt.NDArray[np.number]: 2d numpy array whose single row is the centroid

    Raises:
        ValueError: if `X` is not 2-dimensional
    """
    # Validate explicitly rather than with `assert`: assertions are removed
    # under `python -O`, after which a non-2d input would either fail with an
    # unrelated error or silently broadcast into a wrong result.
    n_dims = len(X.shape)
    if n_dims != 2:
        raise ValueError(f"X must be a 2d array, got {n_dims} dimension(s)")

    centroid = np.zeros((1, X.shape[1]))

    centroid[:, numerical_columns] = X[:, numerical_columns].mean(axis=0)

    def most_frequent(x):
        # np.unique returns the distinct values sorted, and np.argmax returns
        # the first maximum, so ties resolve to the smallest value.
        unique_values, counts = np.unique(x, return_counts=True)
        return unique_values[np.argmax(counts)]

    centroid[:, categorical_columns] = [
        most_frequent(X[:, i]) for i in categorical_columns
    ]

    return centroid