Source code for mlexplainer.utils.data_processing

"""Utility functions for data processing in ML Explainer."""

from pandas import concat, DataFrame, Series, merge



[docs]
def calculate_min_max_value(dataframe: DataFrame, feature: str):
    """
    Return the lower and upper bounds of a feature's values.

    When the column's dtype compares equal to ``"category"`` the bounds are
    positional: ``0`` up to the number of entries returned by
    ``value_counts()`` minus one. For every other column the bounds are the
    column's own ``min()`` and ``max()``.

    Args:
        dataframe (DataFrame): The DataFrame containing the feature.
        feature (str): Name of the column to inspect.

    Returns:
        tuple: ``(minimum, maximum)`` for the feature.
    """
    column = dataframe[feature]
    is_categorical = column.dtype == "category"
    if not is_categorical:
        return column.min(), column.max()

    # One slot per entry reported by value_counts(), numbered from zero.
    level_count = len(column.value_counts())
    return 0, level_count - 1




[docs]
def get_index(column_name: str, dataframe: DataFrame) -> int:
    """Extract the index of a column in a DataFrame.

    Args:
        column_name (str): Column name to look up.
        dataframe (DataFrame): DataFrame whose columns are searched.

    Returns:
        int: Zero-based position of the first column equal to ``column_name``.

    Raises:
        ValueError: If ``column_name`` is not among the DataFrame's columns.
    """
    try:
        return list(dataframe.columns).index(column_name)
    except ValueError as exc:
        # list.index only reports "x is not in list"; name the column and the
        # container so the failure is understandable at the call site.
        raise ValueError(
            f"Column {column_name!r} is not in dataframe."
        ) from exc




[docs]
def get_index_of_features(dataframe: DataFrame, feature: str) -> int:
    """Locate a feature among the DataFrame's columns.

    Args:
        dataframe (DataFrame): DataFrame containing the features.
        feature (str): The feature name to search for.

    Returns:
        int: Zero-based position of the first column equal to ``feature``.

    Raises:
        ValueError: If ``feature`` is not one of the DataFrame's columns.
    """
    column_names = dataframe.columns.tolist()
    try:
        position = column_names.index(feature)
    except ValueError as lookup_error:
        raise ValueError("Feature is not in dataframe.") from lookup_error
    return position




[docs]
def target_groupby_category(
    dataframe: DataFrame,
    feature: str,
    target_serie: Series,
) -> DataFrame:
    """Group by a categorical feature and calculate mean and volume of the target.

    The feature column and the target are aligned on their index, grouped by
    the feature's values, and summarised in a single aggregation pass. Missing
    feature values form their own group (``dropna=False``) and, for
    categorical features, categories without rows are kept
    (``observed=False``).

    Args:
        dataframe (DataFrame): Input DataFrame containing the feature.
        feature (str): The feature name to group by.
        target_serie (Series): The target series to calculate statistics for.
            Its name is not used, so it may be unnamed or share a name with
            another column.

    Returns:
        DataFrame: One row per group, sorted by group, with columns
        ``group``, ``mean_target`` (mean of the target) and
        ``volume_target`` (count of non-null target values).
    """
    # Use fixed internal column names instead of the caller-supplied ones:
    # an unnamed target, a target called "group", or a target sharing the
    # feature's name would otherwise break or silently corrupt the grouping.
    paired = concat(
        [
            dataframe[feature].rename("group"),
            target_serie.rename("target"),
        ],
        axis=1,
    )

    # A single named aggregation replaces two identical groupby passes that
    # were previously joined back together with a merge.
    results = (
        paired.groupby("group", dropna=False, observed=False)["target"]
        .agg(mean_target="mean", volume_target="count")
        .sort_index()
        .reset_index()
    )

    return results