Source code for clustering_mi.mutual_information

# Compute the mutual information and associated entropies
from __future__ import annotations

import logging

import numpy as np
from numpy.typing import ArrayLike

from clustering_mi import _core
from clustering_mi._input_output import _get_contingency_table

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _stirling_mutual_information(contingency_table: ArrayLike) -> float:
    """
    Compute the Stirling approximated mutual information for the given contingency table.
    This is the mutual information of the corresponding probability distributions (times the number of objects).

    Parameters
    ----------
    contingency_table : ArrayLike
        Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.

    Returns
    -------
    float
        Mutual information (base 2)
    """

    return float(_core.stirling_mutual_information(np.asarray(contingency_table)))


def _traditional_mutual_information(contingency_table: ArrayLike) -> float:
    """
    Compute the unreduced microcanonical mutual information and entropies of the ground truth and candidate labelings.

    Parameters
    ----------
    contingency_table : ArrayLike
        Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.

    Returns
    -------
    float
        Mutual information (base 2).
    """

    return float(_core.traditional_mutual_information(np.asarray(contingency_table)))


def _adjusted_mutual_information(contingency_table: ArrayLike) -> float:
    """
    Compute the adjusted mutual information, which corrects the mutual information for chance (random permutations of the labels).

    Parameters
    ----------
    contingency_table : ArrayLike
        Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.

    Returns
    -------
    float
        Adjusted mutual information (base 2).
    """

    return float(_core.adjusted_mutual_information(np.asarray(contingency_table)))


def _reduced_flat_mutual_information(contingency_table: ArrayLike) -> float:
    """
    Compute the reduced mutual information, using the flat reduction method of https://arxiv.org/pdf/1907.12581.

    Parameters
    ----------
    contingency_table : ArrayLike
        Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.

    Returns
    -------
    float
        Reduced mutual information (base 2).
    """

    return float(_core.reduced_flat_mutual_information(np.asarray(contingency_table)))


def _H_ng_G_alpha(ng: ArrayLike, alpha: float) -> float:
    """
    Compute the entropy of a vector of group sizes given the concentration parameter alpha.

    Parameters
    ----------
    ng : list or np.ndarray
        Vector of group sizes.
    alpha : float
        Concentration parameter.

    Returns
    -------
    float
        Entropy of the group sizes. (base e).

    """
    return float(_core.H_ng_G_alpha(np.asarray(ng), float(alpha)))


def _H_ngc_G_nc_alpha(ngc: ArrayLike, alpha: float) -> float:
    """
    Compute the entropy of a contingency table given knowledge of the column sums and the concentration parameter alpha.

    Parameters
    ----------
    ngc : list or np.ndarray
        Vector of column sums.
    alpha : float
        Concentration parameter.

    Returns
    -------
    float
        Entropy of the contingency table. (base e).
    """

    return float(_core.H_ngc_G_nc_alpha(np.asarray(ngc), float(alpha)))


def _reduced_mutual_information(contingency_table: ArrayLike) -> float:
    """
    Compute the reduced mutual information, using the Dirichlet-multinomial reduction of https://arxiv.org/pdf/2405.05393.

    Parameters
    ----------
    contingency_table : ArrayLike
        Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.

    Returns
    -------
    float
        Reduced mutual information (base 2).
    """

    return float(_core.reduced_mutual_information(np.asarray(contingency_table)))



[docs]
def normalized_mutual_information(
    input_data_1: ArrayLike | str,
    input_data_2: ArrayLike | None = None,
    *,
    variation: str = "reduced",
    normalization: str = "second",
) -> ArrayLike:
    """
    Compute the normalized mutual information between two labelings from a pair of lists, the name of a space separated file of labels,
    or a contingency table. Can specify the variation of mutual information and type of normalization.
    For the asymmetric (default) normalization, the result is reported as a fraction of the entropy of the second labeling, which is considered the ground truth.

    Raises AssertionError for invalid inputs.

    Parameters
    ----------
    input_data_1 : ArrayLike or str
        First argument. This will either be a 2D array-like which specifies the contingency table whose columns are the first labeling and rows are the second labeling,
        or a string which is the path to a file containing a list of pairs of labels,
        or a 1-D array-like of labels.
    input_data_2 : ArrayLike, optional
        Second argument. This can only be a 1-D array-like of labels in the case where the first argument is also such a list.
    variation : str, optional
        Variation of mutual information to compute. Options are:
            - "stirling": Stirling's approximation of the traditional mutual information, equal to the mutual information of the corresponding probability distributions (times the number of objects).
            - "reduced" (default): Reduced mutual information (RMI), Dirichlet-multinomial reduction of https://arxiv.org/pdf/2405.05393
            - "reduced_flat": Reduced mutual information (RMI), flat reduction of https://arxiv.org/pdf/1907.12581
            - "adjusted": Adjusted mutual information (AMI), correcting for chance: https://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf
            - "traditional": Traditional mutual information (MI), microcanonical
            - "stirling": Stirling's approximation of the traditional mutual information, equal to the mutual information of the corresponding probability distributions (times the number of objects).
    normalization : str, optional
        Type of normalization to apply. Options are:
            - "second" (default): Asymmetric normalization, measures how much the first labeling tells us about the second, as a fraction of all there is to know about the second labeling.
            - "first": Asymmetric normalization, measures how much the second labeling tells us about the first, as a fraction of all there is to know about the first labeling.
            - "mean": Symmetric normalization by the arithmetic mean of the two entropies.
            - "min": Normalize by the minimum of the two entropies.
            - "max": Normalize by the maximum of the two entropies.
            - "geometric": Normalize by the geometric mean of the two entropies.
            - "none": No normalization, returns the mutual information in bits.

    Returns
    -------
    float
        Normalized mutual information
    """

    # Get the contingency table
    contingency_table = _get_contingency_table(input_data_1, input_data_2)

    # Make the candidate-candidate (labeling 1) and truth-truth (label 2) contingency tables
    nc = np.sum(contingency_table, axis=0)
    ng = np.sum(contingency_table, axis=1)
    ncc = np.diag(nc)  # Candidate-candidate
    ngg = np.diag(ng)  # Truth-truth

    # Compute the mutual information between each pair of labelings
    MI_c_g, MI_g_c, MI_c_c, MI_g_g = None, None, None, None  # Values to be computed
    if variation == "stirling":
        MI_c_g = _stirling_mutual_information(
            contingency_table
        )  # Mutual information between candidate and ground truth
        MI_g_c = MI_c_g  # Symmetric measure
        MI_c_c = _stirling_mutual_information(
            ncc
        )  # Mutual information between candidate and candidate
        MI_g_g = _stirling_mutual_information(
            ngg
        )  # Mutual information between ground truth and ground truth
    elif variation == "traditional":
        MI_c_g = _traditional_mutual_information(contingency_table)
        MI_g_c = MI_c_g  # Symmetric measure
        MI_c_c = _traditional_mutual_information(ncc)
        MI_g_g = _traditional_mutual_information(ngg)
    elif variation == "adjusted":
        MI_c_g = _adjusted_mutual_information(contingency_table)
        MI_g_c = MI_c_g  # Symmetric measure
        MI_c_c = _adjusted_mutual_information(ncc)
        MI_g_g = _adjusted_mutual_information(ngg)
    elif variation == "reduced":
        MI_c_g = _reduced_mutual_information(contingency_table)
        MI_g_c = _reduced_mutual_information(
            contingency_table.T
        )  # Transpose to get the ground truth as the first labeling
        MI_c_c = _reduced_mutual_information(ncc)
        MI_g_g = _reduced_mutual_information(ngg)
    elif variation == "reduced_flat":
        MI_c_g = _reduced_flat_mutual_information(contingency_table)
        MI_g_c = _reduced_flat_mutual_information(contingency_table.T)
        MI_c_c = _reduced_flat_mutual_information(ncc)
        MI_g_g = _reduced_flat_mutual_information(ngg)
    else:
        raise ValueError(f"Unknown variation type: {variation}")

    # Compute the normalized mutual information
    if normalization == "second":
        # Asymmetric normalization, measures how much the first labeling tells us about the second, as a fraction of all there is to know about the second labeling
        return MI_c_g / MI_g_g if MI_g_g > 0 else 0
    if normalization == "first":
        # Asymmetric normalization, measures how much the second labeling tells us about the first, as a fraction of all there is to know about the first labeling
        return MI_g_c / MI_c_c if MI_c_c > 0 else 0
    if (
        normalization == "mean"
    ):  # Note that the numerators of these symmetric measures are non-standard in order to account for asymmetries in the calculated MI_c_g vs MI_g_c
        return (MI_c_g + MI_g_c) / (MI_c_c + MI_g_g) if (MI_c_c + MI_g_g) > 0 else 0
    if normalization == "min":
        return (
            min(MI_c_g, MI_g_c) / min(MI_c_c, MI_g_g) if min(MI_c_c, MI_g_g) > 0 else 0
        )
    if normalization == "max":
        return (
            max(MI_c_g, MI_g_c) / max(MI_c_c, MI_g_g) if max(MI_c_c, MI_g_g) > 0 else 0
        )
    if normalization == "geometric":
        return (
            np.sqrt(MI_c_g * MI_g_c) / np.sqrt(MI_c_c * MI_g_g)
            if (MI_c_c * MI_g_g) > 0
            else 0
        )
    if normalization == "none":
        return MI_c_g  # Return the mutual information in bits without normalization (note that this may not be symmetric for the reduced measures)
    raise ValueError(f"Unknown normalization type: {normalization}")




[docs]
def mutual_information(
    input_data_1: ArrayLike | str,
    input_data_2: ArrayLike | None = None,
    *,
    variation: str = "reduced",
) -> ArrayLike:
    """
    Compute the mutual information between two labelings from a pair of lists, the name of a space separated file of labels,
    or a contingency table. Can specify the variation of mutual information to compute.

    Raises AssertionError for invalid inputs.

    Parameters
    ----------
    input_data_1 : ArrayLike or str
        First argument. This will either be a 2D array-like which specifies the contingency table whose columns are the first labeling and rows are the second labeling,
        or a string which is the path to a file containing a list of pairs of labels,
        or a 1-D array-like of labels.
    input_data_2 : ArrayLike, optional
        Second argument. This can only be a 1-D array-like of labels in the case where the first argument is also such a list.
    variation : str, optional
        Variation of mutual information to compute. Options are:
            - "reduced" (default): Reduced mutual information (RMI), reduction of https://arxiv.org/pdf/2405.05393, note that this can be (slightly) asymmetric.
            - "reduced_flat": Reduced mutual information (RMI), flat reduction of https://arxiv.org/pdf/1907.12581
            - "adjusted": Adjusted mutual information (AMI), correcting for chance: https://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf
            - "traditional": Traditional mutual information (MI), microcanonical
            - "stirling": Stirling's approximation of the traditional mutual information, equal to the mutual information of the corresponding probability distributions (times the number of objects).

    Returns
    -------
    float
        Mutual information value in bits (base 2).
    """

    return normalized_mutual_information(
        input_data_1,
        input_data_2,
        variation=variation,
        normalization="none",
    )