Source code for clustering_mi.mutual_information
# Compute the mutual information and associated entropies
from __future__ import annotations
import logging
import numpy as np
from numpy.typing import ArrayLike
from clustering_mi._input_output import _get_contingency_table
# from scipy.optimize import minimize_scalar # Used to optimize the alpha parameter in the Dirichlet-multinomial reduction
from clustering_mi._util import (
_log_binom,
_log_factorial,
_log_Omega_EC,
_minimize_golden_section_log,
)
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def _stirling_mutual_information(contingency_table: ArrayLike) -> float:
"""
Compute the Stirling approximated mutual information for the given contingency table.
This is the mutual information of the corresponding probability distributions (times the number of objects).
Parameters
----------
contingency_table : ArrayLike
Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.
Returns
-------
float
Mutual information (base 2)
"""
# Compute summary information
n: float = np.sum(contingency_table)
nc = np.sum(contingency_table, axis=0)
ng = np.sum(contingency_table, axis=1)
MI = 0
for r, ng_r in enumerate(ng):
for s, nc_s in enumerate(nc):
if contingency_table[r, s] > 0:
MI += contingency_table[r, s] * np.log(
n * contingency_table[r, s] / (ng_r * nc_s)
)
# Convert to bits (log base 2)
MI /= np.log(2)
return float(MI)
def _traditional_mutual_information(contingency_table: ArrayLike) -> float:
"""
Compute the unreduced microcanonical mutual information and entropies of the ground truth and candidate labelings.
Parameters
----------
contingency_table : ArrayLike
Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.
Returns
-------
float
Mutual information (base 2).
"""
# Compute summary information
n: float = np.sum(contingency_table)
nc = np.sum(contingency_table, axis=0)
ng = np.sum(contingency_table, axis=1)
MI = (
_log_factorial(n)
- np.sum(_log_factorial(ng))
- np.sum(_log_factorial(nc))
+ np.sum(_log_factorial(contingency_table.flatten()))
)
# Convert to bits (log base 2)
MI /= np.log(2)
return float(MI)
def _adjusted_mutual_information(contingency_table: ArrayLike) -> float:
"""
Compute the adjusted mutual information, which corrects the mutual information for chance (random permutations of the labels).
Parameters
----------
contingency_table : ArrayLike
Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.
Returns
-------
float
Adjusted mutual information (base 2).
"""
n: float = np.sum(contingency_table)
nc = np.sum(contingency_table, axis=0)
ng = np.sum(contingency_table, axis=1)
qg = len(ng)
qc = len(nc)
EMI = 0
for r in range(qg):
for s in range(qc):
for ngc in range(max(1, ng[r] + nc[s] - n), min(ng[r], nc[s]) + 1):
EMI += (
ngc
* (np.log(n) + np.log(ngc) - np.log(ng[r]) - np.log(nc[s]))
* np.exp(
_log_binom(nc[s], ngc)
+ _log_binom(n - nc[s], ng[r] - ngc)
- _log_binom(n, ng[r])
)
)
# Convert to bits (log base 2)
EMI /= np.log(2)
return float(_stirling_mutual_information(contingency_table) - EMI)
def _reduced_flat_mutual_information(contingency_table: ArrayLike) -> float:
"""
Compute the reduced mutual information, using the flat reduction method of https://arxiv.org/pdf/1907.12581.
Parameters
----------
contingency_table : ArrayLike
Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.
Returns
-------
float
Reduced mutual information (base 2).
"""
nc = np.sum(contingency_table, axis=0)
ng = np.sum(contingency_table, axis=1)
logOmega = _log_Omega_EC(nc, ng)
return float(_traditional_mutual_information(contingency_table) - logOmega)
def _H_ng_G_alpha(ng: ArrayLike, alpha: float) -> float:
"""
Compute the entropy of a vector of group sizes given the concentration parameter alpha.
Parameters
----------
ng : list or np.ndarray
Vector of group sizes.
alpha : float
Concentration parameter.
Returns
-------
float
Entropy of the group sizes. (base e).
"""
n: float = np.sum(ng)
q = len(ng)
H_ng = _log_binom(
n + q * alpha - 1, q * alpha - 1
) # Dirichlet-multinomial distribution
for r in range(q):
H_ng -= _log_binom(ng[r] + alpha - 1, alpha - 1)
return H_ng
def _H_ngc_G_nc_alpha(ngc: ArrayLike, alpha: float) -> float:
"""
Compute the entropy of a contingency table given knowledge of the column sums and the concentration parameter alpha.
Parameters
----------
ngc : list or np.ndarray
Vector of column sums.
alpha : float
Concentration parameter.
Returns
-------
float
Entropy of the contingency table. (base e).
"""
qg = int(ngc.shape[0])
qc = int(ngc.shape[1])
nc = np.sum(ngc, axis=0) # Column sums
H_ngc = 0.0
for s in range(qc):
H_ngc += _log_binom(
nc[s] + float(qg) * alpha - 1, float(qg) * alpha - 1
) # Independent Dirichlet-multinomial distributions of the columns
for r in range(qg):
H_ngc -= _log_binom(ngc[r, s] + alpha - 1, alpha - 1)
return H_ngc
def _reduced_mutual_information(contingency_table: ArrayLike) -> float:
"""
Compute the reduced mutual information, using the Dirichlet-multinomial reduction of https://arxiv.org/pdf/2405.05393.
Parameters
----------
contingency_table : ArrayLike
Contingency table T as a 2D NumPy array where T[r][s] counts the number of objects with label r in the ground truth g and label s in the candidate c.
Returns
-------
float
Reduced mutual information (base 2).
"""
n: float = np.sum(contingency_table)
nc = np.sum(contingency_table, axis=0)
ng = np.sum(contingency_table, axis=1)
# Range of values of the concentration parameter alpha to consider in the Dirichlet-multinomial transmissions
min_alpha = 0.0001
max_alpha = 10000
# H_g
H_qg: float = np.log(n)
_, H_ng_G_alpha = _minimize_golden_section_log(
lambda alpha: _H_ng_G_alpha(ng, alpha), min_alpha, max_alpha
) # Note that we neglect the cost to transmit the alpha parameter here, although a fixed cost would cancel in the mutual information calculation.
H_ng_G_alpha = float(H_ng_G_alpha)
H_g_G_ng: float = float(_log_factorial(n)) - float(np.sum(_log_factorial(ng)))
H_g: float = H_qg + H_ng_G_alpha + H_g_G_ng
# H_g_G_c
_, H_ngc_G_nc_alpha = _minimize_golden_section_log(
lambda alpha: _H_ngc_G_nc_alpha(contingency_table, alpha), min_alpha, max_alpha
)
H_ngc_G_nc_alpha = float(H_ngc_G_nc_alpha)
H_g_G_c_ngc: float = float(np.sum(_log_factorial(nc))) - float(
np.sum(_log_factorial(contingency_table.flatten()))
)
H_g_G_c: float = H_qg + H_ngc_G_nc_alpha + H_g_G_c_ngc
MI: float = H_g - H_g_G_c
return float(MI / np.log(2)) # Convert to bits (log base 2)
[docs]
def normalized_mutual_information(
input_data_1: ArrayLike | str,
input_data_2: ArrayLike | None = None,
*,
variation: str = "reduced",
normalization: str = "second",
) -> ArrayLike:
"""
Compute the normalized mutual information between two labelings from a pair of lists, the name of a space separated file of labels,
or a contingency table. Can specify the variation of mutual information and type of normalization.
For the asymmetric (default) normalization, the result is reported as a fraction of the entropy of the second labeling, which is considered the ground truth.
Raises AssertionError for invalid inputs.
Parameters
----------
input_data_1 : ArrayLike or str
First argument. This will either be a 2D array-like which specifies the contingency table whose columns are the first labeling and rows are the second labeling,
or a string which is the path to a file containing a list of pairs of labels,
or a 1-D array-like of labels.
input_data_2 : ArrayLike, optional
Second argument. This can only be a 1-D array-like of labels in the case where the first argument is also such a list.
variation : str, optional
Variation of mutual information to compute. Options are:
- "stirling": Stirling's approximation of the traditional mutual information, equal to the mutual information of the corresponding probability distributions (times the number of objects).
- "reduced" (default): Reduced mutual information (RMI), Dirichlet-multinomial reduction of https://arxiv.org/pdf/2405.05393
- "reduced_flat": Reduced mutual information (RMI), flat reduction of https://arxiv.org/pdf/1907.12581
- "adjusted": Adjusted mutual information (AMI), correcting for chance: https://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf
- "traditional": Traditional mutual information (MI), microcanonical
- "stirling": Stirling's approximation of the traditional mutual information, equal to the mutual information of the corresponding probability distributions (times the number of objects).
normalization : str, optional
Type of normalization to apply. Options are:
- "second" (default): Asymmetric normalization, measures how much the first labeling tells us about the second, as a fraction of all there is to know about the second labeling.
- "first": Asymmetric normalization, measures how much the second labeling tells us about the first, as a fraction of all there is to know about the first labeling.
- "mean": Symmetric normalization by the arithmetic mean of the two entropies.
- "min": Normalize by the minimum of the two entropies.
- "max": Normalize by the maximum of the two entropies.
- "geometric": Normalize by the geometric mean of the two entropies.
- "none": No normalization, returns the mutual information in bits.
Returns
-------
float
Normalized mutual information
"""
# Get the contingency table
contingency_table = _get_contingency_table(input_data_1, input_data_2)
# Make the candidate-candidate (labeling 1) and truth-truth (label 2) contingency tables
nc = np.sum(contingency_table, axis=0)
ng = np.sum(contingency_table, axis=1)
ncc = np.diag(nc) # Candidate-candidate
ngg = np.diag(ng) # Truth-truth
# Compute the mutual information between each pair of labelings
MI_c_g, MI_g_c, MI_c_c, MI_g_g = None, None, None, None # Values to be computed
if variation == "stirling":
MI_c_g = _stirling_mutual_information(
contingency_table
) # Mutual information between candidate and ground truth
MI_g_c = MI_c_g # Symmetric measure
MI_c_c = _stirling_mutual_information(
ncc
) # Mutual information between candidate and candidate
MI_g_g = _stirling_mutual_information(
ngg
) # Mutual information between ground truth and ground truth
elif variation == "traditional":
MI_c_g = _traditional_mutual_information(contingency_table)
MI_g_c = MI_c_g # Symmetric measure
MI_c_c = _traditional_mutual_information(ncc)
MI_g_g = _traditional_mutual_information(ngg)
elif variation == "adjusted":
MI_c_g = _adjusted_mutual_information(contingency_table)
MI_g_c = MI_c_g # Symmetric measure
MI_c_c = _adjusted_mutual_information(ncc)
MI_g_g = _adjusted_mutual_information(ngg)
elif variation == "reduced":
MI_c_g = _reduced_mutual_information(contingency_table)
MI_g_c = _reduced_mutual_information(
contingency_table.T
) # Transpose to get the ground truth as the first labeling
MI_c_c = _reduced_mutual_information(ncc)
MI_g_g = _reduced_mutual_information(ngg)
elif variation == "reduced_flat":
MI_c_g = _reduced_flat_mutual_information(contingency_table)
MI_g_c = _reduced_flat_mutual_information(contingency_table.T)
MI_c_c = _reduced_flat_mutual_information(ncc)
MI_g_g = _reduced_flat_mutual_information(ngg)
else:
raise ValueError(f"Unknown variation type: {variation}")
# Compute the normalized mutual information
if normalization == "second":
# Asymmetric normalization, measures how much the first labeling tells us about the second, as a fraction of all there is to know about the second labeling
return MI_c_g / MI_g_g if MI_g_g > 0 else 0
if normalization == "first":
# Asymmetric normalization, measures how much the second labeling tells us about the first, as a fraction of all there is to know about the first labeling
return MI_g_c / MI_c_c if MI_c_c > 0 else 0
if (
normalization == "mean"
): # Note that the numerators of these symmetric measures are non-standard in order to account for asymmetries in the calculated MI_c_g vs MI_g_c
return (MI_c_g + MI_g_c) / (MI_c_c + MI_g_g) if (MI_c_c + MI_g_g) > 0 else 0
if normalization == "min":
return (
min(MI_c_g, MI_g_c) / min(MI_c_c, MI_g_g) if min(MI_c_c, MI_g_g) > 0 else 0
)
if normalization == "max":
return (
max(MI_c_g, MI_g_c) / max(MI_c_c, MI_g_g) if max(MI_c_c, MI_g_g) > 0 else 0
)
if normalization == "geometric":
return (
np.sqrt(MI_c_g * MI_g_c) / np.sqrt(MI_c_c * MI_g_g)
if (MI_c_c * MI_g_g) > 0
else 0
)
if normalization == "none":
return MI_c_g # Return the mutual information in bits without normalization (note that this may not be symmetric for the reduced measures)
raise ValueError(f"Unknown normalization type: {normalization}")
[docs]
def mutual_information(
input_data_1: ArrayLike | str,
input_data_2: ArrayLike | None = None,
*,
variation: str = "reduced",
) -> ArrayLike:
"""
Compute the mutual information between two labelings from a pair of lists, the name of a space separated file of labels,
or a contingency table. Can specify the variation of mutual information to compute.
Raises AssertionError for invalid inputs.
Parameters
----------
input_data_1 : ArrayLike or str
First argument. This will either be a 2D array-like which specifies the contingency table whose columns are the first labeling and rows are the second labeling,
or a string which is the path to a file containing a list of pairs of labels,
or a 1-D array-like of labels.
input_data_2 : ArrayLike, optional
Second argument. This can only be a 1-D array-like of labels in the case where the first argument is also such a list.
variation : str, optional
Variation of mutual information to compute. Options are:
- "reduced" (default): Reduced mutual information (RMI), reduction of https://arxiv.org/pdf/2405.05393, note that this can be (slightly) asymmetric.
- "reduced_flat": Reduced mutual information (RMI), flat reduction of https://arxiv.org/pdf/1907.12581
- "adjusted": Adjusted mutual information (AMI), correcting for chance: https://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf
- "traditional": Traditional mutual information (MI), microcanonical
- "stirling": Stirling's approximation of the traditional mutual information, equal to the mutual information of the corresponding probability distributions (times the number of objects).
Returns
-------
float
Mutual information value in bits (base 2).
"""
return normalized_mutual_information(
input_data_1,
input_data_2,
variation=variation,
normalization="none",
)