Source code for pronoms.normalizers.mad_normalizer

import warnings

import matplotlib.pyplot as plt
import numpy as np

from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import check_nan_inf, validate_input_data

# Consistency constant making MAD a σ-equivalent estimator under normality
# (1 / Φ⁻¹(0.75) ≈ 1.4826). Matches R's ``mad()`` default.
MAD_SIGMA_CONSTANT = 1.4826

# Unique sentinel used as the default of ``MADNormalizer.__init__``'s
# ``scale_to_sigma`` parameter. Comparing against it by identity lets the
# constructor tell "argument omitted" apart from an explicit ``False`` so it
# can emit the DeprecationWarning only in the former case.
_SCALE_DEFAULT_UNSPECIFIED = object()


class MADNormalizer:
    """
    Robust per-sample normalizer built on the Median Absolute Deviation (MAD).

    Each sample (row) is shifted by its own median and divided by a multiple
    of its own MAD. By default the statistics are computed on
    log2-transformed data, which stabilizes variance for typical intensity
    distributions.

    * ``log_transform=True`` (default): median and MAD are taken on
      ``log2(X + 1)`` and the result is
      ``(log2(X + 1) - median_log) / (k * MAD_log)``.
    * ``log_transform=False``: median and MAD are taken on ``X`` directly and
      the result is ``(X - median) / (k * MAD)``.

    The factor ``k`` is selected by ``scale_to_sigma``:

    * ``scale_to_sigma=True``: ``k = 1.4826`` (``MAD_SIGMA_CONSTANT``). The
      output is a robust z-score whose per-row spread is ≈ 1 σ for normal
      data. Matches R's ``mad()`` default and
      ``statsmodels.robust.scale.mad``.
    * ``scale_to_sigma=False``: ``k = 1`` (raw MAD divisor). The per-row
      spread is ≈ 1.4826 × that of a true robust z-score.

    Notes
    -----
    Omitting ``scale_to_sigma`` is deprecated: it emits a
    ``DeprecationWarning`` and currently falls back to raw MAD. A future
    major release will switch the implicit default to
    ``scale_to_sigma=True``, so pass the argument explicitly to pin the
    behavior you want.

    Attributes
    ----------
    log_transform : bool
        Whether the statistics are computed on ``log2(X + 1)``.
    scale_to_sigma : bool
        Whether the divisor is ``MAD_SIGMA_CONSTANT * MAD`` instead of the
        raw MAD.
    row_medians : np.ndarray or None
        Per-sample medians from the most recent successful ``normalize``
        call, on the scale that was actually used; ``None`` before that.
    row_mads : np.ndarray or None
        Per-sample raw (unscaled) MADs from the most recent successful
        ``normalize`` call, regardless of ``scale_to_sigma``; ``None``
        before that.
    """

    def __init__(self, log_transform: bool = True, scale_to_sigma: bool = _SCALE_DEFAULT_UNSPECIFIED):  # type: ignore[assignment]
        """
        Configure the normalizer.

        Parameters
        ----------
        log_transform : bool, optional
            Compute median and MAD on ``log2(X + 1)`` instead of ``X``.
            Defaults to True.
        scale_to_sigma : bool, optional
            True divides by ``1.4826 * MAD`` (robust z-score); False divides
            by the raw MAD. When the argument is left out, a
            ``DeprecationWarning`` is issued and False is used.
        """
        argument_omitted = scale_to_sigma is _SCALE_DEFAULT_UNSPECIFIED
        if argument_omitted:
            # stacklevel=2 attributes the warning to the code constructing
            # the normalizer rather than to this constructor.
            warnings.warn(
                "MADNormalizer's default scaling will change in a future major "
                "release: the divisor will become 1.4826 * MAD (σ-equivalent, "
                "matching R's mad()) instead of raw MAD. Pass "
                "scale_to_sigma=True for the new behavior or "
                "scale_to_sigma=False to keep raw MAD.",
                DeprecationWarning,
                stacklevel=2,
            )

        self.log_transform = log_transform
        self.scale_to_sigma = False if argument_omitted else bool(scale_to_sigma)

        # Populated by normalize(); None until the first successful call.
        self.row_medians: np.ndarray | None = None
        self.row_mads: np.ndarray | None = None

    def normalize(self, X: np.ndarray) -> np.ndarray:
        """
        Center every row on its median and scale it by its MAD.

        Parameters
        ----------
        X : np.ndarray
            Data matrix of shape (n_samples, n_features). Values must be
            non-negative when ``log_transform`` is True.

        Returns
        -------
        np.ndarray
            The normalized matrix, on the log2(X+1) scale when
            ``log_transform`` is True and on the original scale otherwise.

        Raises
        ------
        ValueError
            - The validated input is not 2D or has zero features.
            - The input contains NaN or Inf values.
            - ``log_transform`` is True and the input has negative values,
              or the log2 transform yields non-finite values.
            - Any sample has a MAD of exactly zero.
        """
        matrix = validate_input_data(X)
        if not (matrix.ndim == 2 and matrix.shape[1] > 0):
            raise ValueError("X must be a 2D array with at least one feature (n_samples, n_features).")

        # NaN/Inf screening happens on the untransformed values.
        contains_bad_values, _ = check_nan_inf(matrix)
        if contains_bad_values:
            raise ValueError("Input data contains NaN or Inf values. Please handle these values before normalization.")

        if self.log_transform:
            if np.any(matrix < 0):
                raise ValueError("Input data contains negative values. Log2 transformation cannot be applied.")
            # Silence NumPy's own warnings; problems are reported through
            # the explicit finiteness check right below.
            with np.errstate(divide="ignore", invalid="ignore"):
                working = np.log2(matrix + 1)
            if not np.all(np.isfinite(working)):
                raise ValueError(
                    "Non-finite values encountered after log2 transformation. Check input data near 0 or -1."
                )
            scale_label = "log2(X+1)"
        else:
            working = matrix
            scale_label = "original"

        # Per-row statistics on whichever scale was selected above.
        medians = np.median(working, axis=1)
        centered = working - medians[:, np.newaxis]
        mads = np.median(np.abs(centered), axis=1)

        # A zero MAD would mean dividing by zero, so refuse before touching
        # any stored state.
        degenerate_rows = np.flatnonzero(mads == 0)
        if degenerate_rows.size > 0:
            indices_repr = degenerate_rows.tolist()
            raise ValueError(
                f"Cannot normalize: MAD of {scale_label} data is zero "
                f"for sample(s) at index/indices: {indices_repr}. "
                f"This usually means all {scale_label} values in the sample are identical."
            )

        # Keep the medians and the *raw* MADs of the scale that was used.
        self.row_medians = medians
        self.row_mads = mads

        # k * MAD, with k the σ-consistency constant when requested.
        divisor = mads[:, np.newaxis]
        if self.scale_to_sigma:
            divisor = divisor * MAD_SIGMA_CONSTANT

        return centered / divisor

    def plot_comparison(
        self,
        before_data: np.ndarray,
        after_data: np.ndarray,
        figsize: tuple[int, int] = (10, 8),
        title: str = "MAD Normalization Comparison",
    ) -> plt.Figure:
        """
        Draw a 2D hexbin density plot of the data before vs after normalization.

        Parameters
        ----------
        before_data : np.ndarray
            Data before normalization, shape (n_samples, n_features).
        after_data : np.ndarray
            Data after normalization, shape (n_samples, n_features).
        figsize : tuple[int, int], optional
            Figure size. Defaults to (10, 8).
        title : str, optional
            Plot title. Defaults to "MAD Normalization Comparison".

        Returns
        -------
        plt.Figure
            The figure produced by ``create_hexbin_comparison``.
        """
        if self.log_transform:
            scale_description = "Standardized Log2 Scale"
        else:
            scale_description = "Standardized Original Scale"

        # The output is centered around 0 in both modes, so request a y=0
        # reference line and y autoscaling instead of an identity line.
        return create_hexbin_comparison(
            before_data,
            after_data,
            figsize=figsize,
            title=title,
            xlabel="Original Data",
            ylabel=f"After MAD Normalization ({scale_description})",
            autoscale_y=True,
            add_identity_line=False,
            add_center_line_y0=True,
        )