# Source code for pronoms.normalizers.quantile_normalizer

"""
Quantile Normalizer for proteomics data.

This module provides a class for quantile normalization of proteomics data.
"""

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import rankdata

from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import check_nan_inf, validate_input_data



# [docs]  (documentation-link marker left over from the rendered source page; not code)
class QuantileNormalizer:
    """
    Normalizer that performs quantile normalization across samples.

    Quantile normalization makes the distribution of intensities for each sample
    identical by replacing each value with the mean of the corresponding quantiles
    across all samples.

    Tied values within a row receive the same normalized value: the reference
    distribution linearly interpolated at their average rank. For a two-way tie
    this equals the mean of the two reference values at the tied ranks. For
    larger tie groups it is the reference value (or the midpoint of the two
    neighbouring reference values) at the centre of the tied rank range, which
    is not necessarily the mean over every tied rank. Tie handling follows
    Bolstad et al. (2003).

    Attributes
    ----------
    reference_distribution : Optional[np.ndarray]
        The reference distribution used for normalization, one value per
        feature rank. Only available after calling normalize(); None before.
    """

    def __init__(self):
        """Initialize the QuantileNormalizer with no reference distribution."""
        # Populated by normalize(); plot_comparison() only checks it for None.
        self.reference_distribution = None

    def normalize(self, X: np.ndarray) -> np.ndarray:
        """
        Perform quantile normalization on input data X.

        The reference distribution computed from X is stored on the instance
        as ``reference_distribution``, replacing any previous value.

        Parameters
        ----------
        X : np.ndarray
            Input data matrix with shape (n_samples, n_features).
            Each row represents a sample, each column represents a feature/protein.

        Returns
        -------
        np.ndarray
            Normalized data matrix with the same shape as X.

        Raises
        ------
        ValueError
            If input data contains NaN or Inf values.
        """
        # Validate input data (the helper's exact checks live in utils.validators).
        X = validate_input_data(X)

        # Sorting and ranking are meaningless with NaN/Inf, so reject them up front.
        has_nan_inf, _ = check_nan_inf(X)
        if has_nan_inf:
            raise ValueError("Input data contains NaN or Inf values. Please handle these values before normalization.")

        n_features = X.shape[1]

        # Reference distribution: sort every sample independently, then average
        # position-wise across samples, giving one value per rank 1..n_features.
        reference = np.mean(np.sort(X, axis=1), axis=0)
        self.reference_distribution = reference

        # Tie-aware mapping: average ranks (1..N) -> reference values.
        # Average ranks are integers or half-integers, so linear interpolation
        # returns either ref[k] or the midpoint of two adjacent reference
        # values, e.g. rank 1.5 -> (ref[0] + ref[1]) / 2.
        ranks = rankdata(X, method="average", axis=1)
        rank_positions = np.arange(1, n_features + 1)
        normalized_data = np.interp(ranks, rank_positions, reference)

        return normalized_data

    def plot_comparison(
        self,
        before_data: np.ndarray,
        after_data: np.ndarray,
        figsize: tuple[int, int] = (10, 8),
        title: str = "Quantile Normalization Comparison",
    ) -> plt.Figure:
        """
        Plot data before vs after normalization using a 2D hexbin density plot.

        When normalize() has already been called on this instance, a short
        explanatory note is added to the bottom-left corner of the figure.

        Parameters
        ----------
        before_data : np.ndarray
            Data before normalization, shape (n_samples, n_features).
        after_data : np.ndarray
            Data after normalization, shape (n_samples, n_features).
        figsize : Tuple[int, int], optional
            Figure size, by default (10, 8).
        title : str, optional
            Plot title, by default "Quantile Normalization Comparison".

        Returns
        -------
        plt.Figure
            Figure object containing the hexbin density plot.
        """
        # Validate input data
        before_data = validate_input_data(before_data)
        after_data = validate_input_data(after_data)

        # Create hexbin comparison plot
        fig = create_hexbin_comparison(
            before_data,
            after_data,
            figsize=figsize,
            title=title,
            xlabel="Before Quantile Normalization",
            ylabel="After Quantile Normalization",
        )

        # If reference distribution is available, add a note about it.
        # The note is attached to the returned figure itself rather than via
        # plt.figtext, which targets pyplot's *current* figure and could land
        # on an unrelated figure if `fig` is not the current one.
        if self.reference_distribution is not None:
            fig.text(
                0.01,
                0.01,
                "Quantile normalization transforms all samples\nto match a common reference distribution.",
                fontsize=9,
                bbox={"boxstyle": "round,pad=0.3", "fc": "white", "alpha": 0.8},
            )

        return fig