# Source code for pronoms.normalizers.rank_normalizer

"""
Rank Normalizer for proteomics data.

This module provides a class for rank transformation normalization of proteomics data.
"""

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import rankdata

from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import check_nan_inf, validate_input_data



# NOTE: a stray "[docs]" Sphinx viewcode link marker stood here; it is extraction residue, not code.
class RankNormalizer:
    """
    Normalizer that transforms each sample's values to their ranks.

    This normalizer replaces each value in a sample (row) with its rank, where
    the smallest value gets rank 1 and the largest gets rank N (number of
    features). Tied values share the average rank of their tie group; because
    tied values occupy consecutive positions, this is also the median rank of
    the group.

    Attributes
    ----------
    normalize_by_n : bool
        Whether to divide ranks by N (number of features) for comparability.
    ranks : np.ndarray | None
        The rank-transformed data from the most recent normalize() call.
        None until normalize() has been called.
    """

    def __init__(self, normalize_by_n: bool = False):
        """
        Initialize the RankNormalizer.

        Parameters
        ----------
        normalize_by_n : bool, optional
            If True, divide all ranks by N (number of features) to get values
            between 1/N and 1. This makes ranks comparable across datasets
            with different numbers of features. By default False.
        """
        self.normalize_by_n = normalize_by_n
        self.ranks: np.ndarray | None = None

    def normalize(self, X: np.ndarray) -> np.ndarray:
        """
        Perform rank transformation on input data X.

        Parameters
        ----------
        X : np.ndarray
            Input data matrix with shape (n_samples, n_features).
            Each row represents a sample, each column represents a feature/protein.

        Returns
        -------
        np.ndarray
            Rank-transformed data matrix with the same shape as X.
            Values range from 1 to N (or 1/N to 1 if normalize_by_n=True).
            A copy of the result is also stored in ``self.ranks``.

        Raises
        ------
        ValueError
            - If input is not a 2D array with at least one feature.
            - If input data contains NaN or Inf values.
        """
        # Dimensionality guard
        if X.ndim != 2 or X.shape[1] == 0:
            raise ValueError("X must be a 2D array with at least one feature (n_samples, n_features).")

        # Validate input data (dtype conversion, etc.)
        X = validate_input_data(X)

        # Ranks are undefined for NaN/Inf, so refuse them up front
        has_nan_inf, _ = check_nan_inf(X)
        if has_nan_inf:
            raise ValueError("Input data contains NaN or Inf values. Please handle these values before normalization.")

        n_features = X.shape[1]

        # Rank every sample (row) independently in a single call; axis=1 ranks
        # along the feature axis and method="average" gives tied values the
        # mean of the ranks they span.
        rank_data = np.asarray(rankdata(X, method="average", axis=1), dtype=float)

        if self.normalize_by_n:
            # Rescale ranks from [1, N] to [1/N, 1]
            rank_data = rank_data / n_features

        # Keep an independent copy so that callers mutating the returned array
        # do not alter the stored ranks.
        self.ranks = rank_data.copy()

        return rank_data

    def _rank_axis_spec(self, n_features: int) -> tuple[tuple[float, float], str]:
        """Return ``(ylim, ylabel)`` matching the scale of this normalizer's ranks."""
        if self.normalize_by_n:
            # Ranks were divided by N, so they lie in [1/N, 1]; leave one rank
            # step (1/N) of headroom, mirroring the integer-rank case.
            return (0.0, 1.0 + 1.0 / n_features), f"Assigned Rank / N (1/{n_features} to 1)"
        return (0, n_features + 1), f"Assigned Rank (1 to {n_features})"

    def plot_comparison(
        self,
        before_data: np.ndarray,
        after_data: np.ndarray,
        figsize: tuple[int, int] = (10, 8),
        title: str = "Rank Normalization Comparison",
        log_axes: bool = False,
    ) -> "plt.Figure":
        """
        Plot data before vs after normalization using a 2D hexbin density plot.

        Parameters
        ----------
        before_data : np.ndarray
            Data before normalization, shape (n_samples, n_features).
        after_data : np.ndarray
            Data after normalization, shape (n_samples, n_features).
        figsize : Tuple[int, int], optional
            Figure size, by default (10, 8).
        title : str, optional
            Plot title, by default "Rank Normalization Comparison".
        log_axes : bool, optional
            If True, plot log10(original value + 1) on the x-axis. If False
            (default), plot raw original values on the x-axis. The y-axis
            always shows the rank values from the normalization. Log scaling
            of the x-axis can help visualize data with wide dynamic ranges.

        Returns
        -------
        plt.Figure
            Figure object containing the hexbin density plot.

        Raises
        ------
        ValueError
            If ``log_axes`` is True and no value of ``before_data`` has a
            finite log10(x + 1), so no x-axis range can be derived.

        Notes
        -----
        The y-axis limits and label follow this instance's ``normalize_by_n``
        setting: ``(0, n_features + 1)`` with label "Assigned Rank (1 to N)"
        for integer ranks, and ``(0, 1 + 1/n_features)`` with label
        "Assigned Rank / N (1/N to 1)" for ranks divided by N. ``after_data``
        is therefore expected to come from this normalizer's ``normalize()``.
        """
        # Validate input data
        before_data = validate_input_data(before_data)
        after_data = validate_input_data(after_data)

        if log_axes:
            # Shift by 1 so zeros map to 0 instead of -inf. Values <= -1 still
            # produce -inf/NaN; the warnings are silenced and those entries
            # are excluded when deriving the axis range below.
            with np.errstate(divide="ignore", invalid="ignore"):
                x_data = np.log10(before_data + 1)
            xlabel = "Log10(Original Value + 1)"

            finite = x_data[np.isfinite(x_data)]
            if finite.size == 0:
                raise ValueError(
                    "before_data has no finite values after log10(x + 1); cannot determine x-axis limits."
                )
            x_min = float(np.min(finite))
            x_max = float(np.max(finite))
        else:
            # Use raw original data for the x-axis
            x_data = before_data
            xlabel = "Original Value"

            x_min = float(np.min(x_data))
            x_max = float(np.max(x_data))

        # Pad the x-axis range by 5% on each side
        padding = (x_max - x_min) * 0.05
        xlim = (x_min - padding, x_max + padding)

        # Number of features (N) determines the rank scale on the y-axis
        n_features = before_data.shape[1]
        ylim, ylabel = self._rank_axis_spec(n_features)

        return create_hexbin_comparison(
            x_data,  # Raw or log10(x + 1) values, depending on log_axes
            after_data,
            figsize=figsize,
            title=title,
            xlabel=xlabel,
            ylabel=ylabel,
            log_axes=False,  # Any log transform was already applied above
            xlim=xlim,
            ylim=ylim,
            autoscale_y=True,  # Allow y-axis to use its own scale
            add_identity_line=False,  # Identity line is not meaningful here
        )