# Source code for pronoms.normalizers.rank_normalizer

"""
Rank Normalizer for proteomics data.

This module provides a class for rank transformation normalization of proteomics data.
"""

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import rankdata

from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import check_nan_inf, validate_input_data


class RankNormalizer:
    """
    Normalizer that transforms each sample's values to their ranks.

    Each value in a sample (row) is replaced with its rank within that
    sample: the smallest value gets rank 1 and the largest gets rank N
    (the number of features). Tied values all receive the average of the
    ranks they span, which for a run of consecutive ranks equals the median
    rank of the tied group.

    Attributes
    ----------
    normalize_by_n : bool
        Whether to divide ranks by N (number of features) for comparability.
    ranks : np.ndarray | None
        The rank-transformed data. Only available after calling normalize().
    """

    def __init__(self, normalize_by_n: bool = False):
        """
        Initialize the RankNormalizer.

        Parameters
        ----------
        normalize_by_n : bool, optional
            If True, divide all ranks by N (number of features) to get values
            between 1/N and 1. This makes ranks comparable across datasets
            with different numbers of features. By default False.
        """
        self.normalize_by_n = normalize_by_n
        self.ranks: np.ndarray | None = None

    def normalize(self, X: np.ndarray) -> np.ndarray:
        """
        Perform rank transformation on input data X.

        Parameters
        ----------
        X : np.ndarray
            Input data matrix with shape (n_samples, n_features).
            Each row represents a sample, each column represents a
            feature/protein.

        Returns
        -------
        np.ndarray
            Rank-transformed data matrix with the same shape as X.
            Values range from 1 to N (or 1/N to 1 if normalize_by_n=True).
            A copy of the result is also stored on ``self.ranks``.

        Raises
        ------
        ValueError
            - If input is not a 2D array with at least one feature.
            - If input data contains NaN or Inf values.
        """
        # Dimensionality guard. np.ndim / np.shape are used instead of the
        # .ndim / .shape attributes so that array-likes without those
        # attributes (e.g. nested lists) hit the documented ValueError below
        # rather than an AttributeError before validation has run.
        if np.ndim(X) != 2 or np.shape(X)[1] == 0:
            raise ValueError("X must be a 2D array with at least one feature (n_samples, n_features).")

        # Validate input data (dtype conversion, etc.)
        X = validate_input_data(X)

        # Check for NaN or Inf values
        has_nan_inf, _ = check_nan_inf(X)
        if has_nan_inf:
            raise ValueError("Input data contains NaN or Inf values. Please handle these values before normalization.")

        n_features = X.shape[1]

        # Rank every sample (row) independently in a single call.
        # method="average" gives tied values the mean of the ranks they span.
        rank_data = rankdata(X, method="average", axis=1).astype(float, copy=False)

        if self.normalize_by_n:
            # Normalize ranks to the [1/N, 1] range
            rank_data = rank_data / n_features

        # Store a copy so later mutation of the returned array by the caller
        # does not silently change the cached result.
        self.ranks = rank_data.copy()

        return rank_data

    def plot_comparison(
        self,
        before_data: np.ndarray,
        after_data: np.ndarray,
        figsize: tuple[int, int] = (10, 8),
        title: str = "Rank Normalization Comparison",
        log_axes: bool = False,
    ) -> "plt.Figure":
        """
        Plot data before vs after normalization using a 2D hexbin density plot.

        Parameters
        ----------
        before_data : np.ndarray
            Data before normalization, shape (n_samples, n_features).
        after_data : np.ndarray
            Data after normalization, shape (n_samples, n_features).
        figsize : Tuple[int, int], optional
            Figure size, by default (10, 8).
        title : str, optional
            Plot title, by default "Rank Normalization Comparison".
        log_axes : bool, optional
            If True, plot log10(original value + 1) on the x-axis.
            If False (default), plot raw original values on the x-axis.
            The y-axis always shows the rank values from the normalization.
            Log scaling of the x-axis can help visualize data with wide
            dynamic ranges.

        Returns
        -------
        plt.Figure
            Figure object containing the hexbin density plot.

        Raises
        ------
        ValueError
            If ``before_data`` has no finite values from which an x-axis
            range can be derived.

        Notes
        -----
        The y-axis limits and label follow this normalizer's
        ``normalize_by_n`` setting: integer ranks are drawn on
        ``(0, n_features + 1)`` and labelled "Assigned Rank (1 to N)";
        ranks divided by N are drawn on ``(0, 1 + 1/n_features)`` and
        labelled accordingly. ``after_data`` is therefore expected to be
        output produced with the same ``normalize_by_n`` setting.
        """
        # Validate input data
        before_data = validate_input_data(before_data)
        after_data = validate_input_data(after_data)

        # Prepare x-axis data based on log_axes parameter
        if log_axes:
            # Add 1 before the log so zeros map to 0; values <= -1 yield
            # -inf/NaN, which are excluded from the axis range below.
            with np.errstate(divide="ignore", invalid="ignore"):
                x_data = np.log10(before_data + 1)
            xlabel = "Log10(Original Value + 1)"
        else:
            # Use raw original data for the x-axis
            x_data = before_data
            xlabel = "Original Value"

        # Derive the x-axis range from finite values only, in both branches,
        # so a stray NaN/Inf cannot turn the limits into NaN/Inf.
        finite = x_data[np.isfinite(x_data)]
        if finite.size == 0:
            raise ValueError("before_data contains no finite values to plot.")
        x_min = float(np.min(finite))
        x_max = float(np.max(finite))

        # Add padding to x-axis range
        padding = (x_max - x_min) * 0.05  # 5% padding
        xlim = (x_min - padding, x_max + padding)

        # Get the number of features (N) for setting y-axis limits
        n_features = before_data.shape[1]

        # Match y-axis limits and label to the scale of the plotted ranks.
        if self.normalize_by_n:
            # Ranks were divided by N, so scale the integer-rank limits too.
            ylim = (0.0, (n_features + 1) / n_features)
            ylabel = f"Assigned Rank / N (1/{n_features} to 1)"
        else:
            ylim = (0, n_features + 1)
            ylabel = f"Assigned Rank (1 to {n_features})"

        return create_hexbin_comparison(
            x_data,  # Already log-transformed when log_axes=True, raw otherwise
            after_data,
            figsize=figsize,
            title=title,
            xlabel=xlabel,
            ylabel=ylabel,
            log_axes=False,  # Data is already transformed
            xlim=xlim,
            ylim=ylim,
            autoscale_y=True,  # Allow y-axis to use its own scale
            add_identity_line=False,  # Identity line is not meaningful here
        )