"""
Rank Normalizer for proteomics data.
This module provides a class for rank transformation normalization of proteomics data.
"""
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import rankdata
from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import check_nan_inf, validate_input_data
class RankNormalizer:
    """
    Normalizer that transforms each sample's values to their ranks.

    Each value in a sample (row) is replaced with its rank among that
    sample's features: the smallest value gets rank 1 and the largest gets
    rank N (the number of features). Tied values share the average of the
    ranks they span, which for a run of consecutive ranks is also the median
    rank of the tie group.

    Attributes
    ----------
    normalize_by_n : bool
        Whether to divide ranks by N (number of features) for comparability.
    ranks : np.ndarray | None
        The rank-transformed data. Only available after calling normalize().
    """

    def __init__(self, normalize_by_n: bool = False):
        """
        Initialize the RankNormalizer.

        Parameters
        ----------
        normalize_by_n : bool, optional
            If True, divide all ranks by N (number of features) to get values
            between 1/N and 1. This makes ranks comparable across datasets
            with different numbers of features. By default False.
        """
        self.normalize_by_n = normalize_by_n
        self.ranks: np.ndarray | None = None

    def normalize(self, X: np.ndarray) -> np.ndarray:
        """
        Perform rank transformation on input data X.

        Parameters
        ----------
        X : np.ndarray
            Input data matrix with shape (n_samples, n_features).
            Each row represents a sample, each column represents a feature/protein.

        Returns
        -------
        np.ndarray
            Rank-transformed data matrix with the same shape as X.
            Values range from 1 to N (or 1/N to 1 if normalize_by_n=True).

        Raises
        ------
        ValueError
            - If input is not a 2D array with at least one feature.
            - If input data contains NaN or Inf values.
        """
        # Dimensionality guard. np.ndim/np.shape also work on array-likes that
        # have not been converted yet, so the check cannot fail with an
        # AttributeError before validation has had a chance to run.
        if np.ndim(X) != 2 or np.shape(X)[1] == 0:
            raise ValueError("X must be a 2D array with at least one feature (n_samples, n_features).")

        # Validate input data (dtype conversion, etc.)
        X = validate_input_data(X)

        # Ranking is undefined for NaN/Inf, so refuse them up front.
        has_nan_inf, _ = check_nan_inf(X)
        if has_nan_inf:
            raise ValueError("Input data contains NaN or Inf values. Please handle these values before normalization.")

        n_features = X.shape[1]

        # Rank every sample (row) in one call: axis=1 ranks within each row,
        # and method="average" gives tied values the mean of their ranks.
        rank_data = np.asarray(rankdata(X, method="average", axis=1), dtype=float)

        if self.normalize_by_n:
            # Scale ranks into the [1/N, 1] range.
            rank_data = rank_data / n_features

        # Keep an independent copy so later edits to the returned array do
        # not alter the stored result.
        self.ranks = rank_data.copy()
        return rank_data

    @staticmethod
    def _x_axis(before_data: np.ndarray, log_axes: bool) -> tuple[np.ndarray, str, tuple[float, float]]:
        """Return the x-axis values, label and padded limits for the comparison plot."""
        if log_axes:
            # Add 1 so zeros map to 0 instead of -inf; values <= -1 still
            # produce NaN/-inf, which are excluded from the range below.
            with np.errstate(divide="ignore", invalid="ignore"):
                x_data = np.log10(before_data + 1)
            xlabel = "Log10(Original Value + 1)"
        else:
            x_data = before_data
            xlabel = "Original Value"

        # Only finite values may define the axis range; NaN/Inf limits are
        # not usable as plot bounds.
        finite = x_data[np.isfinite(x_data)]
        if finite.size == 0:
            raise ValueError("No finite values available to determine the x-axis range.")
        x_min = float(np.min(finite))
        x_max = float(np.max(finite))

        padding = (x_max - x_min) * 0.05  # 5% padding on each side
        return x_data, xlabel, (x_min - padding, x_max + padding)

    def _rank_axis(self, n_features: int) -> tuple[tuple[float, float], str]:
        """Return the y-axis limits and label matching this normalizer's rank scale."""
        if self.normalize_by_n:
            # Ranks were divided by N, so they lie in [1/N, 1]; leave one
            # rank step (1/N) of headroom, mirroring the integer-rank case.
            step = 1.0 / max(n_features, 1)
            return (0.0, 1.0 + step), f"Assigned Rank / N (1/{n_features} to 1)"
        return (0, n_features + 1), f"Assigned Rank (1 to {n_features})"

    def plot_comparison(
        self,
        before_data: np.ndarray,
        after_data: np.ndarray,
        figsize: tuple[int, int] = (10, 8),
        title: str = "Rank Normalization Comparison",
        log_axes: bool = False,
    ) -> "plt.Figure":
        """
        Plot data before vs after normalization using a 2D hexbin density plot.

        Parameters
        ----------
        before_data : np.ndarray
            Data before normalization, shape (n_samples, n_features).
        after_data : np.ndarray
            Data after normalization, shape (n_samples, n_features).
        figsize : tuple[int, int], optional
            Figure size, by default (10, 8).
        title : str, optional
            Plot title, by default "Rank Normalization Comparison".
        log_axes : bool, optional
            If True, plot log10(original value + 1) on the x-axis. If False
            (default), plot raw original values on the x-axis. The y-axis
            always shows the rank values passed in ``after_data``. Log scaling
            of the x-axis can help visualize data with wide dynamic ranges.

        Returns
        -------
        plt.Figure
            Figure object containing the hexbin density plot.

        Raises
        ------
        ValueError
            If ``before_data`` (after the optional log transform) contains no
            finite value from which an x-axis range can be derived.

        Notes
        -----
        The y-axis limits and label follow this instance's ``normalize_by_n``
        setting: integer ranks use ``(0, n_features + 1)`` and the label
        "Assigned Rank (1 to N)"; normalized ranks use
        ``(0, 1 + 1/n_features)`` and the label "Assigned Rank / N (1/N to 1)".
        ``after_data`` is assumed to have been produced with the same setting.
        """
        # Validate input data
        before_data = validate_input_data(before_data)
        after_data = validate_input_data(after_data)

        x_data, xlabel, xlim = self._x_axis(before_data, log_axes)

        # The number of features (N) fixes the extent of the rank axis.
        n_features = before_data.shape[1]
        ylim, ylabel = self._rank_axis(n_features)

        return create_hexbin_comparison(
            x_data,  # Raw or log-transformed original values, per log_axes
            after_data,
            figsize=figsize,
            title=title,
            xlabel=xlabel,
            ylabel=ylabel,
            log_axes=False,  # Any log transform has already been applied above
            xlim=xlim,
            ylim=ylim,
            autoscale_y=True,  # Allow y-axis to use its own scale
            add_identity_line=False,  # Identity line is not meaningful here
        )