Source code for pronoms.normalizers.splm_normalizer

import matplotlib.pyplot as plt
import numpy as np

from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import check_nan_inf, validate_input_data


[docs] class SPLMNormalizer: """ Normalizer based on Stable Protein Log-Mean Normalization (SPLM-Norm). Scales proteomics intensity data using a subset of stably expressed proteins (lowest coefficient of variation, computed in linear space as ``std(X) / mean(X)`` per protein across samples). It uses the mean of log-transformed intensities of these stable proteins per sample to define scaling factors, performs normalization in log-space, recenters, and then transforms back to the original scale. Attributes ---------- num_stable_proteins : int Number of stable proteins used for calculating scaling factors. epsilon : float Small constant added before log transformation to avoid log(0). stable_protein_indices : Optional[np.ndarray] Indices of the proteins identified as stable. Available after normalize(). log_scaling_factors : Optional[np.ndarray] The per-sample log-space scaling factors derived from stable proteins. Available after normalize(). grand_mean_log_scaling_factor : Optional[float] The mean of the log_scaling_factors across all samples. Available after normalize(). """ def __init__(self, num_stable_proteins: int = 100, epsilon: float = 1e-6): """ Initialize the SPLMNormalizer. Parameters ---------- num_stable_proteins : int, optional Number of proteins with the lowest linear-space CV (``std/mean``) to use as stable references, by default 100. epsilon : float, optional Small constant added to intensities before log transformation to avoid log(0), by default 1e-6. """ if not isinstance(num_stable_proteins, int) or num_stable_proteins <= 0: raise ValueError("num_stable_proteins must be a positive integer.") if not isinstance(epsilon, (int, float)) or epsilon < 0: raise ValueError("epsilon must be a non-negative number.") self.num_stable_proteins = num_stable_proteins self.epsilon = epsilon self.stable_protein_indices: np.ndarray | None = None self.log_scaling_factors: np.ndarray | None = None self.grand_mean_log_scaling_factor: float | None = None
[docs] def normalize(self, X: np.ndarray) -> np.ndarray: """ Perform Stable Protein Log-Mean Normalization on input data X. Parameters ---------- X : np.ndarray Input data matrix with shape (n_samples, n_features). Each row represents a sample, each column represents a feature/protein. Returns ------- np.ndarray Normalized data matrix with the same shape as X. Raises ------ ValueError - If input is not a 2D array with at least one feature. - If input data contains NaN or Inf values. - If num_stable_proteins is greater than the number of features in X. - If stable proteins cannot be determined (e.g., all proteins have zero variance). """ # 1. Validate input X = validate_input_data(X) if X.ndim != 2 or X.shape[1] == 0: raise ValueError("X must be a 2D array with at least one feature (n_samples, n_features).") has_nan_inf, _ = check_nan_inf(X) if has_nan_inf: raise ValueError("Input data contains NaN or Inf values.") if self.num_stable_proteins > X.shape[1]: raise ValueError( f"num_stable_proteins ({self.num_stable_proteins}) " f"cannot be greater than the number of features ({X.shape[1]})." ) # 2. Compute per-protein CV in linear space (std/mean across samples). # Constant proteins (std == 0) get CV=0 and are most preferred. # Proteins with zero mean would give an undefined CV; treat those as +inf # so they are deprioritized rather than crashing the selection. means = np.mean(X, axis=0) stds = np.std(X, axis=0) cvs = np.full_like(means, np.inf) nonzero_mean = means != 0 cvs[nonzero_mean] = stds[nonzero_mean] / means[nonzero_mean] cvs[stds == 0] = 0.0 if np.all(np.isinf(cvs)): raise ValueError("Could not compute valid CVs for any protein. Check input data variance.") # 3. Select stable reference proteins (smallest CVs). partitioned_indices = np.argpartition(cvs, self.num_stable_proteins - 1) self.stable_protein_indices = np.sort(partitioned_indices[: self.num_stable_proteins]) # 4. Log-transform for the centering step. # Epsilon guards against log(0) when intensities reach the floor. X_log = np.log(X + self.epsilon) # 5. Compute sample-wise log-scaling factors stable_log_data = X_log[:, self.stable_protein_indices] self.log_scaling_factors = np.mean(stable_log_data, axis=1) # 6. Normalize in log space # Reshape factors for broadcasting (n_samples,) -> (n_samples, 1) log_factors_reshaped = self.log_scaling_factors[:, np.newaxis] X_log_norm = X_log - log_factors_reshaped # 7. Recenter the data self.grand_mean_log_scaling_factor = float(np.mean(self.log_scaling_factors)) X_log_recentered = X_log_norm + self.grand_mean_log_scaling_factor # 8. Back-transform to linear space X_norm = np.exp(X_log_recentered) - self.epsilon # Ensure non-negativity after subtracting epsilon X_norm = np.maximum(0, X_norm) return X_norm
[docs] def plot_comparison( self, before_data: np.ndarray, after_data: np.ndarray, figsize: tuple[int, int] = (10, 8), title: str = "SPLM Normalization Comparison", ) -> plt.Figure: """ Plot data before vs after normalization using a 2D hexbin density plot. Parameters ---------- before_data : np.ndarray Data before normalization, shape (n_samples, n_features). after_data : np.ndarray Data after normalization, shape (n_samples, n_features). figsize : Tuple[int, int], optional Figure size, by default (10, 8). title : str, optional Plot title, by default "SPLM Normalization Comparison". Returns ------- plt.Figure Figure object containing the hexbin density plot. """ before_data = validate_input_data(before_data) after_data = validate_input_data(after_data) fig = create_hexbin_comparison( before_data, after_data, figsize=figsize, title=title, xlabel="Before SPLM Normalization", ylabel="After SPLM Normalization", ) return fig