Source code for pronoms.normalizers.vsn_normalizer

"""
VSN (Variance Stabilizing Normalization) Normalizer for proteomics data.

This module provides ``VSNNormalizer``, a pure-Python implementation of
Huber et al.'s VSN algorithm. Earlier releases of pronoms used the
Bioconductor ``vsn`` R package via ``rpy2``; that dependency has been
replaced by a vectorized NumPy/SciPy implementation in
``pronoms.normalizers._vsn_engine``. The public class signature is
preserved.
"""

from __future__ import annotations

import warnings
from typing import Any

import matplotlib.pyplot as plt
import numpy as np

from ..utils.plotting import plot_comparison_hexbin
from ..utils.validators import check_nan_inf, validate_input_data
from . import _vsn_engine


[docs] class VSNNormalizer: """Variance-stabilizing normalization (Huber et al., 2002). The fitted model is ``h_ij = arsinh(exp(beta_j) * y_ij + a_j)``, which behaves like ``log(y_ij)`` for large ``y_ij`` and is well-defined for zero or negative inputs. Per-sample offset ``a_j`` and log-scale factor ``beta_j`` are estimated by maximum profile likelihood with a least-trimmed-squares robustification step. The output is on a log2-comparable scale. Parameters ---------- calib : str, optional Calibration method, by default ``"affine"``. ``"affine"`` is the only supported value; other values raise ``ValueError``. reference_sample : Optional[int], optional **Deprecated.** Always treated as ``None`` regardless of value. Pass ``None`` (the default) to silence the deprecation warning. Will be removed in a future major release. lts_quantile : float, optional Quantile for the Least-Trimmed-Squares robust step, by default ``0.75``. Must lie in (0, 1]. ``1.0`` disables the robust step. Attributes ---------- vsn_params : Optional[dict] Fitted parameters as a plain Python dict. Only populated after ``normalize()``. Keys: ``coefficients`` : np.ndarray, shape ``(1, n_samples, 2)`` Per-sample (a, beta) coefficients in R-VSN's layout. Slice ``[0, :, 0]`` is ``a``; ``[0, :, 1]`` is ``beta = log(b)``. ``a`` : np.ndarray, shape ``(n_samples,)`` Per-sample offset. ``b_log`` : np.ndarray, shape ``(n_samples,)`` Per-sample log-scale factor; the actual scaling is ``np.exp(b_log)``. ``sigsq`` : float Profiled variance on the natural ``arsinh`` scale. ``hoffset`` : float Offset applied during transform to put output on a log2-like scale. ``mu`` : np.ndarray, shape ``(n_features,)`` Per-feature mean of the transformed data on the natural ``arsinh`` scale (rows trimmed by LTS in iterations >1 appear as ``NaN``, matching R's bookkeeping). ``converged`` : bool Whether the inner L-BFGS-B converged on the final LTS iteration. ``n_lts_iter`` : int Number of LTS iterations executed (capped at 7). """ def __init__( self, calib: str = "affine", reference_sample: int | None = None, lts_quantile: float = 0.75, ) -> None: if calib != "affine": raise ValueError(f"Only calib='affine' is supported by the native VSN engine; got {calib!r}.") self.calib = calib if reference_sample is not None: warnings.warn( "VSNNormalizer's 'reference_sample' parameter is deprecated and will " "be removed in a future major release. R-VSN's 'reference' argument " "expects a fitted vsn object, not a single-sample index, so this " "parameter cannot be implemented faithfully. Calls now ignore the " "value and use vsn2's default reference selection.", DeprecationWarning, stacklevel=2, ) self.reference_sample = reference_sample if not 0 < lts_quantile <= 1: raise ValueError("lts_quantile must be between 0 and 1") self.lts_quantile = lts_quantile self.vsn_params: dict[str, Any] | None = None
[docs] def normalize( self, X: np.ndarray, protein_ids: list[str] | None = None, sample_ids: list[str] | None = None, ) -> np.ndarray: """Fit VSN on ``X`` and return the transformed matrix. Parameters ---------- X : np.ndarray Input data with shape ``(n_samples, n_features)``. protein_ids : Optional[List[str]], optional Unused; retained for API compatibility with the previous R-backed implementation. sample_ids : Optional[List[str]], optional Unused; retained for API compatibility. Returns ------- np.ndarray Normalized data with the same shape as ``X``, on a log2-comparable (variance-stabilized) scale. Raises ------ ValueError If the input contains NaN or Inf, or if the native VSN engine fails to fit (e.g., fewer than 2 samples). """ del protein_ids, sample_ids # accepted for back-compat, not used X = validate_input_data(X) has_nan_inf, _ = check_nan_inf(X) if has_nan_inf: raise ValueError("Input data contains NaN or Inf values. Please handle these values before normalization.") # Native engine works in (n_features, n_samples) orientation. try: X_t = np.ascontiguousarray(X.T, dtype=np.float64) normalized_t, fit = _vsn_engine.fit_transform( X_t, lts_quantile=self.lts_quantile, ) except Exception as e: raise ValueError(f"VSN normalization failed: {e!s}") from e self.vsn_params = { "coefficients": fit.coefficients, "a": fit.a, "b_log": fit.b_log, "sigsq": fit.sigsq, "hoffset": fit.hoffset, "mu": fit.mu, "converged": fit.converged, "n_lts_iter": fit.n_lts_iter, } return np.ascontiguousarray(normalized_t.T)
[docs] def plot_comparison( self, before_data: np.ndarray, after_data: np.ndarray, figsize: tuple[int, int] = (8, 8), gridsize: int = 50, cmap: str = "viridis", ) -> plt.Figure: """Plot a hexbin comparison of raw vs. VSN-normalized intensities. Parameters ---------- before_data : np.ndarray Data before normalization. after_data : np.ndarray Data after normalization (output of ``normalize``). figsize : Tuple[int, int], optional gridsize : int, optional cmap : str, optional Returns ------- plt.Figure """ return plot_comparison_hexbin( before_data=before_data, after_data=after_data, title="VSN Normalization Comparison (glog2 vs log2)", figsize=figsize, gridsize=gridsize, cmap=cmap, transform_original="log2", log_axes=False, )