Source code for pronoms.normalizers.vsn_normalizer

"""
VSN (Variance Stabilizing Normalization) Normalizer for proteomics data.

This module provides ``VSNNormalizer``, a pure-Python implementation of
Huber et al.'s VSN algorithm. Earlier releases of pronoms used the
Bioconductor ``vsn`` R package via ``rpy2``; that dependency has been
replaced by a vectorized NumPy/SciPy implementation in
``pronoms.normalizers._vsn_engine``. The public class signature is
preserved.
"""

from __future__ import annotations

import warnings
from typing import Any

import matplotlib.pyplot as plt
import numpy as np

from ..utils.plotting import plot_comparison_hexbin
from ..utils.validators import check_nan_inf, validate_input_data
from . import _vsn_engine



[docs]
class VSNNormalizer:
    """Variance-stabilizing normalization (Huber et al., 2002).

    The fitted model is ``h_ij = arsinh(exp(beta_j) * y_ij + a_j)``, which
    behaves like ``log(y_ij)`` for large ``y_ij`` and is well-defined for
    zero or negative inputs. Per-sample offset ``a_j`` and log-scale
    factor ``beta_j`` are estimated by maximum profile likelihood with a
    least-trimmed-squares robustification step. The output is on a
    log2-comparable scale.

    Parameters
    ----------
    calib : str, optional
        Calibration method, by default ``"affine"``. ``"affine"`` is the
        only supported value; other values raise ``ValueError``.
    reference_sample : Optional[int], optional
        **Deprecated.** Always treated as ``None`` regardless of value.
        Pass ``None`` (the default) to silence the deprecation warning.
        Will be removed in a future major release.
    lts_quantile : float, optional
        Quantile for the Least-Trimmed-Squares robust step, by default
        ``0.75``. Must lie in (0, 1]. ``1.0`` disables the robust step.

    Attributes
    ----------
    vsn_params : Optional[dict]
        Fitted parameters as a plain Python dict. Only populated after
        ``normalize()``. Keys:

        ``coefficients`` : np.ndarray, shape ``(1, n_samples, 2)``
            Per-sample (a, beta) coefficients in R-VSN's layout. Slice
            ``[0, :, 0]`` is ``a``; ``[0, :, 1]`` is ``beta = log(b)``.
        ``a`` : np.ndarray, shape ``(n_samples,)``
            Per-sample offset.
        ``b_log`` : np.ndarray, shape ``(n_samples,)``
            Per-sample log-scale factor; the actual scaling is
            ``np.exp(b_log)``.
        ``sigsq`` : float
            Profiled variance on the natural ``arsinh`` scale.
        ``hoffset`` : float
            Offset applied during transform to put output on a log2-like
            scale.
        ``mu`` : np.ndarray, shape ``(n_features,)``
            Per-feature mean of the transformed data on the natural
            ``arsinh`` scale (rows trimmed by LTS in iterations >1
            appear as ``NaN``, matching R's bookkeeping).
        ``converged`` : bool
            Whether the inner L-BFGS-B converged on the final LTS
            iteration.
        ``n_lts_iter`` : int
            Number of LTS iterations executed (capped at 7).
    """

    def __init__(
        self,
        calib: str = "affine",
        reference_sample: int | None = None,
        lts_quantile: float = 0.75,
    ) -> None:
        if calib != "affine":
            raise ValueError(f"Only calib='affine' is supported by the native VSN engine; got {calib!r}.")
        self.calib = calib
        if reference_sample is not None:
            warnings.warn(
                "VSNNormalizer's 'reference_sample' parameter is deprecated and will "
                "be removed in a future major release. R-VSN's 'reference' argument "
                "expects a fitted vsn object, not a single-sample index, so this "
                "parameter cannot be implemented faithfully. Calls now ignore the "
                "value and use vsn2's default reference selection.",
                DeprecationWarning,
                stacklevel=2,
            )
        self.reference_sample = reference_sample
        if not 0 < lts_quantile <= 1:
            raise ValueError("lts_quantile must be between 0 and 1")
        self.lts_quantile = lts_quantile
        self.vsn_params: dict[str, Any] | None = None


[docs]
    def normalize(
        self,
        X: np.ndarray,
        protein_ids: list[str] | None = None,
        sample_ids: list[str] | None = None,
    ) -> np.ndarray:
        """Fit VSN on ``X`` and return the transformed matrix.

        Parameters
        ----------
        X : np.ndarray
            Input data with shape ``(n_samples, n_features)``.
        protein_ids : Optional[List[str]], optional
            Unused; retained for API compatibility with the previous
            R-backed implementation.
        sample_ids : Optional[List[str]], optional
            Unused; retained for API compatibility.

        Returns
        -------
        np.ndarray
            Normalized data with the same shape as ``X``, on a
            log2-comparable (variance-stabilized) scale.

        Raises
        ------
        ValueError
            If the input contains NaN or Inf, or if the native VSN
            engine fails to fit (e.g., fewer than 2 samples).
        """
        del protein_ids, sample_ids  # accepted for back-compat, not used
        X = validate_input_data(X)
        has_nan_inf, _ = check_nan_inf(X)
        if has_nan_inf:
            raise ValueError("Input data contains NaN or Inf values. Please handle these values before normalization.")

        # Native engine works in (n_features, n_samples) orientation.
        try:
            X_t = np.ascontiguousarray(X.T, dtype=np.float64)
            normalized_t, fit = _vsn_engine.fit_transform(
                X_t,
                lts_quantile=self.lts_quantile,
            )
        except Exception as e:
            raise ValueError(f"VSN normalization failed: {e!s}") from e

        self.vsn_params = {
            "coefficients": fit.coefficients,
            "a": fit.a,
            "b_log": fit.b_log,
            "sigsq": fit.sigsq,
            "hoffset": fit.hoffset,
            "mu": fit.mu,
            "converged": fit.converged,
            "n_lts_iter": fit.n_lts_iter,
        }
        return np.ascontiguousarray(normalized_t.T)



[docs]
    def plot_comparison(
        self,
        before_data: np.ndarray,
        after_data: np.ndarray,
        figsize: tuple[int, int] = (8, 8),
        gridsize: int = 50,
        cmap: str = "viridis",
    ) -> plt.Figure:
        """Plot a hexbin comparison of raw vs. VSN-normalized intensities.

        Parameters
        ----------
        before_data : np.ndarray
            Data before normalization.
        after_data : np.ndarray
            Data after normalization (output of ``normalize``).
        figsize : Tuple[int, int], optional
        gridsize : int, optional
        cmap : str, optional

        Returns
        -------
        plt.Figure
        """
        return plot_comparison_hexbin(
            before_data=before_data,
            after_data=after_data,
            title="VSN Normalization Comparison (glog2 vs log2)",
            figsize=figsize,
            gridsize=gridsize,
            cmap=cmap,
            transform_original="log2",
            log_axes=False,
        )