# Source code for pronoms.normalizers.l1_normalizer

"""
L1 Normalizer for proteomics data.

This module provides a class for L1 normalization of proteomics data.
"""

import matplotlib.pyplot as plt
import numpy as np

from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import check_nan_inf, validate_input_data



# [docs] -- Sphinx viewcode link marker left over from the HTML extraction.
class L1Normalizer:
    """
    Normalizer that equalizes the L1 norm of every sample.

    Each sample (row) is divided by its own L1 norm (the sum of the absolute
    values in that row) and then multiplied by the mean L1 norm across all
    samples. Every sample with a non-zero norm therefore ends up with the same
    L1 norm, equal to ``mean_of_scaling_factors``, so the data stays on its
    original overall scale instead of being squeezed to a norm of 1.
    This is also known as "sum normalization" in proteomics.

    Attributes
    ----------
    scaling_factors : Optional[np.ndarray]
        Scaling factors used for normalization (L1 norm of each sample),
        as a 1-D array with one entry per sample.
        Only available after calling normalize().
    mean_of_scaling_factors : Optional[float]
        Mean of scaling factors used to preserve original scale.
        Only available after calling normalize().
    """

    def __init__(self):
        """Initialize the L1Normalizer with no fitted scaling factors."""
        self.scaling_factors = None
        self.mean_of_scaling_factors = None

    def normalize(self, X: np.ndarray) -> np.ndarray:
        """
        Perform L1 normalization on input data X.

        Each row is divided by its L1 norm and rescaled by the mean L1 norm
        of all rows. The per-row norms and their mean are stored on the
        instance as ``scaling_factors`` and ``mean_of_scaling_factors``,
        overwriting the values from any previous call.

        Parameters
        ----------
        X : np.ndarray
            Input data matrix with shape (n_samples, n_features).
            Each row represents a sample, each column represents a feature/protein.

        Returns
        -------
        np.ndarray
            Normalized data matrix with the same shape as X.
            Rows that are entirely zero are returned as all zeros.

        Raises
        ------
        ValueError
            If input data contains NaN or Inf values.
        """
        # Delegate structural validation/coercion to the shared validator.
        # NOTE(review): the exact checks live in utils.validators -- confirm there.
        X = validate_input_data(X)

        # Only the first element of the check result (the flag) is needed here.
        has_nan_inf, _ = check_nan_inf(X)
        if has_nan_inf:
            raise ValueError("Input data contains NaN or Inf values. Please handle these values before normalization.")

        # L1 norm (sum of absolute values) per sample; keepdims=True keeps a
        # column vector so it broadcasts against X row by row.
        l1_norms = np.sum(np.abs(X), axis=1, keepdims=True)

        # Store the true norms (zeros included) and their mean for inspection.
        self.scaling_factors = l1_norms.flatten()
        self.mean_of_scaling_factors = float(np.mean(self.scaling_factors))

        # Zero-guard the divisor only; an all-zero row stays all-zero because
        # the numerator is also zero.
        divisor = np.where(l1_norms == 0, 1.0, l1_norms)
        normalized_data = (X / divisor) * self.mean_of_scaling_factors

        return normalized_data

    def plot_comparison(
        self,
        before_data: np.ndarray,
        after_data: np.ndarray,
        figsize: tuple[int, int] = (10, 8),
        title: str = "L1 Normalization Comparison",
    ) -> plt.Figure:
        """
        Plot data before vs after normalization using a 2D hexbin density plot.

        This method does not read or modify the stored scaling factors; it
        only validates the two matrices and forwards them to the shared
        hexbin plotting helper.

        Parameters
        ----------
        before_data : np.ndarray
            Data before normalization, shape (n_samples, n_features).
        after_data : np.ndarray
            Data after normalization, shape (n_samples, n_features).
        figsize : Tuple[int, int], optional
            Figure size, by default (10, 8).
        title : str, optional
            Plot title, by default "L1 Normalization Comparison".

        Returns
        -------
        plt.Figure
            Figure object containing the hexbin density plot.
        """
        # Validate both matrices with the same validator used by normalize().
        before_data = validate_input_data(before_data)
        after_data = validate_input_data(after_data)

        # Create hexbin comparison plot (before on x, after on y).
        fig = create_hexbin_comparison(
            before_data,
            after_data,
            figsize=figsize,
            title=title,
            xlabel="Before L1 Normalization",
            ylabel="After L1 Normalization",
        )

        return fig