Source code for pronoms.normalizers.directlfq_normalizer

"""
DirectLFQ Normalizer for proteomics data.

This module provides a class for DirectLFQ normalization of proteomics data,
using the directlfq library.

"""

import warnings

import directlfq.config as dlcfg
import directlfq.normalization as dlnorm
import directlfq.protein_intensity_estimation as dlprot
import directlfq.utils as dlu
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import validate_input_data


[docs] class DirectLFQNormalizer: """ Normalizer using the DirectLFQ algorithm for in-memory processing. This normalizer wraps the external `directlfq` library to perform intensity normalization directly on NumPy arrays without intermediate file I/O. It processes peptide-level data to produce normalized protein-level and peptide-level intensities. Parameters ---------- do_between_sample_norm : bool, optional Whether to perform between-sample normalization (median centering based on selected stable proteins), by default True. n_quad_samples : int, optional Number of samples used for quadratic stabilization during between-sample normalization, by default 50. n_quad_ions : int, optional Number of ions used for quadratic stabilization during protein intensity estimation, by default 10. min_nonan : int, optional Minimum number of non-NaN values required per protein for its intensity to be estimated, by default 1. num_cores : int | None, optional Number of CPU cores to use for parallel processing in directlfq. If None, directlfq attempts to use all available cores, by default None. Attributes ---------- do_between_sample_norm : bool Flag indicating if between-sample normalization is enabled. n_quad_samples : int Number of samples for quadratic stabilization (sample norm). n_quad_ions : int Number of ions for quadratic stabilization (protein estimation). min_nonan : int Minimum non-NaN values required per protein. num_cores : Optional[int] Number of cores used by directlfq. """ def __init__( self, do_between_sample_norm: bool = True, n_quad_samples: int = 50, n_quad_ions: int = 10, min_nonan: int = 1, num_cores: int | None = None, ): self.do_between_sample_norm = do_between_sample_norm self.n_quad_samples = n_quad_samples self.n_quad_ions = n_quad_ions self.min_nonan = min_nonan self.num_cores = num_cores
[docs] def normalize( self, X: np.ndarray, proteins: list[str], peptides: list[str], ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Run DirectLFQ on the given peptide-level intensity matrix in memory. This method orchestrates the DirectLFQ workflow: 1. Constructs a DataFrame in the format required by `directlfq`. 2. Applies preprocessing steps (log transform, sorting, NaN removal). 3. Optionally performs between-sample normalization. 4. Estimates protein intensities. 5. Extracts normalized protein and ion matrices and their corresponding IDs. Parameters ---------- X : np.ndarray Input data matrix with shape (n_samples, n_features), where features typically represent peptides or ions. proteins : list[str] List of protein identifiers corresponding to each feature (column) in X. The length must equal `X.shape[1]`. peptides : list[str] List of peptide or ion identifiers corresponding to each feature (column) in X. The length must equal `X.shape[1]`. Returns ------- tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray] A tuple containing four NumPy arrays: - protein_matrix: Normalized protein intensities (shape: n_samples, n_proteins). - ion_matrix: Normalized peptide/ion intensities (shape: n_samples, n_peptides). - protein_ids: Array of unique protein identifiers corresponding to the columns of `protein_matrix` (shape: n_proteins,). - peptide_ids: Array of unique peptide/ion identifiers corresponding to the columns of `ion_matrix` (shape: n_peptides,). Raises ------ ValueError - If input `X` is not 2-dimensional. - If lengths of `proteins` or `peptides` do not match `X.shape[1]`. - If `X` contains NaN or infinite values. - If internal DataFrame processing or ID extraction fails. ImportError If the 'directlfq' library is not installed. 
""" # ----------------- Input validation ----------------- if X.ndim != 2: raise ValueError("X must be a 2-D array (samples × features).") if len(proteins) != X.shape[1] or len(peptides) != X.shape[1]: raise ValueError("Lengths of 'proteins' and 'peptides' must equal n_features.") if np.isnan(X).any() or np.isinf(X).any(): raise ValueError("DirectLFQ cannot handle NaN or Inf values.") # ----------------- Construct DataFrame --------------- n_samples, _ = X.shape sample_cols = [f"sample_{i + 1}" for i in range(n_samples)] df = pd.DataFrame({"protein": proteins, "ion": peptides, **{sample_cols[i]: X[i, :] for i in range(n_samples)}}) # ----------------- DirectLFQ Configuration ----------- dlcfg.set_global_protein_and_ion_id(protein_id="protein", quant_id="ion") dlcfg.check_wether_to_copy_numpy_arrays_derived_from_pandas() # ----------------- Preprocessing --------------------- df = dlu.sort_input_df_by_protein_and_quant_id(df) df = dlu.index_and_log_transform_input_df(df) df = dlu.remove_allnan_rows_input_df(df) if self.do_between_sample_norm: df = dlnorm.NormalizationManagerSamplesOnSelectedProteins( df, num_samples_quadratic=self.n_quad_samples, selected_proteins_file=None, ).complete_dataframe # ----------------- Protein inference ----------------- prot_df, ion_df = dlprot.estimate_protein_intensities( df, min_nonan=self.min_nonan, num_samples_quadratic=self.n_quad_ions, num_cores=self.num_cores, ) # ----------------- Extract IDs ----------------------- protein_ids = ( prot_df["protein"].to_numpy(dtype=str, copy=False) if "protein" in prot_df.columns else np.array(prot_df.index, dtype=str) ) peptide_ids = ( ion_df["ion"].to_numpy(dtype=str, copy=False) if "ion" in ion_df.columns else ion_df.index.get_level_values("ion").to_numpy(dtype=str) ) # ----------------- Drop ID columns ------------------- prot_numeric = prot_df.drop(columns=["protein"], errors="ignore") ion_numeric = ion_df.drop(columns=["protein", "ion"], errors="ignore") # ----------------- To NumPy 
-------------------------- protein_matrix = prot_numeric.T.to_numpy(dtype=np.float64, copy=False) ion_matrix = ion_numeric.T.to_numpy(dtype=np.float64, copy=False) # ----------------- Sanity check ---------------------- if ion_matrix.shape[1] != peptide_ids.shape[0]: raise ValueError("Ion matrix shape does not match number of returned peptide IDs.") return protein_matrix, ion_matrix, protein_ids, peptide_ids
[docs] def plot_comparison( self, before_data: np.ndarray, after_data: np.ndarray, figsize: tuple[int, int] = (10, 8), title: str = "DirectLFQ Protein Normalization Comparison", ) -> plt.Figure: """ Plot protein data before vs after DirectLFQ normalization using a hexbin plot. Note: This plots the *protein* level intensities. DirectLFQ computes these from the input peptide/ion intensities. Parameters ---------- before_data : np.ndarray Protein intensity data *before* normalization, shape (n_samples, n_proteins). This needs to be calculated/provided separately if the input to `normalize` was peptide-level. after_data : np.ndarray Normalized protein intensity data *after* normalization, shape (n_samples, n_proteins). Typically the first element returned by the `normalize` method. figsize : Tuple[int, int], optional Figure size, by default (10, 8). title : str, optional Plot title, by default "DirectLFQ Protein Normalization Comparison". Returns ------- plt.Figure Figure object containing the hexbin density plot. """ # Validate input data before_data = validate_input_data(before_data) after_data = validate_input_data(after_data) if before_data.shape != after_data.shape: print( "Warning: Shape mismatch in plot_comparison: " f"before={before_data.shape}, after={after_data.shape}. " "Plotting may be misleading." ) # Create hexbin comparison plot fig = create_hexbin_comparison( before_data, after_data, figsize=figsize, title=title, xlabel="Before DirectLFQ (Protein Intensity)", ylabel="After DirectLFQ (Protein Intensity)", ) return fig