"""
DirectLFQ Normalizer for proteomics data.
This module provides a class for DirectLFQ normalization of proteomics data,
using the directlfq library.
"""
import warnings

import directlfq.config as dlcfg
import directlfq.normalization as dlnorm
import directlfq.protein_intensity_estimation as dlprot
import directlfq.utils as dlu
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ..utils.plotting import create_hexbin_comparison
from ..utils.validators import validate_input_data
[docs]
class DirectLFQNormalizer:
    """
    Normalizer using the DirectLFQ algorithm for in-memory processing.

    This normalizer wraps the external `directlfq` library to perform
    intensity normalization directly on NumPy arrays without intermediate
    file I/O. It processes peptide-level data to produce normalized
    protein-level and peptide-level intensities.

    Parameters
    ----------
    do_between_sample_norm : bool, optional
        Whether to perform between-sample normalization (median centering
        based on selected stable proteins), by default True.
    n_quad_samples : int, optional
        Number of samples used for quadratic stabilization during
        between-sample normalization, by default 50.
    n_quad_ions : int, optional
        Number of ions used for quadratic stabilization during protein
        intensity estimation, by default 10.
    min_nonan : int, optional
        Minimum number of non-NaN values required per protein for its
        intensity to be estimated, by default 1.
    num_cores : int | None, optional
        Number of CPU cores to use for parallel processing in directlfq.
        If None, directlfq attempts to use all available cores, by default None.

    Attributes
    ----------
    do_between_sample_norm : bool
        Flag indicating if between-sample normalization is enabled.
    n_quad_samples : int
        Number of samples for quadratic stabilization (sample norm).
    n_quad_ions : int
        Number of ions for quadratic stabilization (protein estimation).
    min_nonan : int
        Minimum non-NaN values required per protein.
    num_cores : int | None
        Number of cores used by directlfq.
    """

    def __init__(
        self,
        do_between_sample_norm: bool = True,
        n_quad_samples: int = 50,
        n_quad_ions: int = 10,
        min_nonan: int = 1,
        num_cores: int | None = None,
    ):
        # Settings are stored verbatim and only read when `normalize` runs.
        self.do_between_sample_norm = do_between_sample_norm
        self.n_quad_samples = n_quad_samples
        self.n_quad_ions = n_quad_ions
        self.min_nonan = min_nonan
        self.num_cores = num_cores

    @staticmethod
    def _validate_inputs(X, proteins, peptides) -> np.ndarray:
        """Check the raw `normalize` inputs and return `X` as an ndarray."""
        # Coerce first so array-likes (e.g. nested lists) get the same
        # ValueError diagnostics as ndarrays instead of an AttributeError.
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array (samples × features).")

        n_features = X.shape[1]
        if len(proteins) != n_features or len(peptides) != n_features:
            raise ValueError("Lengths of 'proteins' and 'peptides' must equal n_features.")

        # Reject empty matrices here rather than letting them reach directlfq.
        if X.size == 0:
            raise ValueError("X must contain at least one sample and one feature.")

        if not np.isfinite(X).all():
            raise ValueError("DirectLFQ cannot handle NaN or Inf values.")
        return X

    def normalize(
        self,
        X: np.ndarray,
        proteins: list[str],
        peptides: list[str],
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Run DirectLFQ on the given peptide-level intensity matrix in memory.

        This method orchestrates the DirectLFQ workflow:

        1. Constructs a DataFrame in the format required by `directlfq`.
        2. Applies preprocessing steps (log transform, sorting, NaN removal).
        3. Optionally performs between-sample normalization.
        4. Estimates protein intensities.
        5. Extracts normalized protein and ion matrices and their corresponding IDs.

        Parameters
        ----------
        X : np.ndarray
            Input data matrix with shape (n_samples, n_features), where features
            typically represent peptides or ions. Array-likes are converted
            with `np.asarray`.
        proteins : list[str]
            List of protein identifiers corresponding to each feature (column) in X.
            The length must equal `X.shape[1]`.
        peptides : list[str]
            List of peptide or ion identifiers corresponding to each feature (column)
            in X. The length must equal `X.shape[1]`.

        Returns
        -------
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
            A tuple containing four NumPy arrays:

            - protein_matrix: Normalized protein intensities (shape: n_samples, n_proteins).
            - ion_matrix: Normalized peptide/ion intensities (shape: n_samples, n_peptides).
            - protein_ids: Array of protein identifiers corresponding to the
              columns of `protein_matrix` (shape: n_proteins,).
            - peptide_ids: Array of peptide/ion identifiers corresponding to the
              columns of `ion_matrix` (shape: n_peptides,).

            The identifiers are read back from the directlfq output, so callers
            should use the returned ID arrays rather than assume the input order.

        Raises
        ------
        ValueError
            - If input `X` is not 2-dimensional.
            - If lengths of `proteins` or `peptides` do not match `X.shape[1]`.
            - If `X` has no samples or no features.
            - If `X` contains NaN or infinite values.
            - If ion identifiers cannot be located in the directlfq output, or
              a returned matrix does not match its ID array.

        Notes
        -----
        `directlfq` is imported when this module is imported, so a missing
        installation surfaces at import time rather than from this method.
        This method calls `directlfq.config.set_global_protein_and_ion_id`,
        which configures the ID column names on the directlfq config module.
        """
        # ----------------- Input validation -----------------
        X = self._validate_inputs(X, proteins, peptides)

        # ----------------- Construct DataFrame ---------------
        # One row per feature, one column per sample (iterating X yields rows).
        sample_cols = [f"sample_{i}" for i in range(1, X.shape[0] + 1)]
        df = pd.DataFrame({"protein": proteins, "ion": peptides, **dict(zip(sample_cols, X))})

        # ----------------- DirectLFQ Configuration -----------
        dlcfg.set_global_protein_and_ion_id(protein_id="protein", quant_id="ion")
        dlcfg.check_wether_to_copy_numpy_arrays_derived_from_pandas()

        # ----------------- Preprocessing ---------------------
        df = dlu.sort_input_df_by_protein_and_quant_id(df)
        df = dlu.index_and_log_transform_input_df(df)
        df = dlu.remove_allnan_rows_input_df(df)

        if self.do_between_sample_norm:
            df = dlnorm.NormalizationManagerSamplesOnSelectedProteins(
                df,
                num_samples_quadratic=self.n_quad_samples,
                selected_proteins_file=None,
            ).complete_dataframe

        # ----------------- Protein inference -----------------
        prot_df, ion_df = dlprot.estimate_protein_intensities(
            df,
            min_nonan=self.min_nonan,
            num_samples_quadratic=self.n_quad_ions,
            num_cores=self.num_cores,
        )

        # ----------------- Extract IDs -----------------------
        # The IDs may come back as regular columns or in the index; handle both.
        if "protein" in prot_df.columns:
            protein_ids = prot_df["protein"].to_numpy(dtype=str, copy=False)
        else:
            protein_ids = np.array(prot_df.index, dtype=str)

        if "ion" in ion_df.columns:
            peptide_ids = ion_df["ion"].to_numpy(dtype=str, copy=False)
        else:
            try:
                peptide_ids = ion_df.index.get_level_values("ion").to_numpy(dtype=str)
            except KeyError as err:
                # Surface the documented ValueError instead of a bare KeyError.
                raise ValueError("Could not locate ion identifiers in the DirectLFQ output.") from err

        # ----------------- Drop ID columns -------------------
        prot_numeric = prot_df.drop(columns=["protein"], errors="ignore")
        ion_numeric = ion_df.drop(columns=["protein", "ion"], errors="ignore")

        # ----------------- To NumPy --------------------------
        # Transpose back to the (n_samples, n_entities) layout of the input.
        protein_matrix = prot_numeric.T.to_numpy(dtype=np.float64, copy=False)
        ion_matrix = ion_numeric.T.to_numpy(dtype=np.float64, copy=False)

        # ----------------- Sanity check ----------------------
        if protein_matrix.shape[1] != protein_ids.shape[0]:
            raise ValueError("Protein matrix shape does not match number of returned protein IDs.")
        if ion_matrix.shape[1] != peptide_ids.shape[0]:
            raise ValueError("Ion matrix shape does not match number of returned peptide IDs.")

        return protein_matrix, ion_matrix, protein_ids, peptide_ids

    def plot_comparison(
        self,
        before_data: np.ndarray,
        after_data: np.ndarray,
        figsize: tuple[int, int] = (10, 8),
        title: str = "DirectLFQ Protein Normalization Comparison",
    ) -> plt.Figure:
        """
        Plot protein data before vs after DirectLFQ normalization using a hexbin plot.

        Note: This plots the *protein* level intensities. DirectLFQ computes these
        from the input peptide/ion intensities.

        Parameters
        ----------
        before_data : np.ndarray
            Protein intensity data *before* normalization, shape (n_samples, n_proteins).
            This needs to be calculated/provided separately if the input to
            `normalize` was peptide-level.
        after_data : np.ndarray
            Normalized protein intensity data *after* normalization, shape (n_samples, n_proteins).
            Typically the first element returned by the `normalize` method.
        figsize : tuple[int, int], optional
            Figure size, by default (10, 8).
        title : str, optional
            Plot title, by default "DirectLFQ Protein Normalization Comparison".

        Returns
        -------
        plt.Figure
            Figure object containing the hexbin density plot.

        Warns
        -----
        UserWarning
            If `before_data` and `after_data` have different shapes. The plot
            is still produced.
        """
        # Both inputs go through the shared project validator before plotting.
        before_data = validate_input_data(before_data)
        after_data = validate_input_data(after_data)

        if before_data.shape != after_data.shape:
            # A warning (not stdout) lets callers filter, log or escalate it.
            warnings.warn(
                "Shape mismatch in plot_comparison: "
                f"before={before_data.shape}, after={after_data.shape}. "
                "Plotting may be misleading.",
                UserWarning,
                stacklevel=2,
            )

        # Create hexbin comparison plot
        fig = create_hexbin_comparison(
            before_data,
            after_data,
            figsize=figsize,
            title=title,
            xlabel="Before DirectLFQ (Protein Intensity)",
            ylabel="After DirectLFQ (Protein Intensity)",
        )
        return fig