# Source code for alphastats.loader.SpectronautLoader

from alphastats.loader.BaseLoader import BaseLoader
import pandas as pd
import numpy as np
import logging



class SpectronautLoader(BaseLoader):
    """Loader for Spectronaut output files.

    Reads a long-format Spectronaut report, optionally drops rows based on
    the ``EG.Qvalue`` column and pivots the table to a wide format with one
    intensity column per sample.
    """

    def __init__(
        self,
        file,
        intensity_column="PG.Quantity",
        index_column="PG.ProteinGroups",
        sample_column="R.FileName",
        gene_names_column="PG.Genes",
        filter_qvalue=True,
        qvalue_cutoff=0.01,
        sep="\t",
    ):
        """Load Spectronaut output and reshape it to a wide table.

        After loading, a contamination column is added for further analysis.

        Args:
            file (str): path to the Spectronaut output file, or an already
                loaded pandas.DataFrame.
            intensity_column (str, optional): column holding the protein
                intensities. Defaults to "PG.Quantity".
            index_column (str, optional): column indicating the protein
                groups. Defaults to "PG.ProteinGroups".
            sample_column (str, optional): column that contains the sample
                names used for downstream analysis. Defaults to "R.FileName".
            gene_names_column (str, optional): column with gene names. It is
                only used when present in the input. Defaults to "PG.Genes".
            filter_qvalue (bool, optional): if True, rows whose EG.Qvalue is
                not below ``qvalue_cutoff`` are removed before the table is
                reshaped. Defaults to True.
            qvalue_cutoff (float, optional): cut-off value applied to
                EG.Qvalue. Defaults to 0.01.
            sep (str, optional): column separator of the file.
                Defaults to "\\t".
        """
        # NOTE(review): BaseLoader.__init__ is intentionally not called here,
        # exactly as in the original; the attributes below are presumably the
        # ones BaseLoader and downstream code expect - confirm against BaseLoader.
        self.software = "Spectronaut"
        self.intensity_column = intensity_column
        self.index_column = index_column
        self.confidence_column = None
        self.filter_columns = []
        self.evidence_df = None
        self.gene_names = None

        self._read_spectronaut_file(file=file, sep=sep)

        if filter_qvalue:
            self._filter_qvalue(qvalue_cutoff=qvalue_cutoff)

        self._reshape_spectronaut(
            sample_column=sample_column, gene_names_column=gene_names_column
        )
        # Not defined in this class; presumably inherited from BaseLoader.
        self._add_contamination_column()
        self._read_all_columns_as_string()

    def _reshape_spectronaut(self, sample_column, gene_names_column):
        """Pivot the long Spectronaut table to a wide format.

        Other proteomics software uses a wide format (one column per sample).
        Each sample becomes a column named ``<sample>_<intensity_column>``;
        rows are indexed by the protein group column and, when available,
        the gene names column.

        Args:
            sample_column (str): column containing the sample names.
            gene_names_column (str): column with gene names; ignored when it
                is not present in the input.
        """
        indexing_columns = [self.index_column]

        if gene_names_column in self.rawinput.columns:
            self.gene_names = gene_names_column
            indexing_columns.append(self.gene_names)

        # Build the long table on a fresh frame (``assign`` returns a copy) so
        # that neither a DataFrame passed in by the caller nor a filtered
        # slice of it gets an extra "sample" column written into it.
        sample_labels = self.rawinput[sample_column] + "_" + self.intensity_column
        long_df = (
            self.rawinput[[self.intensity_column] + indexing_columns]
            .assign(sample=sample_labels)
            .drop_duplicates()
        )

        wide_df = long_df.pivot(
            index=indexing_columns, columns="sample", values=self.intensity_column
        )
        wide_df.reset_index(inplace=True)

        self.rawinput = wide_df

        # Template describing how the per-sample intensity columns are named.
        self.intensity_column = "[sample]_" + self.intensity_column

    def _filter_qvalue(self, qvalue_cutoff):
        """Keep only rows whose ``EG.Qvalue`` is strictly below the cut-off.

        Rows with a missing q-value are removed as well, because the
        comparison evaluates to False for them. When the ``EG.Qvalue``
        column is absent a warning is logged and the data is left
        unfiltered.

        Args:
            qvalue_cutoff (float): exclusive upper bound for EG.Qvalue.
        """
        if "EG.Qvalue" not in self.rawinput.columns:
            # The message promises an unfiltered load, so warn and carry on
            # instead of aborting the whole loader with an exception.
            logging.warning(
                "Column EG.Qvalue not found in file. File will not be filtered according to q-value."
            )
            return

        rows_before_filtering = self.rawinput.shape[0]
        self.rawinput = self.rawinput[self.rawinput["EG.Qvalue"] < qvalue_cutoff]
        rows_removed = rows_before_filtering - self.rawinput.shape[0]

        logging.info(
            f"{rows_removed} identifications with a qvalue of {qvalue_cutoff} or above have been removed"
        )

    def _read_spectronaut_file(self, file, sep):
        """Read the Spectronaut report into ``self.rawinput``.

        A DataFrame is used as given. Anything else is handed to
        ``pandas.read_csv``; when the intensity column is not parsed as
        float64 the file is read again with "," as decimal separator,
        because some Spectronaut files use european decimal separators.

        Args:
            file (str): path to the file, a file-like object or a
                pandas.DataFrame.
            sep (str): column separator of the file.
        """
        if isinstance(file, pd.DataFrame):
            df = file
        else:
            df = pd.read_csv(file, sep=sep, low_memory=False)

            if df[self.intensity_column].dtype != np.float64:
                # A file-like object has been consumed by the first read;
                # rewind it so the second attempt sees the data again.
                if hasattr(file, "seek"):
                    file.seek(0)
                # load european
                df = pd.read_csv(file, sep=sep, decimal=",", low_memory=False)

        self.rawinput = df
        
    

# filter_with_Qvalue
# TRUE (default) filters out the intensities whose EG.Qvalue is greater than qvalue_cutoff. Those intensities are replaced with zero and treated as censored missing values for imputation purposes.

# qvalue_cutoff
# Cutoff for EG.Qvalue. Default is 0.01.

# Protein Level
# PG.Quantity
# PG.ProteinGroups

# Peptide Level
# F.PeakArea
# PEP.StrippedSequence