Source code for alphastats.loader.AlphaPeptLoader

from alphastats.loader.BaseLoader import BaseLoader
import pandas as pd
import numpy as np
import logging


[docs]class AlphaPeptLoader(BaseLoader):
    """Loader for AlphaPept outputfiles
    https://github.com/MannLabs/alphapept
    """

    def __init__(
        self,
        file,
        intensity_column="[sample]_LFQ",
        index_column="Unnamed: 0",  # column name to be changed
        sep=",",
        **kwargs
    ):
        """Loads Alphapept output: results_proteins.csv. Will add contamination column for further analysis.

        Args:
            file (str): AlphaPept output, either results_proteins.csv file or the hdf_file with the protein_table given
            intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "[sample]_LFQ".
            index_column (str, optional): column indicating the protein groups. Defaults to "Unnamed: 0".
            sep (str, optional): file separation of file. Defaults to ",".
        """

        if file.endswith(".hdf"):
            self._load_hdf_protein_table(file=file)
        else:
            self.rawinput = pd.read_csv(file, sep=sep)

        self.intensity_column = intensity_column
        self.index_column = index_column
        self.filter_columns = []
        self.confidence_column = None
        self.software = "AlphaPept"
        self.evidence_df = None
        self.gene_names = None
        # add contamination column "Reverse"
        self._add_contamination_reverse_column()
        self._add_contamination_column()
        self._read_all_columns_as_string()
        #  make ProteinGroup column
        self.rawinput["ProteinGroup"] = self.rawinput[self.index_column].map(
            self._standardize_protein_group_column
        )
        self.index_column = "ProteinGroup"

    def _load_hdf_protein_table(self, file):
        self.rawinput = pd.read_hdf(file, "protein_table")

    def _add_contamination_reverse_column(self):
        """adds column 'Reverse' to the rawinput for filtering"""
        self.rawinput["Reverse"] = np.where(
            self.rawinput[self.index_column].str.contains("REV_"), True, False
        )
        self.filter_columns = ["Reverse"]
        logging.info(
            "Proteins with a peptide derived from the reversed part of the decoy database have been annotated"
            "These proteins should be filtered with `DataSet.preprocess(remove_contaminations=True)` later."
        )

    def _standardize_protein_group_column(self, entry):
        #  make column with ProteinGroup to make comparison between softwares possible
        #  'sp|P0DMV9|HS71B_HUMAN,sp|P0DMV8|HS71A_HUMAN', -> P0DMV9;P0DMV8

        # TODO this needs a more beautiful and robuster solution
        # split proteins into list
        proteins = entry.split(",")
        protein_id_list = []
        for protein in proteins:
            # 'sp|P0DMV9|HS71B_HUMAN,sp|P0DMV8|HS71A_HUMAN',
            if "|" in protein:
                fasta_header_split = protein.split("|")
            else:
                fasta_header_split = protein
            if isinstance(fasta_header_split, str):
                #  'ENSEMBL:ENSBTAP00000007350',
                if "ENSEMBL:" in fasta_header_split:
                    protein_id = fasta_header_split.replace("ENSEMBL:", "")
                else:
                    protein_id = fasta_header_split
            else:
                protein_id = fasta_header_split[1]
            protein_id_list.append(protein_id)
        protein_id_concentate = ";".join(protein_id_list)
        # ADD REV to the protein ID, else there will be duplicates in the ProteinGroup column
        if "REV_" in entry:
            protein_id_concentate = "REV_" + protein_id_concentate
        return protein_id_concentate


# https://mannlabs.github.io/alphapept/file_formats.html#Output-Files