Source code for alphastats.loader.AlphaPeptLoader

from alphastats.loader.BaseLoader import BaseLoader
import pandas as pd
import numpy as np
import logging


class AlphaPeptLoader(BaseLoader):
    """Loader for AlphaPept output files.

    https://github.com/MannLabs/alphapept
    """

    def __init__(
        self,
        file,
        intensity_column="[sample]_LFQ",
        index_column="Unnamed: 0",  # column name to be changed
        sep=",",
        **kwargs
    ):
        """Load AlphaPept output and prepare it for further analysis.

        Reads the protein table, adds a boolean "Reverse" column, calls the
        contamination and string-conversion helpers, and finally derives a
        standardized "ProteinGroup" column that becomes the index column.

        Args:
            file (str or path-like): AlphaPept output, either the
                results_proteins.csv file or the hdf file containing the
                "protein_table" key. Anything whose string form ends in
                ".hdf" is read as HDF; everything else is read as CSV.
            intensity_column (str, optional): columns where the intensity of
                the proteins are given. Defaults to "[sample]_LFQ".
            index_column (str, optional): column indicating the protein
                groups. Defaults to "Unnamed: 0".
            sep (str, optional): field separator of the CSV file; not used
                when an hdf file is loaded. Defaults to ",".
            **kwargs: accepted for signature compatibility; not read here.
        """
        # str() keeps the suffix check working for pathlib.Path inputs, which
        # do not have an ``endswith`` method.
        if str(file).endswith(".hdf"):
            self._load_hdf_protein_table(file=file)
        else:
            self.rawinput = pd.read_csv(file, sep=sep)

        self.intensity_column = intensity_column
        self.index_column = index_column
        self.filter_columns = []
        self.confidence_column = None
        self.software = "AlphaPept"
        self.evidence_df = None
        self.gene_names = None

        # Annotate decoy hits first: this resets ``filter_columns`` to
        # ["Reverse"] before the remaining helpers run.
        self._add_contamination_reverse_column()
        # NOTE(review): the next two helpers are not defined in this class;
        # presumably inherited from BaseLoader - confirm.
        self._add_contamination_column()
        self._read_all_columns_as_string()

        # Build the ProteinGroup column from the original index column and
        # use it as the index column from here on.
        self.rawinput["ProteinGroup"] = self.rawinput[self.index_column].map(
            self._standardize_protein_group_column
        )
        self.index_column = "ProteinGroup"

    def _load_hdf_protein_table(self, file):
        """Read the "protein_table" key of an AlphaPept hdf file into rawinput."""
        self.rawinput = pd.read_hdf(file, "protein_table")

    def _add_contamination_reverse_column(self):
        """Add the boolean column 'Reverse' to rawinput for filtering.

        A row is flagged when its index-column entry contains "REV_".
        Also sets ``filter_columns`` to ``["Reverse"]``.
        """
        self.rawinput["Reverse"] = np.where(
            self.rawinput[self.index_column].str.contains("REV_"), True, False
        )
        self.filter_columns = ["Reverse"]
        # The two literals are concatenated implicitly, so the separator
        # between the sentences has to be part of the first literal.
        logging.info(
            "Proteins with a peptide derived from the reversed part of the "
            "decoy database have been annotated. "
            "These proteins should be filtered with "
            "`DataSet.preprocess(remove_contaminations=True)` later."
        )

    def _standardize_protein_group_column(self, entry):
        """Reduce a raw protein-group entry to a ';'-joined list of protein IDs.

        This makes protein groups comparable between softwares, e.g.
        'sp|P0DMV9|HS71B_HUMAN,sp|P0DMV8|HS71A_HUMAN' -> 'P0DMV9;P0DMV8'.

        Args:
            entry (str): comma-separated protein identifiers / FASTA headers.

        Returns:
            str: the standardized IDs joined by ';', prefixed with "REV_"
            when the original entry contains "REV_".
        """
        # TODO this needs a more beautiful and robuster solution
        protein_id_list = []
        for protein in entry.split(","):
            if "|" in protein:
                # FASTA-style header 'sp|P0DMV9|HS71B_HUMAN': the ID is the
                # second pipe-separated field.
                protein_id = protein.split("|")[1]
            else:
                # Plain identifier such as 'ENSEMBL:ENSBTAP00000007350':
                # drop the database prefix if present, otherwise keep as is.
                protein_id = protein.replace("ENSEMBL:", "")
            protein_id_list.append(protein_id)

        protein_id_concatenated = ";".join(protein_id_list)

        # Add REV_ to the protein ID, else there will be duplicates in the
        # ProteinGroup column (a decoy and its target would share one ID).
        if "REV_" in entry:
            protein_id_concatenated = "REV_" + protein_id_concatenated
        return protein_id_concatenated
# https://mannlabs.github.io/alphapept/file_formats.html#Output-Files