Source code for alphastats.loader.DIANNLoader

from alphastats.loader.BaseLoader import BaseLoader
import pandas as pd


class DIANNLoader(BaseLoader):
    """Loader for DIA-NN output files.

    https://github.com/vdemichev/DiaNN
    """

    def __init__(
        self,
        file,
        intensity_column="[sample]",
        index_column="Protein.Group",
        sep="\t",
        **kwargs
    ):
        """Import DIA-NN output data report.pg_matrix.tsv.

        Args:
            file (str): DIA-NN output file report.pg_matrix.tsv
            intensity_column (str, optional): columns containing the intensity
                column for each experiment. Defaults to "[sample]". Note that
                this attribute is overwritten with "[sample]_Intensity" once
                the sample columns have been tagged (see
                ``_add_tag_to_sample_columns``).
            index_column (str, optional): column with the Protein IDs.
                Defaults to "Protein.Group".
            sep (str, optional): file separation of the input file.
                Defaults to a tab character.
            **kwargs: accepted so callers may pass extra keyword arguments;
                they are not used by this constructor and are not forwarded
                to ``BaseLoader``.
        """
        # NOTE(review): BaseLoader.__init__ is presumably what populates
        # self.rawinput, which every step below relies on -- confirm.
        super().__init__(file, intensity_column, index_column, sep)
        self.software = "DIANN"

        # Columns of the DIA-NN report that carry metadata, not per-sample
        # intensities. Every column NOT listed here is treated as a sample.
        self.no_sample_column = [
            "PG.Q.value",
            "Global.PG.Q.value",
            "PTM.Q.value",
            "PTM.Site.Confidence",
            "PG.Quantity",
            "Protein.Group",
            "Protein.Ids",
            "Protein.Names",
            "Genes",
            "First.Protein.Description",
            "contamination_library",
        ]

        # Order matters: strip directories from the sample column names
        # first, then append the intensity tag to the shortened names.
        self._remove_filepath_from_name()
        self._add_tag_to_sample_columns()
        # The two helpers below are not defined in this class; presumably
        # they are inherited from BaseLoader.
        self._add_contamination_column()
        self._read_all_columns_as_string()

    def _add_tag_to_sample_columns(self):
        """Append "_Intensity" to every sample column name.

        When creating the matrix, sample columns won't be found if the
        intensity column is only specified as "[sample]", so a tag is added
        to each sample column and ``self.intensity_column`` is updated to
        the matching pattern "[sample]_Intensity".
        """
        # TODO this is very fragile as changes in column names can break this
        self.rawinput.columns = [
            str(col) + "_Intensity" if col not in self.no_sample_column else str(col)
            for col in self.rawinput.columns
        ]
        self.intensity_column = "[sample]_Intensity"

    @staticmethod
    def _split_path(file_path):
        """Return the last component of a Windows- or POSIX-style path.

        Both "/" and "\\" are treated as separators, so paths that mix the
        two styles (e.g. ``C:\\data/run\\sample.raw``) are reduced to the
        bare file name as well.

        Args:
            file_path (str): path or plain name.

        Returns:
            str: text after the last path separator; the unchanged input if
            it contains no separator.
        """
        # rfind returns -1 when the separator is absent, so with no separator
        # at all the slice below starts at 0 and keeps the whole string.
        last_separator = max(file_path.rfind("/"), file_path.rfind("\\"))
        return file_path[last_separator + 1:]

    def _remove_filepath_from_name(self):
        """Reduce sample column names to their file name.

        Sample columns (those not listed in ``self.no_sample_column``) have
        any leading directory part removed; all other columns are only
        converted to ``str``.
        """
        # Coerce to str before splitting so that non-string column labels do
        # not raise, consistent with _add_tag_to_sample_columns.
        self.rawinput.columns = [
            str(col) if col in self.no_sample_column else self._split_path(str(col))
            for col in self.rawinput.columns
        ]