# Source code for alphastats.loader.DIANNLoader

from alphastats.loader.BaseLoader import BaseLoader
import pandas as pd


class DIANNLoader(BaseLoader):
    """Loader for DIA-NN output files.

    https://github.com/vdemichev/DiaNN

    After the base class has been initialised, the column labels of
    ``self.rawinput`` are rewritten: file paths in sample column labels are
    reduced to the bare file name, and every sample column receives an
    ``"_Intensity"`` suffix so that it can be addressed through
    ``self.intensity_column``.
    """

    def __init__(
        self,
        file,
        intensity_column="[sample]",
        index_column="Protein.Group",
        sep="\t",
        **kwargs
    ):
        """Import DIA-NN output data report.pg_matrix.tsv

        Args:
            file (str): DIA-NN output file report.pg_matrix.tsv
            intensity_column (str, optional): columns containing the intensity column for each experiment. Defaults to "[sample]".
                Note that after loading ``self.intensity_column`` is always
                set to "[sample]_Intensity", because the sample columns are
                renamed with that suffix.
            index_column (str, optional): column with the Protein IDs. Defaults to "Protein.Group".
            sep (str, optional): file separation of the input file. Defaults to "\\t".
            **kwargs: accepted for call compatibility; not used by this loader.
        """

        super().__init__(file, intensity_column, index_column, sep)
        self.software = "DIANN"
        # Column labels that are NOT per-sample intensity columns. They are
        # left untouched when sample columns are renamed below.
        self.no_sample_column = [
            "PG.Q.value",
            "Global.PG.Q.value",
            "PTM.Q.value",
            "PTM.Site.Confidence",
            "PG.Quantity",
            "Protein.Group",
            "Protein.Ids",
            "Protein.Names",
            "Genes",
            "First.Protein.Description",
            "contamination_library",
        ]
        self._remove_filepath_from_name()
        self._add_tag_to_sample_columns()
        # The two helpers below are not defined in this class; presumably
        # they are inherited from BaseLoader -- TODO confirm.
        self._add_contamination_column()
        self._read_all_columns_as_string()

    def _add_tag_to_sample_columns(self):
        """Append "_Intensity" to every sample column label.

        Every column of ``self.rawinput`` whose label is not listed in
        ``self.no_sample_column`` gets the suffix, and
        ``self.intensity_column`` is set to the matching pattern
        "[sample]_Intensity". Without a tag the sample columns cannot be
        found when the matrix is created from a bare "[sample]" pattern.
        """
        # TODO this is very fragile as changes in column names can break this

        self.rawinput.columns = [
            str(col) + "_Intensity" if col not in self.no_sample_column else str(col)
            for col in self.rawinput.columns
        ]
        self.intensity_column = "[sample]_Intensity"

    @staticmethod
    def _split_path(file_path):
        """Return the last component of a POSIX- or Windows-style path.

        Forward slashes and backslashes are both treated as separators, so
        labels that mix the two styles are reduced to the file name as well.

        Args:
            file_path: path-like column label; converted with ``str()`` so
                that non-string labels do not raise.

        Returns:
            str: the text after the last separator, or the whole label when
            it contains no separator.
        """
        # Normalise to one separator first; checking for "/" and falling
        # back to "\\" would keep a backslash-separated tail in mixed paths.
        normalized = str(file_path).replace("\\", "/")
        return normalized.rsplit("/", 1)[-1]

    def _remove_filepath_from_name(self):
        """Strip directory parts from the sample column labels.

        Columns listed in ``self.no_sample_column`` keep their label (as a
        string); every other label is reduced to its file name so only the
        file name is used for analysis.
        """

        self.rawinput.columns = [
            self._split_path(col) if col not in self.no_sample_column else str(col)
            for col in self.rawinput.columns
        ]