Addtest #49 goea test code (#50)

* Fix testable code - Transfer code FastDTLgoea.py -> args.py, out_path.py, goea.py - Fix og_go_association.py bad arguments implements * Add GOEA test data * Add GOEA test data fixture * Add GOEA test code * Add GraphViz install step GOEA tool "goatools" use GraphViz for plot
moshi4 · Oct 17, 2021 · f48e190 · f48e190
1 parent 6a21a47
commit f48e190
Show file tree

Hide file tree

Showing 25 changed files with 626,360 additions and 263 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -30,6 +30,9 @@ jobs:
       - name: Install Dependencies
         run: poetry install -n
 
+      - name: Install GraphViz
+        run: sudo apt install -y graphviz
+
       - name: Run black format check
         run: poetry run black src tests --check --diff --verbose --exclude src/fastdtlmapper/bin
 

diff --git a/src/fastdtlmapper/goea/args.py b/src/fastdtlmapper/goea/args.py
@@ -0,0 +1,99 @@
+import argparse
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+
+
+@dataclass
+class Args:
+    """Parse Arguments Class"""
+
+    indir: Path
+    plot_pvalue_thr: float
+    plot_max_num: int
+    plot_format: str
+    plot_color: str
+    use_adjusted_pvalue: bool
+
+
+def get_args(argv: Optional[List[str]] = None) -> Args:
+    """Get argument values
+
+    Returns:
+        argparse.Namespace: argument values
+    """
+    parser = argparse.ArgumentParser(
+        description="Gain/Loss genes GOEA(GO Enrichment Analysis) tool"
+    )
+    parser.add_argument(
+        "-i",
+        "--indir",
+        required=True,
+        type=Path,
+        help="FastDTLmapper result directory",
+        metavar="IN",
+    )
+    default_plot_pvalue_thr = 0.05
+    parser.add_argument(
+        "--plot_pvalue_thr",
+        type=float,
+        default=default_plot_pvalue_thr,
+        help=f"Plot GOterm pvalue threshold (Default: {default_plot_pvalue_thr})",
+        metavar="",
+    )
+    default_max_plot_num = 10
+    parser.add_argument(
+        "--plot_max_num",
+        type=int,
+        default=default_max_plot_num,
+        help=f"Plot GOterm max number (Default: {default_max_plot_num})",
+        metavar="",
+    )
+    default_plot_format = "png"
+    available_format = ["png", "svg", "jpg", "pdf"]
+    parser.add_argument(
+        "--plot_format",
+        type=str,
+        default=default_plot_format,
+        choices=available_format,
+        help=f"Plot file format [{'|'.join(available_format)}] "
+        + f"(Default: '{default_plot_format}')",
+        metavar="",
+    )
+    parser.add_argument(
+        "--plot_color",
+        type=str,
+        default="",
+        help="Plot specified hexcolor [e.g. '1affdb'] "
+        + "(Default: yellow to red gradient color)",
+        metavar="",
+    )
+    parser.add_argument(
+        "--adjusted_pvalue",
+        help="Use BH adjusted pvalue for plot threshold",
+        action="store_true",
+    )
+
+    args = parser.parse_args(argv)
+
+    # Plot hexcolor check
+    def is_valid_hexcolor(hexcolor: str) -> bool:
+        search_result = re.search(r"^(?:[0-9a-fA-F]{3}){1,2}$", hexcolor)
+        return False if search_result is None else True
+
+    if args.plot_color and not is_valid_hexcolor(args.plot_color):
+        parser.error(
+            f"argument --plot_color: invalid hexcolor code '{args.plot_color}'."
+        )
+    else:
+        args.plot_color = "#" + args.plot_color if args.plot_color else ""
+
+    return Args(
+        args.indir,
+        args.plot_pvalue_thr,
+        args.plot_max_num,
+        args.plot_format,
+        args.plot_color,
+        args.adjusted_pvalue,
+    )
diff --git a/src/fastdtlmapper/goea/goea.py b/src/fastdtlmapper/goea/goea.py
@@ -3,7 +3,7 @@
 import subprocess as sp
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 from urllib import request
 
 import numpy as np
@@ -249,3 +249,86 @@ def download_obo(obo_outfile: Path) -> None:
 
         if not obo_outfile.is_file():
             raise FileNotFoundError(f"'{obo_outfile}' not found. Download failure!!")
+
+    @staticmethod
+    def extract_significant_goea_result(
+        goea_result_file: Path,
+        pvalue_thr: float,
+        use_adjusted_pvalue: bool,
+        min_depth: int = 2,
+    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """Extract over and under significant goea result
+
+        Args:
+            goea_result_file (Path): GOEA result file
+            pvalue_thr (float): Pvalue threshold for extract
+            use_adjusted_pvalue (bool): Use BH adjusted pvalue or not
+            min_depth (int, optional): Minimum depth for extract. Defaults to 2.
+
+        Returns:
+            Tuple[pd.DataFrame, pd.DataFrame]: over/under extract dataframes
+        """
+        df = pd.read_table(goea_result_file)
+        pvalue_column_name = "p_fdr_bh" if use_adjusted_pvalue else "p_uncorrected"
+        over_df = df[
+            (df["enrichment"] == "e")
+            & (df[pvalue_column_name] < pvalue_thr)
+            & (df["depth"] >= min_depth)
+        ]
+        under_df = df[
+            (df["enrichment"] == "p")
+            & (df[pvalue_column_name] < pvalue_thr)
+            & (df["depth"] >= min_depth)
+        ]
+        return (over_df, under_df)
+
+    @staticmethod
+    def format_significant_goea_dataframe(
+        goea_result_df: pd.DataFrame,
+        node_id: str,
+        gain_or_loss: str,
+    ) -> pd.DataFrame:
+        """Format significant GOterm dataframe for output
+
+        Args:
+            goea_result_df (pd.DataFrame): GOEA result dataframe
+            node_id (str): Node id
+            gain_or_loss (str): "gain" or "loss"
+
+        Returns:
+            pd.DataFrame: Formatted GOEA result dataframe
+        """
+        # Rename columns
+        rename_list = [
+            "GO",
+            "GO_CATEGORY",
+            "OVER/UNDER",
+            "GO_NAME",
+            "RATIO_IN_STUDY",
+            "RATIO_IN_POP",
+            "PVALUE",
+            "DEPTH",
+            "STUDY_COUNT",
+            "BH_ADJUSTED_PVALUE",
+            "STUDY_ITEMS",
+        ]
+        rename_dict = {b: a for b, a in zip(goea_result_df.columns, rename_list)}
+        goea_result_df = goea_result_df.rename(columns=rename_dict)
+        # Add columns
+        goea_result_df["NODE_ID"] = node_id
+        goea_result_df["GAIN/LOSS"] = gain_or_loss
+        # Replace OVER/UNDER value ("e" -> "over", "p" -> "under")
+        goea_result_df = goea_result_df.replace(
+            {"OVER/UNDER": {"e": "over", "p": "under"}}
+        )
+        # Return reorder columns dataframe
+        return goea_result_df[
+            [
+                "NODE_ID",
+                "GAIN/LOSS",
+                "GO_CATEGORY",
+                "OVER/UNDER",
+                "GO",
+                *rename_list[3:],
+            ]
+        ]
diff --git a/src/fastdtlmapper/goea/og_go_association.py b/src/fastdtlmapper/goea/og_go_association.py
@@ -13,54 +13,40 @@ class OgGoAssociation:
     annotation_file_list: List[Path]
     go_define_ratio_thr: float = 0.5
 
-    def __post_init__(self):
+    def load_og_go_association(self):
+        """Load OG & GOterms association"""
         # Get OrthologGroup(OG) ID & Gene ID association
-        self.og_id2gene_id_list = self._get_og_id2gene_id_association(
-            self.og_gene_list_file
-        )
+        self.og_id2gene_id_list = self._get_og_id2gene_id_association()
         # Get Gene ID & GO ID association
-        self.gene_id2go_id_set = self._get_gene_id2go_id_association(
-            self.annotation_file_list
-        )
+        self.gene_id2go_id_set = self._get_gene_id2go_id_association()
         # Get OG ID & GO ID association
         self.og_id2go_id_list = self._get_og_id2go_id_association(
-            self.og_id2gene_id_list, self.gene_id2go_id_set, self.go_define_ratio_thr
+            self.og_id2gene_id_list, self.gene_id2go_id_set
         )
 
-    def _get_og_id2gene_id_association(
-        self, og_gene_file: Path
-    ) -> Dict[str, List[str]]:
+    def _get_og_id2gene_id_association(self) -> Dict[str, List[str]]:
         """Get OG(OrthologGroup) ID & Gene ID association
 
-        Args:
-            og_gene_file (Path): OrthoFinder OG's gene list file path
-
         Returns:
             Dict[str, List[str]]: OG ID & Gene ID association dict
         """
         og_id2gene_id_list = {}
-        with open(og_gene_file) as f:
+        with open(self.og_gene_list_file) as f:
             line_list = f.read().splitlines()
         for line in line_list:
             og_id = line[0:9]
             gene_id_list = line[11:].split(" ")
             og_id2gene_id_list[og_id] = gene_id_list
         return og_id2gene_id_list
 
-    def _get_gene_id2go_id_association(
-        self,
-        annotation_file_list: List[Path],
-    ) -> Dict[str, Set[str]]:
+    def _get_gene_id2go_id_association(self) -> Dict[str, Set[str]]:
         """Get Gene ID & GO ID association
 
-        Args:
-            annotation_file_list (List[Path]): Interproscan annnotation file path list
-
         Returns:
             Dict[str, Set[str]]: Gene ID & GO ID association dict
         """
         gene_id2go_id_set = defaultdict(set)
-        for annotation_file in annotation_file_list:
+        for annotation_file in self.annotation_file_list:
             with open(annotation_file) as f:
                 reader = csv.reader(f, delimiter="\t")
                 for row in reader:
@@ -74,7 +60,6 @@ def _get_og_id2go_id_association(
         self,
         og_id2gene_id_list: Dict[str, List[str]],
         gene_id2go_id_set: Dict[str, Set[str]],
-        go_define_ratio_thr: float,
     ) -> Dict[str, List[str]]:
         """Get OG ID & GO ID association
 
@@ -93,7 +78,7 @@ def _get_og_id2go_id_association(
                 go_id_list.extend(list(gene_id2go_id_set[gene_id]))
             for go_id in set(go_id_list):
                 go_id_count = go_id_list.count(go_id)
-                if go_id_count >= len(go_id_list) * go_define_ratio_thr:
+                if go_id_count >= len(gene_id_list) * self.go_define_ratio_thr:
                     og_id2go_id_list[og_id].append(go_id)
         return og_id2go_id_list
 
@@ -103,6 +88,7 @@ def write_og2go_association(self, og2go_association_file: Path) -> None:
         Args:
             og2go_association_file (Path): OG & GO association output file path
         """
+        self.load_og_go_association()
         association_info = ""
         for og_id, go_id_list in self.og_id2go_id_list.items():
             og_gene_num = len(self.og_id2gene_id_list[og_id])

diff --git a/src/fastdtlmapper/out_path.py b/src/fastdtlmapper/out_path.py
@@ -10,6 +10,7 @@ class OutPath:
     """Output Path Class"""
 
     rootdir: Union[str, Path]
+    goea_mode: bool = False
 
     def __post_init__(self):
         self.rootdir = Path(self.rootdir)
@@ -28,6 +29,8 @@ def __post_init__(self):
         self.ortho_group_fasta_dir = self.ortho_dir / "Orthogroup_Sequences"
         self.ortho_tmpwork_dir = self.user_fasta_dir / "OrthoFinder"
 
+        self.og_gene_file = self.ortho_dir / "Orthogroups" / "Orthogroups.txt"
+
         # 02. DTL reconciliation directory
         self.dtl_rec_dir = self.rootdir / "02_dtl_reconciliation"
 
@@ -40,6 +43,24 @@ def __post_init__(self):
         self.all_trn_count_file = self.aggregate_map_dir / "all_transfer_gene_count.tsv"
         self.all_trn_genes_file = self.aggregate_map_dir / "all_transfer_gene_list.tsv"
 
+        # 04. (Optional) GOEA analysis directory
+        self.goea_dir = self.rootdir / "04_functional_analysis"
+
+        self.go_annotation_dir = self.goea_dir / "go_annotation"
+        self.go_annotation_workdir = self.go_annotation_dir / "work"
+        self.go_enrichment_dir = self.goea_dir / "go_enrichment"
+        self.result_summary_dir = self.goea_dir / "result_summary"
+        self.result_summary_plot_dir = self.result_summary_dir / "significant_go_plot"
+
+        self.obo_file = self.goea_dir / "go-basic.obo"
+        self.og2go_association_file = self.go_enrichment_dir / "og2go_association.txt"
+        self.significant_go_list_file = (
+            self.result_summary_dir / "significant_go_list.tsv"
+        )
+        self.significant_go_count_file = (
+            self.result_summary_dir / "significant_go_count.tsv"
+        )
+
         # Log commands list and command stderr
         self.log_dir = self.rootdir / "log"
         self.parallel_cmds_dir = self.log_dir / "parallel_cmds"
@@ -57,17 +78,35 @@ def __post_init__(self):
 
     def _makedirs(self) -> None:
         """Create config output directories"""
-        # Check & Delete OrthoFinder previous work dir
-        if self.ortho_tmpwork_dir.exists():
-            shutil.rmtree(self.ortho_tmpwork_dir)
+        # Delete previous work dir
+        shutil.rmtree(self.ortho_tmpwork_dir, ignore_errors=True)
+        shutil.rmtree(self.go_enrichment_dir, ignore_errors=True)
+        shutil.rmtree(self.result_summary_dir, ignore_errors=True)
 
-        # Make configured output directory
-        for target_dir in (
+        # Default output dir
+        target_dir_list = [
             self.user_fasta_dir,
             self.user_tree_dir,
             self.ortho_dir,
             self.dtl_rec_dir,
             self.aggregate_map_dir,
             self.parallel_cmds_dir,
-        ):
+        ]
+        if self.goea_mode:
+            # Check rootdir is FastDTLmapper result dir or not
+            if not self.user_fasta_dir.exists():
+                err_msg = f"Input fasta directory '{self.user_fasta_dir}' not found!!\n"
+                err_msg += "Please specify FastDTLmapper result directory as input."
+                raise ValueError(err_msg)
+            # Add GOEA output dir
+            target_dir_list.extend(
+                [
+                    self.go_annotation_workdir,
+                    self.go_enrichment_dir,
+                    self.result_summary_plot_dir,
+                ]
+            )
+
+        # Make configured output directory
+        for target_dir in target_dir_list:
             os.makedirs(target_dir, exist_ok=True)