Skip to content
This repository has been archived by the owner on Sep 7, 2023. It is now read-only.

Commit

Permalink
Addtest #49 goea test code (#50)
Browse files Browse the repository at this point in the history
* Fix testable code

- Transfer code
  FastDTLgoea.py -> args.py, out_path.py, goea.py
- Fix og_go_association.py bad arguments implements

* Add GOEA test data

* Add GOEA test data fixture

* Add GOEA test code

* Add GraphViz install step

GOEA tool "goatools" use GraphViz for plot
  • Loading branch information
moshi4 authored Oct 17, 2021
1 parent 6a21a47 commit f48e190
Show file tree
Hide file tree
Showing 25 changed files with 626,360 additions and 263 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ jobs:
- name: Install Dependencies
run: poetry install -n

- name: Install GraphViz
run: sudo apt install -y graphviz

- name: Run black format check
run: poetry run black src tests --check --diff --verbose --exclude src/fastdtlmapper/bin

Expand Down
99 changes: 99 additions & 0 deletions src/fastdtlmapper/goea/args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import argparse
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional


@dataclass
class Args:
"""Parse Arguments Class"""

indir: Path
plot_pvalue_thr: float
plot_max_num: int
plot_format: str
plot_color: str
use_adjusted_pvalue: bool


def get_args(argv: Optional[List[str]] = None) -> Args:
"""Get argument values
Returns:
argparse.Namespace: argument values
"""
parser = argparse.ArgumentParser(
description="Gain/Loss genes GOEA(GO Enrichment Analysis) tool"
)
parser.add_argument(
"-i",
"--indir",
required=True,
type=Path,
help="FastDTLmapper result directory",
metavar="IN",
)
default_plot_pvalue_thr = 0.05
parser.add_argument(
"--plot_pvalue_thr",
type=float,
default=default_plot_pvalue_thr,
help=f"Plot GOterm pvalue threshold (Default: {default_plot_pvalue_thr})",
metavar="",
)
default_max_plot_num = 10
parser.add_argument(
"--plot_max_num",
type=int,
default=default_max_plot_num,
help=f"Plot GOterm max number (Default: {default_max_plot_num})",
metavar="",
)
default_plot_format = "png"
available_format = ["png", "svg", "jpg", "pdf"]
parser.add_argument(
"--plot_format",
type=str,
default=default_plot_format,
choices=available_format,
help=f"Plot file format [{'|'.join(available_format)}] "
+ f"(Default: '{default_plot_format}')",
metavar="",
)
parser.add_argument(
"--plot_color",
type=str,
default="",
help="Plot specified hexcolor [e.g. '1affdb'] "
+ "(Default: yellow to red gradient color)",
metavar="",
)
parser.add_argument(
"--adjusted_pvalue",
help="Use BH adjusted pvalue for plot threshold",
action="store_true",
)

args = parser.parse_args(argv)

# Plot hexcolor check
def is_valid_hexcolor(hexcolor: str) -> bool:
search_result = re.search(r"^(?:[0-9a-fA-F]{3}){1,2}$", hexcolor)
return False if search_result is None else True

if args.plot_color and not is_valid_hexcolor(args.plot_color):
parser.error(
f"argument --plot_color: invalid hexcolor code '{args.plot_color}'."
)
else:
args.plot_color = "#" + args.plot_color if args.plot_color else ""

return Args(
args.indir,
args.plot_pvalue_thr,
args.plot_max_num,
args.plot_format,
args.plot_color,
args.adjusted_pvalue,
)
85 changes: 84 additions & 1 deletion src/fastdtlmapper/goea/goea.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import subprocess as sp
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Tuple, Union
from urllib import request

import numpy as np
Expand Down Expand Up @@ -249,3 +249,86 @@ def download_obo(obo_outfile: Path) -> None:

if not obo_outfile.is_file():
raise FileNotFoundError(f"'{obo_outfile}' not found. Download failure!!")

@staticmethod
def extract_significant_goea_result(
goea_result_file: Path,
pvalue_thr: float,
use_adjusted_pvalue: bool,
min_depth: int = 2,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Extract over and under significant goea result
Args:
goea_result_file (Path): GOEA result file
pvalue_thr (float): Pvalue threshold for extract
use_adjusted_pvalue (bool): Use BH adjusted pvalue or not
min_depth (int, optional): Minimum depth for extract. Defaults to 2.
Returns:
Tuple[pd.DataFrame, pd.DataFrame]: over/under extract dataframes
"""
df = pd.read_table(goea_result_file)
pvalue_column_name = "p_fdr_bh" if use_adjusted_pvalue else "p_uncorrected"
over_df = df[
(df["enrichment"] == "e")
& (df[pvalue_column_name] < pvalue_thr)
& (df["depth"] >= min_depth)
]
under_df = df[
(df["enrichment"] == "p")
& (df[pvalue_column_name] < pvalue_thr)
& (df["depth"] >= min_depth)
]
return (over_df, under_df)

@staticmethod
def format_significant_goea_dataframe(
goea_result_df: pd.DataFrame,
node_id: str,
gain_or_loss: str,
) -> pd.DataFrame:
"""Format significant GOterm dataframe for output
Args:
goea_result_df (pd.DataFrame): GOEA result dataframe
node_id (str): Node id
gain_or_loss (str): "gain" or "loss"
Returns:
pd.DataFrame: Formatted GOEA result dataframe
"""
# Rename columns
rename_list = [
"GO",
"GO_CATEGORY",
"OVER/UNDER",
"GO_NAME",
"RATIO_IN_STUDY",
"RATIO_IN_POP",
"PVALUE",
"DEPTH",
"STUDY_COUNT",
"BH_ADJUSTED_PVALUE",
"STUDY_ITEMS",
]
rename_dict = {b: a for b, a in zip(goea_result_df.columns, rename_list)}
goea_result_df = goea_result_df.rename(columns=rename_dict)
# Add columns
goea_result_df["NODE_ID"] = node_id
goea_result_df["GAIN/LOSS"] = gain_or_loss
# Replace OVER/UNDER value ("e" -> "over", "p" -> "under")
goea_result_df = goea_result_df.replace(
{"OVER/UNDER": {"e": "over", "p": "under"}}
)
# Return reorder columns dataframe
return goea_result_df[
[
"NODE_ID",
"GAIN/LOSS",
"GO_CATEGORY",
"OVER/UNDER",
"GO",
*rename_list[3:],
]
]
36 changes: 11 additions & 25 deletions src/fastdtlmapper/goea/og_go_association.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,54 +13,40 @@ class OgGoAssociation:
annotation_file_list: List[Path]
go_define_ratio_thr: float = 0.5

def __post_init__(self):
def load_og_go_association(self):
"""Load OG & GOterms association"""
# Get OrthologGroup(OG) ID & Gene ID association
self.og_id2gene_id_list = self._get_og_id2gene_id_association(
self.og_gene_list_file
)
self.og_id2gene_id_list = self._get_og_id2gene_id_association()
# Get Gene ID & GO ID association
self.gene_id2go_id_set = self._get_gene_id2go_id_association(
self.annotation_file_list
)
self.gene_id2go_id_set = self._get_gene_id2go_id_association()
# Get OG ID & GO ID association
self.og_id2go_id_list = self._get_og_id2go_id_association(
self.og_id2gene_id_list, self.gene_id2go_id_set, self.go_define_ratio_thr
self.og_id2gene_id_list, self.gene_id2go_id_set
)

def _get_og_id2gene_id_association(
self, og_gene_file: Path
) -> Dict[str, List[str]]:
def _get_og_id2gene_id_association(self) -> Dict[str, List[str]]:
"""Get OG(OrthologGroup) ID & Gene ID association
Args:
og_gene_file (Path): OrthoFinder OG's gene list file path
Returns:
Dict[str, List[str]]: OG ID & Gene ID association dict
"""
og_id2gene_id_list = {}
with open(og_gene_file) as f:
with open(self.og_gene_list_file) as f:
line_list = f.read().splitlines()
for line in line_list:
og_id = line[0:9]
gene_id_list = line[11:].split(" ")
og_id2gene_id_list[og_id] = gene_id_list
return og_id2gene_id_list

def _get_gene_id2go_id_association(
self,
annotation_file_list: List[Path],
) -> Dict[str, Set[str]]:
def _get_gene_id2go_id_association(self) -> Dict[str, Set[str]]:
"""Get Gene ID & GO ID association
Args:
annotation_file_list (List[Path]): Interproscan annnotation file path list
Returns:
Dict[str, Set[str]]: Gene ID & GO ID association dict
"""
gene_id2go_id_set = defaultdict(set)
for annotation_file in annotation_file_list:
for annotation_file in self.annotation_file_list:
with open(annotation_file) as f:
reader = csv.reader(f, delimiter="\t")
for row in reader:
Expand All @@ -74,7 +60,6 @@ def _get_og_id2go_id_association(
self,
og_id2gene_id_list: Dict[str, List[str]],
gene_id2go_id_set: Dict[str, Set[str]],
go_define_ratio_thr: float,
) -> Dict[str, List[str]]:
"""Get OG ID & GO ID association
Expand All @@ -93,7 +78,7 @@ def _get_og_id2go_id_association(
go_id_list.extend(list(gene_id2go_id_set[gene_id]))
for go_id in set(go_id_list):
go_id_count = go_id_list.count(go_id)
if go_id_count >= len(go_id_list) * go_define_ratio_thr:
if go_id_count >= len(gene_id_list) * self.go_define_ratio_thr:
og_id2go_id_list[og_id].append(go_id)
return og_id2go_id_list

Expand All @@ -103,6 +88,7 @@ def write_og2go_association(self, og2go_association_file: Path) -> None:
Args:
og2go_association_file (Path): OG & GO association output file path
"""
self.load_og_go_association()
association_info = ""
for og_id, go_id_list in self.og_id2go_id_list.items():
og_gene_num = len(self.og_id2gene_id_list[og_id])
Expand Down
51 changes: 45 additions & 6 deletions src/fastdtlmapper/out_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class OutPath:
"""Output Path Class"""

rootdir: Union[str, Path]
goea_mode: bool = False

def __post_init__(self):
self.rootdir = Path(self.rootdir)
Expand All @@ -28,6 +29,8 @@ def __post_init__(self):
self.ortho_group_fasta_dir = self.ortho_dir / "Orthogroup_Sequences"
self.ortho_tmpwork_dir = self.user_fasta_dir / "OrthoFinder"

self.og_gene_file = self.ortho_dir / "Orthogroups" / "Orthogroups.txt"

# 02. DTL reconciliation directory
self.dtl_rec_dir = self.rootdir / "02_dtl_reconciliation"

Expand All @@ -40,6 +43,24 @@ def __post_init__(self):
self.all_trn_count_file = self.aggregate_map_dir / "all_transfer_gene_count.tsv"
self.all_trn_genes_file = self.aggregate_map_dir / "all_transfer_gene_list.tsv"

# 04. (Optional) GOEA analysis directory
self.goea_dir = self.rootdir / "04_functional_analysis"

self.go_annotation_dir = self.goea_dir / "go_annotation"
self.go_annotation_workdir = self.go_annotation_dir / "work"
self.go_enrichment_dir = self.goea_dir / "go_enrichment"
self.result_summary_dir = self.goea_dir / "result_summary"
self.result_summary_plot_dir = self.result_summary_dir / "significant_go_plot"

self.obo_file = self.goea_dir / "go-basic.obo"
self.og2go_association_file = self.go_enrichment_dir / "og2go_association.txt"
self.significant_go_list_file = (
self.result_summary_dir / "significant_go_list.tsv"
)
self.significant_go_count_file = (
self.result_summary_dir / "significant_go_count.tsv"
)

# Log commands list and command stderr
self.log_dir = self.rootdir / "log"
self.parallel_cmds_dir = self.log_dir / "parallel_cmds"
Expand All @@ -57,17 +78,35 @@ def __post_init__(self):

def _makedirs(self) -> None:
"""Create config output directories"""
# Check & Delete OrthoFinder previous work dir
if self.ortho_tmpwork_dir.exists():
shutil.rmtree(self.ortho_tmpwork_dir)
# Delete previous work dir
shutil.rmtree(self.ortho_tmpwork_dir, ignore_errors=True)
shutil.rmtree(self.go_enrichment_dir, ignore_errors=True)
shutil.rmtree(self.result_summary_dir, ignore_errors=True)

# Make configured output directory
for target_dir in (
# Default output dir
target_dir_list = [
self.user_fasta_dir,
self.user_tree_dir,
self.ortho_dir,
self.dtl_rec_dir,
self.aggregate_map_dir,
self.parallel_cmds_dir,
):
]
if self.goea_mode:
# Check rootdir is FastDTLmapper result dir or not
if not self.user_fasta_dir.exists():
err_msg = f"Input fasta directory '{self.user_fasta_dir}' not found!!\n"
err_msg += "Please specify FastDTLmapper result directory as input."
raise ValueError(err_msg)
# Add GOEA output dir
target_dir_list.extend(
[
self.go_annotation_workdir,
self.go_enrichment_dir,
self.result_summary_plot_dir,
]
)

# Make configured output directory
for target_dir in target_dir_list:
os.makedirs(target_dir, exist_ok=True)
Loading

0 comments on commit f48e190

Please sign in to comment.