Skip to content

Commit

Permalink
Merge pull request #682 from kbase/dev_add_eggnog_container
Browse files Browse the repository at this point in the history
add eggnog
  • Loading branch information
Tianhao-Gu authored Mar 15, 2024
2 parents 0b422d9 + 1337c37 commit 3696130
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 1 deletion.
31 changes: 31 additions & 0 deletions .github/workflows/build-push-eggnog-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Delegates building and pushing the eggNOG tool image to the shared
# reusable workflow (build-push-tool-images.yml).
name: Build & Push eggNOG Image to GHCR

on:
  # Pull requests that touch the eggNOG version file or either workflow file.
  pull_request:
    types: [opened, reopened, synchronize, ready_for_review]
    paths:
      - 'src/loaders/compute_tools/eggnog/versions.yaml'
      - '.github/workflows/build-push-eggnog-image.yml'
      - '.github/workflows/build-push-tool-images.yml'

  # Pushes to the long-lived branches that touch the same set of files.
  push:
    branches: [main, master, develop]
    paths:
      - 'src/loaders/compute_tools/eggnog/versions.yaml'
      - '.github/workflows/build-push-eggnog-image.yml'
      - '.github/workflows/build-push-tool-images.yml'

jobs:
  trigger-build-push:
    # Reusable workflow call; the tool name and version file are its inputs.
    uses: ./.github/workflows/build-push-tool-images.yml
    with:
      tool_name: eggnog
      version_file: 'src/loaders/compute_tools/eggnog/versions.yaml'
    # Forward the caller's secrets to the reusable workflow.
    secrets: inherit
3 changes: 3 additions & 0 deletions src/loaders/common/loader_common_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@
# The metadata file name created during the Mash run
MASH_METADATA = 'mash_run_metadata.json'

# The metadata file name created during the Eggnog run
EGGNOG_METADATA = 'eggnog_run_metadata.json'

# The fatal error file created if a data file cannot be successfully processed
FATAL_ERROR_FILE = "fatal_error.json"

Expand Down
34 changes: 34 additions & 0 deletions src/loaders/compute_tools/eggnog/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Image that runs eggNOG-mapper through the shared compute-tools entrypoint.
FROM continuumio/miniconda3:24.1.2-0

# Tool and interpreter versions used to build the conda environment.
ENV EGGNOG_VER=2.1.12 \
    PYTHON_VER=3.11
# Kept in its own instruction: a value set by an ENV instruction can only be
# substituted in instructions that come after it.
ENV CONDA_ENV=eggnog-$EGGNOG_VER

# Build the conda environment in a single layer so the package cache can be
# removed in the same layer that created it. --yes makes every step explicitly
# non-interactive instead of relying on the default answer at a closed stdin.
RUN conda config --add channels bioconda \
    && conda config --add channels conda-forge \
    && conda create --yes -n "$CONDA_ENV" python="$PYTHON_VER" \
    && conda install --yes -n "$CONDA_ENV" -c conda-forge -c bioconda eggnog-mapper="$EGGNOG_VER" \
    && conda install --yes -n "$CONDA_ENV" pandas=2.2.1 jsonlines=2.0.0 \
    && conda clean --all --yes

# Activate the environment for shells that read ~/.bashrc.
RUN echo "source activate $CONDA_ENV" >> ~/.bashrc

# eggNOG annotation DB is pre-downloaded at /global/cfs/cdirs/kbase/collections/libraries/eggnog/5.0.2
# following instructions at https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.12#setup
# Mount the annotation DB directory to /reference_data when running the container
ENV EGGNOG_DATA_DIR=/reference_data

# WORKDIR creates /app when it does not exist, so no separate mkdir is needed.
WORKDIR /app
COPY ./ /app/collections

# PYTHONPATH lets the `src.*` imports resolve from the copied repository.
# NOTE(review): PY_SCRIPT is presumably read by entrypoint.sh - confirm there.
ENV PYTHONPATH=/app/collections \
    PY_SCRIPT=/app/collections/src/loaders/compute_tools/eggnog/eggnog.py

# Drop the git metadata first (it slows down the recursive chmod if left in
# place); -f keeps the build working when the context has no .git directory.
# NOTE(review): the 777 mode is presumably needed because the container is run
# under an arbitrary non-root UID - confirm before tightening permissions.
RUN rm -rf /app/collections/.git \
    && chmod -R 777 /app/collections

ENTRYPOINT ["/app/collections/src/loaders/compute_tools/entrypoint.sh"]
10 changes: 10 additions & 0 deletions src/loaders/compute_tools/eggnog/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

# eggNOG tool

## Overview
The eggNOG tool is designed to utilize the collections infrastructure for execution and storage of result data.

This tool is exclusively intended for use with the CDM project.

The Collections parser program ([parse_tool_results.py](../../genome_collection/parse_tool_results.py)) will skip parsing the result files generated by this tool, as the result data is
specifically tailored for the CDM project.
58 changes: 58 additions & 0 deletions src/loaders/compute_tools/eggnog/eggnog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
Run eggNOG tool on a set of faa files.
This tool serves a distinct purpose separate from collection tools; instead, it is suited for CDM work.
Therefore, the parser program is not compatible with data generated by this tool.
"""
import json
from pathlib import Path

from src.loaders.common.loader_common_names import EGGNOG_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command

# Value passed to emapper.py's --itype option by _run_eggnog_single; also
# recorded as 'input_type' in the run metadata file.
INPUT_TYPE = 'proteins'
# Value passed to emapper.py's --cpu option by _run_eggnog_single.
THREADS = 8


def _run_eggnog_single(
        tool_safe_data_id: str,
        data_id: str,
        source_file: Path,
        output_dir: Path,
        debug: bool) -> None:
    """
    Run emapper.py on one input file and record the run in a metadata file.

    If the metadata file (EGGNOG_METADATA) already exists in output_dir, the
    input is treated as already processed and nothing is executed.

    :param tool_safe_data_id: not used by this function
    :param data_id: not used by this function
    :param source_file: the input file handed to emapper.py via -i
    :param output_dir: directory that receives the metadata file; together with
        the source file name it also forms the emapper.py output prefix
    :param debug: when true, output_dir is passed as the second argument of
        run_command, otherwise None is passed
    """
    run_record = output_dir / EGGNOG_METADATA
    if run_record.exists():
        print(f"Skipping {source_file} as it has already been processed.")
        return

    # Output prefix: result files are written next to the metadata file.
    # The 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs'
    # files are expected under this prefix.
    out_prefix = output_dir / source_file.name

    # Assemble the emapper.py invocation for a single genome.
    emapper_cmd = ['emapper.py', '-i', source_file, '-o', out_prefix]
    emapper_cmd += ['--itype', str(INPUT_TYPE), '--cpu', str(THREADS)]
    emapper_cmd += ['--excel',
                    '--sensmode', 'fast',
                    '--dmnd_iterate', 'no',
                    '--override']  # overwrite output files left by previous runs

    debug_target = output_dir if debug else None
    run_command(emapper_cmd, debug_target)

    # Written only after the command has returned; its presence is what marks
    # this input as processed on later invocations.
    run_info = {'source_file': str(source_file),
                'input_type': INPUT_TYPE}
    run_record.write_text(json.dumps(run_info, indent=4))


def main():
    """Create the "eggnog" ToolRunner and hand it the per-file eggNOG function."""
    ToolRunner("eggnog").parallel_single_execution(_run_eggnog_single, unzip=True)


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions src/loaders/compute_tools/eggnog/versions.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Version history for the eggNOG tool; this file is the version_file input of
# the eggNOG image build workflow.
versions:
  - version: 0.1.0
    date: 2024-03-13
    reference_db_version: 5.0.2
3 changes: 2 additions & 1 deletion src/loaders/jobs/taskfarmer/task_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,11 @@
256 cores.
'''

TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash']
TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog']

# estimated execution time (in minutes) for each tool to process a chunk of data
TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65},
'eggnog': {'chunk_size': 1000, 'exe_time': 65}, # TODO: update this value after performance testing
'default': {'chunk_size': 5000, 'exe_time': 60}}
NODE_TIME_LIMIT = 5 # hours # TODO: automatically calculate this based on tool execution time and NODE_THREADS
MAX_NODE_NUM = 100 # maximum number of nodes to use
Expand Down

0 comments on commit 3696130

Please sign in to comment.