Merge pull request #682 from kbase/dev_add_eggnog_container

add eggnog
kbase · Mar 15, 2024 · 3696130 · 3696130
2 parents 0b422d9 + 1337c37
commit 3696130
Show file tree

Hide file tree

Showing 7 changed files with 142 additions and 1 deletion.
diff --git a/.github/workflows/build-push-eggnog-image.yml b/.github/workflows/build-push-eggnog-image.yml
@@ -0,0 +1,31 @@
+name: Build & Push eggNOG Image to GHCR
+
+# Both triggers are filtered to the eggNOG versions manifest and the two workflow files involved.
+on:
+  pull_request:
+    types: [opened, reopened, synchronize, ready_for_review]
+    paths:
+      - "src/loaders/compute_tools/eggnog/versions.yaml"
+      - ".github/workflows/build-push-eggnog-image.yml"
+      - ".github/workflows/build-push-tool-images.yml"
+
+  push:
+    branches: [main, master, develop]
+    paths:
+      - "src/loaders/compute_tools/eggnog/versions.yaml"
+      - ".github/workflows/build-push-eggnog-image.yml"
+      - ".github/workflows/build-push-tool-images.yml"
+
+jobs:
+  # Delegates to the shared tool-image workflow, passing the tool name and its versions file.
+  trigger-build-push:
+    uses: ./.github/workflows/build-push-tool-images.yml
+    with:
+      tool_name: eggnog
+      version_file: "src/loaders/compute_tools/eggnog/versions.yaml"
+    secrets: inherit
diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py
@@ -94,6 +94,9 @@
 # The metadata file name created during the Mash run
 MASH_METADATA = 'mash_run_metadata.json'
 
+# The metadata file name created during the eggNOG run (written to the tool's output directory)
+EGGNOG_METADATA = 'eggnog_run_metadata.json'
+
 # The fatal error file created if a data file cannot be successfully processed
 FATAL_ERROR_FILE = "fatal_error.json"
 

diff --git a/src/loaders/compute_tools/eggnog/Dockerfile b/src/loaders/compute_tools/eggnog/Dockerfile
@@ -0,0 +1,34 @@
+FROM continuumio/miniconda3:24.1.2-0
+
+# Tool and interpreter versions. CONDA_ENV gets its own instruction because it expands
+# EGGNOG_VER, which has to be defined by an earlier instruction.
+ENV EGGNOG_VER=2.1.12 \
+    PYTHON_VER=3.11
+ENV CONDA_ENV=eggnog-$EGGNOG_VER
+
+# `--add` prepends to the channel list, so conda-forge ends up with the highest priority.
+RUN conda config --add channels bioconda \
+    && conda config --add channels conda-forge
+
+# Create the environment non-interactively and drop conda's caches in the same layer,
+# so the downloaded package archives do not stay in the image.
+RUN conda create -y -n $CONDA_ENV python=$PYTHON_VER \
+    && conda install -y -n $CONDA_ENV -c conda-forge -c bioconda eggnog-mapper=$EGGNOG_VER \
+    && conda install -y -n $CONDA_ENV pandas=2.2.1 jsonlines=2.0.0 \
+    && conda clean --all --yes
+
+# Activate the environment in bash shells that read ~/.bashrc
+RUN echo "source activate $CONDA_ENV" >> ~/.bashrc
+
+# eggNOG annotation DB is pre-downloaded at /global/cfs/cdirs/kbase/collections/libraries/eggnog/5.0.2
+# following instructions at https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.12#setup
+# Mount the annotation DB directory to /reference_data when running the container
+ENV EGGNOG_DATA_DIR=/reference_data
+
+# WORKDIR creates /app, so no separate mkdir is needed
+WORKDIR /app
+COPY ./ /app/collections
+
+# .git is removed first because it slows down the recursive chmod; -f keeps the build working
+# when the build context has no .git directory.
+# NOTE(review): deleting .git here does not shrink the image (it is already in the COPY layer);
+# excluding it via .dockerignore would. The 777 mode is kept as-is - confirm it is still required.
+RUN rm -rf /app/collections/.git \
+    && chmod -R 777 /app/collections
+
+ENV PYTHONPATH=/app/collections \
+    PY_SCRIPT=/app/collections/src/loaders/compute_tools/eggnog/eggnog.py
+
+ENTRYPOINT ["/app/collections/src/loaders/compute_tools/entrypoint.sh"]
diff --git a/src/loaders/compute_tools/eggnog/README.md b/src/loaders/compute_tools/eggnog/README.md
@@ -0,0 +1,10 @@
+
+# eggNOG tool
+
+## Overview
+The eggNOG tool is designed to utilize the collections infrastructure for execution and storage of result data.
+
+This tool is exclusively intended for use with the CDM project.
+
+The Collections parser program ([parse_tool_results.py](../../genome_collection/parse_tool_results.py)) will skip parsing the result files generated by this tool, as the result data is
+specifically tailored for the CDM project.
diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py
@@ -0,0 +1,58 @@
+"""
+Run eggNOG tool on a set of faa files.
+
+This tool serves a purpose distinct from the collection tools: it is intended for CDM work.
+Therefore, the parser program is not compatible with data generated by this tool.
+
+"""
+import json
+from pathlib import Path
+
+from src.loaders.common.loader_common_names import EGGNOG_METADATA
+from src.loaders.compute_tools.tool_common import ToolRunner, run_command
+
+INPUT_TYPE = 'proteins'
+THREADS = 8
+
+
+def _run_eggnog_single(
+        tool_safe_data_id: str,
+        data_id: str,
+        source_file: Path,
+        output_dir: Path,
+        debug: bool) -> None:
+
+    metadata_file = output_dir / EGGNOG_METADATA
+    if metadata_file.exists():
+        print(f"Skipping {source_file} as it has already been processed.")
+        return
+
+    # RUN eggNOG for a single genome
+    command = ['emapper.py',
+               '-i', source_file,  # Input file.
+               '-o', output_dir / source_file.name,  # Output prefix.
+                                                     # Save result file to collectiondata directory. Expecting 'emapper.annotations', 'emapper.hits' and  'emapper.seed_orthologs' files.
+               '--itype', f'{INPUT_TYPE}',
+               '--cpu', f'{THREADS}',
+               '--excel',
+               '--sensmode', 'fast',
+               '--dmnd_iterate', 'no',
+               '--override'  # Overwrites output files if they exist from previous runs.
+               ]
+
+    run_command(command, output_dir if debug else None)
+
+    # Save run info to a metadata file in the output directory for parsing later
+    metadata = {'source_file': str(source_file),
+                'input_type': INPUT_TYPE}
+    with open(metadata_file, 'w') as f:
+        json.dump(metadata, f, indent=4)
+
+
+def main():
+    runner = ToolRunner("eggnog")
+    runner.parallel_single_execution(_run_eggnog_single, unzip=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/loaders/compute_tools/eggnog/versions.yaml b/src/loaders/compute_tools/eggnog/versions.yaml
@@ -0,0 +1,4 @@
+versions:
+  - version: 0.1.0
+    date: 2024-03-13
+    # NOTE(review): presumably the release of the eggNOG annotation DB used as reference data - confirm
+    reference_db_version: 5.0.2
diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py
@@ -49,10 +49,11 @@
 256 cores.
 '''
 
-TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash']
+TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog']
 
 # estimated execution time (in minutes) for each tool to process a chunk of data
 TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65},
+             'eggnog': {'chunk_size': 1000, 'exe_time': 65},  # TODO: update this value after performance testing
              'default': {'chunk_size': 5000, 'exe_time': 60}}
 NODE_TIME_LIMIT = 5  # hours  # TODO: automatically calculate this based on tool execution time and NODE_THREADS
 MAX_NODE_NUM = 100  # maximum number of nodes to use