From 902c639c58be26576047a920911c09152e487bf9 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 26 Aug 2024 08:03:28 -0500 Subject: [PATCH 1/3] add busco tool --- .github/workflows/build-push-busco-image.yml | 31 +++++++ RELEASE_NOTES.md | 2 +- src/loaders/compute_tools/busco/Dockerfile | 33 ++++++++ src/loaders/compute_tools/busco/busco.py | 81 +++++++++++++++++++ src/loaders/compute_tools/busco/versions.yaml | 12 +++ src/loaders/jobs/taskfarmer/task_generator.py | 11 ++- 6 files changed, 166 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/build-push-busco-image.yml create mode 100644 src/loaders/compute_tools/busco/Dockerfile create mode 100644 src/loaders/compute_tools/busco/busco.py create mode 100644 src/loaders/compute_tools/busco/versions.yaml diff --git a/.github/workflows/build-push-busco-image.yml b/.github/workflows/build-push-busco-image.yml new file mode 100644 index 00000000..241bce58 --- /dev/null +++ b/.github/workflows/build-push-busco-image.yml @@ -0,0 +1,31 @@ +name: Build & Push BUSCO Image to GHCR + +on: + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + paths: + - 'src/loaders/compute_tools/busco/versions.yaml' + - '.github/workflows/build-push-busco-image.yml' + - '.github/workflows/build-push-tool-images.yml' + + push: + branches: + - main + - master + - develop + paths: + - 'src/loaders/compute_tools/busco/versions.yaml' + - '.github/workflows/build-push-busco-image.yml' + - '.github/workflows/build-push-tool-images.yml' + +jobs: + trigger-build-push: + uses: ./.github/workflows/build-push-tool-images.yml + with: + tool_name: busco + version_file: 'src/loaders/compute_tools/busco/versions.yaml' + secrets: inherit \ No newline at end of file diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 920d0d7a..2891730f 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,7 +2,7 @@ ## 0.1.3 -* Added BBMap tool to the CDM pipeline. +* Added BBMap and BUSCO tools to the CDM pipeline. 
* Included metadata file generation after each tool's execution. * Updated Python library dependencies to the latest versions. * Standardized thread management logic across all tools. diff --git a/src/loaders/compute_tools/busco/Dockerfile b/src/loaders/compute_tools/busco/Dockerfile new file mode 100644 index 00000000..e17ee00b --- /dev/null +++ b/src/loaders/compute_tools/busco/Dockerfile @@ -0,0 +1,33 @@ +FROM continuumio/miniconda3:24.5.0-0 + +# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_busco_single method is updated +ARG BUSCO_VER=5.7.1 +ENV CONDA_ENV busco-$BUSCO_VER + +# Add Bioconda and Conda-Forge channels +RUN conda config --add channels bioconda +RUN conda config --add channels conda-forge + +# Install BUSCO +# Certain dependencies (e.g., dendropy, sepp) are only compatible with Python versions up to 3.9. +ARG PYTHON_VER=3.9 +RUN conda create -n $CONDA_ENV python=$PYTHON_VER +RUN conda install -n $CONDA_ENV pandas=2.2.2 jsonlines=4.0.0 mamba=1.5.8 pyyaml=6.0.1 +RUN conda run -n $CONDA_ENV mamba install -c bioconda -c conda-forge -y busco=$BUSCO_VER + +# Activate the environment +RUN echo "source activate $CONDA_ENV" >> ~/.bashrc + +# Set up directories +RUN mkdir -p /app +COPY ./ /app/collections +RUN rm -r /app/collections/.git + +ENV PYTHONPATH /app/collections +WORKDIR /app + +ENV PY_SCRIPT=/app/collections/src/loaders/compute_tools/busco/busco.py + +RUN chmod -R 777 /app/collections + +ENTRYPOINT ["/app/collections/src/loaders/compute_tools/entrypoint.sh"] diff --git a/src/loaders/compute_tools/busco/busco.py b/src/loaders/compute_tools/busco/busco.py new file mode 100644 index 00000000..a4f7f151 --- /dev/null +++ b/src/loaders/compute_tools/busco/busco.py @@ -0,0 +1,81 @@ +""" +Run BUSCO tool on a set of fna files. + +This tool serves a distinct purpose separate from collection tools; instead, it is suited for CDM work. 
+Therefore, the parser program is not compatible with data generated by this tool. + +""" +import os +import time +from pathlib import Path + +from src.loaders.common.loader_common_names import TOOL_METADATA +from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata +from src.loaders.compute_tools.tool_version import extract_latest_reference_db_version + + +def _run_busco_single( + tool_safe_data_id: str, + data_id: str, + source_file: Path, + output_dir: Path, + threads_per_tool_run: int, + debug: bool) -> None: + start = time.time() + print(f'Start executing BUSCO for {data_id}') + + metadata_file = output_dir / TOOL_METADATA + if metadata_file.exists(): + print(f"Skipping {source_file} as it has already been processed.") + return + + current_dir = os.path.dirname(os.path.abspath(__file__)) + version_file = os.path.join(current_dir, 'versions.yaml') + ref_db_version = extract_latest_reference_db_version(version_file) + + command = [ + 'busco', + '-i', str(source_file), + '-o', data_id, + '--out_path', str(output_dir), + '--datasets_version', ref_db_version, + '--download_path', '/reference_data', + '-c', str(threads_per_tool_run), + '--auto-lineage-prok', + '-m', 'genome', + '-f', + '--augustus', + ] + + run_command(command, output_dir if debug else None) + + end_time = time.time() + run_time = end_time - start + print( + f'Used {round(run_time / 60, 2)} minutes to execute BUSCO for {data_id}') + + # Save run info to a metadata file in the output directory for parsing later + additional_metadata = { + 'source_file': str(source_file), + 'data_id': data_id, + "reference_db": { + "version": "odb10", + }, + } + create_tool_metadata( + output_dir, + tool_name="busco", + version="5.7.1", + command=command, + run_time=round(run_time, 2), + batch_size=1, + additional_metadata=additional_metadata) + + +def main(): + runner = ToolRunner("busco") + runner.parallel_single_execution(_run_busco_single, unzip=True) + + +if __name__ == 
"__main__": + main() diff --git a/src/loaders/compute_tools/busco/versions.yaml b/src/loaders/compute_tools/busco/versions.yaml new file mode 100644 index 00000000..dd25de07 --- /dev/null +++ b/src/loaders/compute_tools/busco/versions.yaml @@ -0,0 +1,12 @@ +# This tool serves a distinct purpose separate from collection tools; instead, it is suited for CDM work. +# Therefore, the parser program is not compatible with data generated by this tool. + +versions: + - version: 0.1.0 + date: 2024-08-22 + notes: | + - initial BUSCO implementation + reference_db_version: odb10 + +#Please keep this reminder at the end of this file +#NOTE: If the db verllsion changes, ensure the metadata information saved after running the tool in the _run_busco_single method is updated \ No newline at end of file diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 35231696..2054752d 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -38,10 +38,15 @@ --force Force overwrite of existing job directory --source_file_ext SOURCE_FILE_EXT Select files from source data directory that match the given extension. - + +TODO: The recommended approach by NERSC for running tasks with intensive I/O tools (most of our tools) is to utilize +the scratch directory. Before executing the task, source data and reference libraries should be copied to the scratch +directory. Soft links (such as for collection sources) should be created as needed. Once the task is complete, +the results should be copied back to the user's directory. 
For more information, refer to the NERSC documentation: +https://docs.nersc.gov/filesystems/perlmutter-scratch/ ''' -TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog', 'bbmap'] +TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog', 'bbmap', 'busco'] NODE_TIME_LIMIT_DEFAULT = 5 # hours # Used as THREADS variable in the batch script which controls the number of parallel tasks per node @@ -62,10 +67,10 @@ # if no specific metadata is provided for a tool, the default values are used. TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65, 'tasks_per_node': 4, 'threads_per_tool_run': 32}, 'eggnog': {'chunk_size': 100, 'exe_time': 15, 'node_time_limit': 0.5}, # Memory intensive tool - reserve more nodes with less node reservation time + 'busco': {'chunk_size': 50, 'exe_time': 90, 'node_time_limit': 1.5}, # 1.5 minutes per genome with a single task per node on the user's drive. TODO: Aim to test multi-threading per node along with scratch execution, and adjust `tasks_per_node` accordingly. 
'default': {'chunk_size': 5000, 'exe_time': 60}} MAX_NODE_NUM = 100 # maximum number of nodes to use - REGISTRY = 'ghcr.io/kbase/collections' VERSION_FILE = 'versions.yaml' COMPUTE_TOOLS_DIR = '../../compute_tools' # relative to task_generator.py From f59a74ad5b33abf35b18ec4014abe0a1b6f02d56 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 26 Aug 2024 13:17:06 -0500 Subject: [PATCH 2/3] address comments --- src/loaders/compute_tools/busco/Dockerfile | 2 ++ src/loaders/compute_tools/busco/busco.py | 4 +++- src/loaders/compute_tools/busco/versions.yaml | 5 +---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/loaders/compute_tools/busco/Dockerfile b/src/loaders/compute_tools/busco/Dockerfile index e17ee00b..33f4851b 100644 --- a/src/loaders/compute_tools/busco/Dockerfile +++ b/src/loaders/compute_tools/busco/Dockerfile @@ -13,6 +13,8 @@ RUN conda config --add channels conda-forge ARG PYTHON_VER=3.9 RUN conda create -n $CONDA_ENV python=$PYTHON_VER RUN conda install -n $CONDA_ENV pandas=2.2.2 jsonlines=4.0.0 mamba=1.5.8 pyyaml=6.0.1 +# Suggestions from BUSCO team to use mamba for speeding up the installation process: +# https://busco.ezlab.org/busco_userguide.html#installation-with-conda RUN conda run -n $CONDA_ENV mamba install -c bioconda -c conda-forge -y busco=$BUSCO_VER # Activate the environment diff --git a/src/loaders/compute_tools/busco/busco.py b/src/loaders/compute_tools/busco/busco.py index a4f7f151..0558fa82 100644 --- a/src/loaders/compute_tools/busco/busco.py +++ b/src/loaders/compute_tools/busco/busco.py @@ -33,6 +33,8 @@ def _run_busco_single( version_file = os.path.join(current_dir, 'versions.yaml') ref_db_version = extract_latest_reference_db_version(version_file) + # Please refer to https://docs.google.com/document/d/15yV-S41Iqe20F-I2MRLWdzJwVdr8QKUfZPw7oq8WvB0/edit#heading=h.elgudks5mtxu + # for more information on the BUSCO command options we are using here. 
command = [ 'busco', '-i', str(source_file), @@ -59,7 +61,7 @@ def _run_busco_single( 'source_file': str(source_file), 'data_id': data_id, "reference_db": { - "version": "odb10", + "version": ref_db_version, }, } create_tool_metadata( diff --git a/src/loaders/compute_tools/busco/versions.yaml b/src/loaders/compute_tools/busco/versions.yaml index dd25de07..400bbdca 100644 --- a/src/loaders/compute_tools/busco/versions.yaml +++ b/src/loaders/compute_tools/busco/versions.yaml @@ -6,7 +6,4 @@ versions: date: 2024-08-22 notes: | - initial BUSCO implementation - reference_db_version: odb10 - -#Please keep this reminder at the end of this file -#NOTE: If the db verllsion changes, ensure the metadata information saved after running the tool in the _run_busco_single method is updated \ No newline at end of file + reference_db_version: odb10 \ No newline at end of file From 28fd7478b6c6bb774f41e77891af9f33d705a660 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 26 Aug 2024 13:31:11 -0500 Subject: [PATCH 3/3] update a comment --- src/loaders/jobs/taskfarmer/task_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 2054752d..df6a1926 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -61,7 +61,7 @@ # for single genome tools, such as microtrait and mash, the chunk_size is the number of genomes to process in a # serial manner # exe_time is the estimated execution time for a single task (default is 60 minutes) -# threads_per_tool_run is the number of threads to use for each tool execution (default is 32) +# threads_per_tool_run is the number of threads to use for each tool execution (default is SYSTEM_CPU_CORES (256) / number of parallel tasks per node) # tasks_per_node is the number of parallel tasks to run on a node (default is 1) # node_time_limit is the time limit for the node we 
reserved for the task (default is 5 hours) # if no specific metadata is provided for a tool, the default values are used.