Skip to content

Commit

Permalink
Address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Tianhao-Gu committed Oct 25, 2023
1 parent 96c8a21 commit d0c615d
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 29 deletions.
18 changes: 9 additions & 9 deletions docs/generate_genome_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,25 +89,25 @@ ssh user_name@perlmutter-p1.nersc.gov

* The default directory at NERSC for libraries is:
```text
/global/cfs/cdirs/kbase/collections/libraries/tool_name
/global/cfs/cdirs/kbase/collections/libraries/<tool_name>/<reference data version>
```
* Guidelines for downloading [GTDB-TK](https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data) reference data:
```commandline
# download the reference data to the default directory at NERSC
cd /global/cfs/cdirs/kbase/collections/libraries/gtdb_tk
wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_v2_data.tar.gz
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_v2_data.tar.gz (or, mirror)
tar xvzf gtdbtk_v2_data.tar.gz
cd /global/cfs/cdirs/kbase/collections/libraries/gtdb_tk/<reference data version>
wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/<reference data version>.tar.gz
tar xvzf <reference data version>.tar.gz
# update the 'versions.yaml' file with a new version and set its 'reference_db_version' accordingly
```
* Guidelines for downloading [CheckM2](https://github.com/chklovski/CheckM2#database) DIAMOND DB:
```commandline
# download the database to the default directory at NERSC
checkm2 database --download --path /global/cfs/cdirs/kbase/collections/libraries/checkm2/CheckM2_database_[checkm2_version]
checkm2 database --download --path /global/cfs/cdirs/kbase/collections/libraries/checkm2/<checkm2 version>
# update the 'versions.yaml' file with a new version and set its 'reference_db_version' accordingly
```
* Be sure to rename the DIAMOND DB file to the value of `DIAMOND_DB_NAME` specified in the [checkm2 Dockerfile](../src/loaders/compute_tools/checkm2/Dockerfile).
* To use an alternative tool library, specify its `reference_db_version` (the directory name, e.g. release207_v2, CheckM2_database, etc.) in the
[versions.yaml](../src/loaders/compute_tools/gtdb_tk/versions.yaml) file in each compute tool's [directory](../src/loaders/compute_tools).

* If a tool requires reference data/DB, make sure to set `reference_db_version` in the 'versions.yaml' file in that tool's compute tools directory.
Otherwise, the reference data/DB will not be mounted into the tool's Docker container.

3. Go to the collections project directory

Expand Down
6 changes: 3 additions & 3 deletions src/loaders/compute_tools/checkm2/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
versions:
- version: 0.1.0
date: 2023-07-19
reference_db_version: CheckM2_database_1.0.1
reference_db_version: 1.0.1
- version: 0.1.1
date: 2023-08-04
notes: |
- install jsonlines, pandas to support parsing of CheckM2 output
reference_db_version: CheckM2_database_1.0.1
reference_db_version: 1.0.1
- version: 0.1.2
date: 2023-10-24
notes: |
- update diamond reference DB default path
reference_db_version: CheckM2_database_1.0.1
reference_db_version: 1.0.1
8 changes: 4 additions & 4 deletions src/loaders/compute_tools/tool_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@ def extract_latest_version(file_path: str) -> str:
return latest_version['version']


def extract_latest_reference_db_path(file_path: str) -> str:
def extract_latest_reference_db_version(file_path: str) -> str:
"""
Extracts the latest reference database path from a YAML file by referencing the latest date specified within the file.
Extracts the latest reference database version from a YAML file by referencing the latest date specified within the file.
Args:
file_path (str): The path to the YAML file.
Returns:
str: The latest reference database path extracted from the YAML file.
None: If the latest version does not have a reference database path.
str: The latest reference database version extracted from the YAML file.
None: If the latest version does not have a reference database version.
"""

latest_ver = extract_latest_version(file_path)
Expand Down
22 changes: 9 additions & 13 deletions src/loaders/jobs/taskfarmer/task_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import src.loaders.jobs.taskfarmer.taskfarmer_common as tf_common
from src.loaders.common import loader_common_names
from src.loaders.common.loader_helper import make_collection_source_dir
from src.loaders.compute_tools.tool_version import extract_latest_version, extract_latest_reference_db_path
from src.loaders.compute_tools.tool_version import extract_latest_version, extract_latest_reference_db_version
from src.loaders.jobs.taskfarmer.taskfarmer_task_mgr import TFTaskManager, PreconditionError

'''
Expand Down Expand Up @@ -67,28 +67,24 @@
# volume name for the Docker containers (the internal container ref data mount directory)
TOOL_IMG_VOLUME_NAME = '/reference_data'

TOOLS_REQUIRING_VOLUME_MOUNT = ['checkm2', 'gtdb_tk']

LIBRARY_DIR = 'libraries' # subdirectory for the library files


def _retrieve_tool_volume(tool, root_dir):
# Retrieve the volume mapping for the specified tool.

if tool in TOOLS_REQUIRING_VOLUME_MOUNT:
current_dir = os.path.dirname(os.path.abspath(__file__))
compute_tools_dir = os.path.join(current_dir, COMPUTE_TOOLS_DIR)
version_file = os.path.join(compute_tools_dir, tool, VERSION_FILE)
ref_db_path = extract_latest_reference_db_path(version_file)
current_dir = os.path.dirname(os.path.abspath(__file__))
compute_tools_dir = os.path.join(current_dir, COMPUTE_TOOLS_DIR)
version_file = os.path.join(compute_tools_dir, tool, VERSION_FILE)
ref_db_version = extract_latest_reference_db_version(version_file)

if not ref_db_path:
raise ValueError(f'No reference database path found for tool {tool}.')
ref_db_path_abs = os.path.join(root_dir, LIBRARY_DIR, tool, ref_db_path)
return {ref_db_path_abs: TOOL_IMG_VOLUME_NAME}
else:
if not ref_db_version:
# No reference database version is specified for the tool (e.g. microtrait, mash), so no volume is mounted.
return dict()

ref_db_path_abs = os.path.join(root_dir, LIBRARY_DIR, tool, ref_db_version)
return {ref_db_path_abs: TOOL_IMG_VOLUME_NAME}


def _pull_image(image_str, job_dir):
"""
Expand Down

0 comments on commit d0c615d

Please sign in to comment.