From e1038e395c030a4d89966af3831818d9672377d5 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 13 Mar 2024 16:30:13 -0500 Subject: [PATCH 01/50] add eggnog --- src/loaders/common/loader_common_names.py | 3 ++ src/loaders/compute_tools/eggnog/Dockerfile | 34 ++++++++++++++ src/loaders/compute_tools/eggnog/eggnog.py | 46 +++++++++++++++++++ .../compute_tools/eggnog/versions.yaml | 3 ++ 4 files changed, 86 insertions(+) create mode 100644 src/loaders/compute_tools/eggnog/Dockerfile create mode 100644 src/loaders/compute_tools/eggnog/eggnog.py create mode 100644 src/loaders/compute_tools/eggnog/versions.yaml diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index 7838ac971..1255045fc 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -94,6 +94,9 @@ # The metadata file name created during the Mash run MASH_METADATA = 'mash_run_metadata.json' +# The metadata file name created during the Mash run +EGGNOG_METADATA = 'eggnog_run_metadata.json' + # The fatal error file created if a data file cannot be successfully processed FATAL_ERROR_FILE = "fatal_error.json" diff --git a/src/loaders/compute_tools/eggnog/Dockerfile b/src/loaders/compute_tools/eggnog/Dockerfile new file mode 100644 index 000000000..54b0d6338 --- /dev/null +++ b/src/loaders/compute_tools/eggnog/Dockerfile @@ -0,0 +1,34 @@ +FROM continuumio/miniconda3:24.1.2-0 + +ENV EGGNOG_VER 2.1.12 +ENV CONDA_ENV eggnog-$EGGNOG_VER +ENV PYTHON_VER 3.11 + +RUN conda config --add channels bioconda +RUN conda config --add channels conda-forge + +RUN conda create -n $CONDA_ENV python=$PYTHON_VER +RUN conda install -n $CONDA_ENV -c conda-forge -c bioconda eggnog-mapper=$EGGNOG_VER +RUN conda install -n $CONDA_ENV pandas=2.2.1 jsonlines=2.0.0 + +RUN echo "source activate $CONDA_ENV" >> ~/.bashrc + +# eggNOG annotation DB is pre-downloaded at /global/cfs/cdirs/kbase/collections/libraries/eggnog/5.0.2 +# following instructions at https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.12#setup +# Mount the annotation DB directory to /reference_data when running the container +ENV EGGNOG_DATA_DIR /reference_data + +RUN mkdir -p /app +COPY ./ /app/collections +# slows down that chmod step if left in place +RUN rm -r /app/collections/.git + +ENV PYTHONPATH /app/collections + +WORKDIR /app + +ENV PY_SCRIPT=/app/collections/src/loaders/compute_tools/eggnog/eggnog.py + +RUN chmod -R 777 /app/collections + +ENTRYPOINT ["/app/collections/src/loaders/compute_tools/entrypoint.sh"] \ No newline at end of file diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py new file mode 100644 index 000000000..6921d3d5d --- /dev/null +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -0,0 +1,46 @@ +""" +Run eggNOG tool on a set of faa files. +""" +import json +from pathlib import Path + +from src.loaders.common.loader_common_names import EGGNOG_METADATA +from src.loaders.compute_tools.tool_common import ToolRunner, run_command + +INPUT_TYPE = 'proteins' +THREADS = 128 + + +def _run_eggnog_single( + tool_safe_data_id: str, + data_id: str, + source_file: Path, + output_dir: Path, + debug: bool) -> None: + # RUN eggNOG for a single genome + command = ['emapper.py', + '-i', source_file, # Input file. + '-o', source_file, # Output prefix. + # Save result file to source file directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files. 
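+                # (Naming note, assuming eggnog-mapper 2.1.x defaults: the expected files above are written as
+                #  '<prefix>.emapper.annotations', '<prefix>.emapper.hits' and '<prefix>.emapper.seed_orthologs'.)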
+ '--itype', f'{INPUT_TYPE}', + '--cpu', f'{THREADS}', + '--override' # Overwrites output files if they exist from previous runs. + ] + + run_command(command, output_dir if debug else None) + + # Save run info to a metadata file in the output directory for parsing later + metadata_file = output_dir / EGGNOG_METADATA + metadata = {'source_file': str(source_file), + 'input_type': INPUT_TYPE} + with open(metadata_file, 'w') as f: + json.dump(metadata, f, indent=4) + + +def main(): + runner = ToolRunner("eggNOG") + runner.parallel_single_execution(_run_eggnog_single, unzip=True) + + +if __name__ == "__main__": + main() diff --git a/src/loaders/compute_tools/eggnog/versions.yaml b/src/loaders/compute_tools/eggnog/versions.yaml new file mode 100644 index 000000000..8702247eb --- /dev/null +++ b/src/loaders/compute_tools/eggnog/versions.yaml @@ -0,0 +1,3 @@ +versions: + - version: 0.1.0 + date: 2024-03-13 \ No newline at end of file From 1e534fba5c5d96f8bd94971291d1fe1c982891ee Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 13 Mar 2024 16:38:15 -0500 Subject: [PATCH 02/50] add auto img build --- .github/workflows/build-push-eggnog-image.yml | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/build-push-eggnog-image.yml diff --git a/.github/workflows/build-push-eggnog-image.yml b/.github/workflows/build-push-eggnog-image.yml new file mode 100644 index 000000000..4966fde75 --- /dev/null +++ b/.github/workflows/build-push-eggnog-image.yml @@ -0,0 +1,31 @@ +name: Build & Push eggNOG Image to GHCR + +on: + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + paths: + - 'src/loaders/compute_tools/eggnog/versions.yaml' + - '.github/workflows/build-push-eggnog-image.yml' + - '.github/workflows/build-push-tool-images.yml' + + push: + branches: + - main + - master + - develop + paths: + - 'src/loaders/compute_tools/eggnog/versions.yaml' + - '.github/workflows/build-push-eggnog-image.yml' + - '.github/workflows/build-push-tool-images.yml' + +jobs: + trigger-build-push: + uses: ./.github/workflows/build-push-tool-images.yml + with: + tool_name: eggnog + version_file: 'src/loaders/compute_tools/eggnog/versions.yaml' + secrets: inherit \ No newline at end of file From c3326a6c75059679f41f439c8a0854acc8783fd8 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 13 Mar 2024 16:43:05 -0500 Subject: [PATCH 03/50] add reference db --- src/loaders/compute_tools/eggnog/versions.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/loaders/compute_tools/eggnog/versions.yaml b/src/loaders/compute_tools/eggnog/versions.yaml index 8702247eb..ece3f5da3 100644 --- a/src/loaders/compute_tools/eggnog/versions.yaml +++ b/src/loaders/compute_tools/eggnog/versions.yaml @@ -1,3 +1,4 @@ versions: - version: 0.1.0 - date: 2024-03-13 \ No newline at end of file + date: 2024-03-13 + reference_db_version: 5.0.2 \ No newline at end of file From fdb3dbd8c73a02b02cee3d2da5dcdf35e5c00a25 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 13 Mar 2024 16:50:39 -0500 Subject: [PATCH 04/50] lower CPU --- src/loaders/compute_tools/eggnog/eggnog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 6921d3d5d..ca058bb4a 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -8,7 +8,7 @@ from src.loaders.compute_tools.tool_common import ToolRunner, run_command INPUT_TYPE = 
'proteins' -THREADS = 128 +THREADS = 16 def _run_eggnog_single( From 3c0a81890a929fbc5d995d4d606656f5e0588dba Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 13 Mar 2024 17:03:43 -0500 Subject: [PATCH 05/50] add taskfarmer --- src/loaders/jobs/taskfarmer/task_generator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index d7423e946..01ae1af4f 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -49,10 +49,11 @@ 256 cores. ''' -TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash'] +TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog'] # estimated execution time (in minutes) for each tool to process a chunk of data TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65}, + 'eggnog': {'chunk_size': 500, 'exe_time': 65}, # TODO: update this value after performance testing 'default': {'chunk_size': 5000, 'exe_time': 60}} NODE_TIME_LIMIT = 5 # hours # TODO: automatically calculate this based on tool execution time and NODE_THREADS MAX_NODE_NUM = 100 # maximum number of nodes to use From 5c73e964c1e12b010cfdacf49d2c7ec75619353b Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 13 Mar 2024 17:35:41 -0500 Subject: [PATCH 06/50] skip processed genomes --- src/loaders/compute_tools/eggnog/eggnog.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index ca058bb4a..0a961cf2c 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -17,6 +17,12 @@ def _run_eggnog_single( source_file: Path, output_dir: Path, debug: bool) -> None: + + metadata_file = output_dir / EGGNOG_METADATA + if metadata_file.exists(): + print(f"Skipping {source_file} as it has already been processed.") + return + # RUN eggNOG for a single genome command = ['emapper.py', '-i', source_file, # Input file. @@ -30,7 +36,6 @@ def _run_eggnog_single( run_command(command, output_dir if debug else None) # Save run info to a metadata file in the output directory for parsing later - metadata_file = output_dir / EGGNOG_METADATA metadata = {'source_file': str(source_file), 'input_type': INPUT_TYPE} with open(metadata_file, 'w') as f: From 921c8a8a4616c72d85a96039e00cb24cfa5bc7b2 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 13 Mar 2024 21:32:23 -0500 Subject: [PATCH 07/50] run with fast mode --- src/loaders/compute_tools/eggnog/eggnog.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 0a961cf2c..5d6356d0c 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -30,6 +30,9 @@ def _run_eggnog_single( # Save result file to source file directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files. '--itype', f'{INPUT_TYPE}', '--cpu', f'{THREADS}', + '-excel', + '--sensmode', 'fast', + '--dmnd_iterate', 'no', '--override' # Overwrites output files if they exist from previous runs. 
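+            # NOTE: '--sensmode fast' and '--dmnd_iterate no' above trade some annotation sensitivity for speed.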
] From fd8c22d1f4aac40b022ca99d43945e464ec2d0ec Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 13 Mar 2024 21:52:28 -0500 Subject: [PATCH 08/50] typo --- src/loaders/compute_tools/eggnog/eggnog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 5d6356d0c..a335f04e2 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -30,7 +30,7 @@ def _run_eggnog_single( # Save result file to source file directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files. '--itype', f'{INPUT_TYPE}', '--cpu', f'{THREADS}', - '-excel', + '--excel', '--sensmode', 'fast', '--dmnd_iterate', 'no', '--override' # Overwrites output files if they exist from previous runs. From ec4452378974d278004b625a0a0cb720fb38ed35 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 09:18:55 -0500 Subject: [PATCH 09/50] drop assembly only cases --- .../workspace_uploader/workspace_uploader.py | 203 +++------ .../workspace_uploader_test.py | 423 ++++-------------- 2 files changed, 153 insertions(+), 473 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 3d702e5fd..4f5c6d174 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -1,9 +1,10 @@ """ -usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_ver SOURCE_VER] - [--root_dir ROOT_DIR] [--load_id LOAD_ID] [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] - [--create_assembly_only] [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] - [--cbs_max_tasks CBS_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] [--gfu_service_ver GFU_SERVICE_VER] - [--keep_job_dir] [--as_catalog_admin] +usage: workspace_uploader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] + [--source_ver SOURCE_VER] [--root_dir ROOT_DIR] [--load_id LOAD_ID] + [--token_filepath TOKEN_FILEPATH] [--env {CI,NEXT,APPDEV,PROD}] + [--upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...]] [--batch_size BATCH_SIZE] + [--cbs_max_tasks CBS_MAX_TASKS] [--au_service_ver AU_SERVICE_VER] + [--gfu_service_ver GFU_SERVICE_VER] [--keep_job_dir] [--as_catalog_admin] PROTOTYPE - Upload files to the workspace service (WSS). Note that the uploader determines whether a genome is already uploaded in one of two ways. First it consults the *.yaml files in each genomes directory; if that file shows the genome has been uploaded it skips it @@ -35,11 +36,8 @@ If not provided, the token must be provided in the `KB_AUTH_TOKEN` environment variable. --env {CI,NEXT,APPDEV,PROD} KBase environment (default: PROD) - --create_assembly_only - Create only assembly object. If not set create Genome object using genbank files by default. --upload_file_ext UPLOAD_FILE_EXT [UPLOAD_FILE_EXT ...] - Upload only files that match given extensions. If not provided, uses the appropriate default extension - depending on the create_assembly_only flag + Upload only files that match given extensions. 
(default: ['genomic.gbff.gz']) --batch_size BATCH_SIZE Number of files to upload per batch (default: 2500) --cbs_max_tasks CBS_MAX_TASKS @@ -84,9 +82,8 @@ # setup KB_AUTH_TOKEN as env or provide a token_filepath in --token_filepath # export KB_AUTH_TOKEN="your-kb-auth-token" -_UPLOAD_ASSEMBLY_FILE_EXT = ["genomic.fna.gz"] _UPLOAD_GENOME_FILE_EXT = ["genomic.gbff.gz"] -_JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER = "/kb/module/work/tmp" +_JOB_DIR_IN_CONTAINER = "/kb/module/work/tmp" _UPLOADED_YAML = "uploaded.yaml" _WS_MAX_BATCH_SIZE = 10000 @@ -162,17 +159,12 @@ def _get_parser(): default="PROD", help="KBase environment", ) - optional.add_argument( - "--create_assembly_only", - action="store_true", - help="Create only assembly object. If not set create Genome object using genbank files by default.", - ) optional.add_argument( "--upload_file_ext", type=str, + default=_UPLOAD_GENOME_FILE_EXT, nargs="+", - help="Upload only files that match given extensions. If not provided, uses the appropriate default extension " - "depending on the create_assembly_only flag", + help="Upload only files that match given extensions.", ) optional.add_argument( "--batch_size", @@ -215,10 +207,6 @@ def _get_parser(): return parser -def _get_default_file_ext(create_assembly_only: bool) -> list[str]: - return _UPLOAD_ASSEMBLY_FILE_EXT if create_assembly_only else _UPLOAD_GENOME_FILE_EXT - - def _get_yaml_file_path(obj_dir: str) -> str: """ Get the uploaded.yaml file path from collections source directory. @@ -275,7 +263,7 @@ def _upload_genomes_to_workspace( # copy assembly file from tmp job dir to the sourcedata/NCBI directory container_assembly_path = Path(result_dict["assembly_path"]) # remove prefix of the assembly_path - relative_assembly_path = container_assembly_path.relative_to(_JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER) + relative_assembly_path = container_assembly_path.relative_to(_JOB_DIR_IN_CONTAINER) local_assembly_path = Path(job_data_dir) / relative_assembly_path if not os.path.exists(local_assembly_path): raise ValueError(f"Assembly file {local_assembly_path} does not exist") @@ -346,7 +334,6 @@ def _read_upload_status_yaml_file( workspace_id: int, load_id: str, obj_dir: str, - create_assembly_only: bool = True, ) -> tuple[dict[str, dict[int, list[str]]], bool]: """ Get metadata and upload status of a WS object from the uploaded.yaml file. @@ -357,8 +344,8 @@ def _read_upload_status_yaml_file( : assembly_upa: assembly_filename: - genome_upa: (can be None) - genome_filename: (can be None) + genome_upa: + genome_filename: """ if upload_env_key not in loader_common_names.KB_ENV: @@ -373,11 +360,7 @@ def _read_upload_status_yaml_file( workspace_dict = data.setdefault(upload_env_key, {}).setdefault(workspace_id, {}) - uploaded = load_id in workspace_dict - - # In the event of genome upload and an assembly with the identical load_id has already been uploaded. - if not create_assembly_only: - uploaded = uploaded and workspace_dict[load_id].get(_KEY_GENOME_UPA) + uploaded = load_id in workspace_dict and workspace_dict[load_id].get(_KEY_GENOME_UPA) return data, uploaded @@ -391,15 +374,13 @@ def _update_upload_status_yaml_file( """ Update the uploaded.yaml file in target genome_dir with newly uploaded WS object names and upa info. 
""" - is_genome = upload_result.is_genome - obj_tuple = upload_result.genome_tuple if is_genome else upload_result.assembly_tuple + obj_tuple = upload_result.genome_tuple data, uploaded = _read_upload_status_yaml_file( upload_env_key, workspace_id, load_id, obj_tuple.obj_coll_src_dir, - create_assembly_only=not is_genome, ) if uploaded: @@ -411,7 +392,7 @@ def _update_upload_status_yaml_file( _KEY_ASSEMBLY_UPA: upload_result.assembly_upa, _KEY_ASSEMBLY_FILENAME: upload_result.assembly_tuple.obj_name, _KEY_GENOME_UPA: upload_result.genome_upa, - _KEY_GENOME_FILENAME: upload_result.genome_tuple.obj_name if is_genome else None, + _KEY_GENOME_FILENAME: upload_result.genome_tuple.obj_name, } file_path = _get_yaml_file_path(obj_tuple.obj_coll_src_dir) @@ -425,7 +406,6 @@ def _fetch_objects_to_upload( load_id: str, collection_source_dir: str, upload_file_ext: list[str], - create_assembly_only: bool = True, ) -> tuple[int, dict[str, str]]: count = 0 wait_to_upload_objs = dict() @@ -444,7 +424,7 @@ def _fetch_objects_to_upload( and f.endswith(tuple(upload_file_ext)) ] - # Assembly and Genome (from genbank) object uploaders both only requires one file + # Genome (from genbank) object uploader only requires one file # Modify or skip this check if a different use case requires multiple files. if len(obj_file_list) != 1: raise ValueError( @@ -460,7 +440,6 @@ def _fetch_objects_to_upload( workspace_id, load_id, obj_dir, - create_assembly_only=create_assembly_only, ) if uploaded: @@ -480,7 +459,6 @@ def _query_workspace_with_load_id( workspace_id: int, load_id: str, obj_names: list[str], - assembly_objs_only: bool = True, ) -> tuple[list[Any], list[Any]]: if len(obj_names) > _WS_MAX_BATCH_SIZE: raise ValueError( @@ -492,25 +470,20 @@ def _query_workspace_with_load_id( if not uploaded_objs_info: return list(), list() - if assembly_objs_only: - _check_obj_type(workspace_id, load_id, uploaded_objs_info, {loader_common_names.OBJECTS_NAME_ASSEMBLY}) - assembly_objs_info = uploaded_objs_info - genome_objs_info = list() - else: - _check_obj_type(workspace_id, load_id, uploaded_objs_info, {loader_common_names.OBJECTS_NAME_GENOME}) - genome_objs_info = uploaded_objs_info - assembly_objs_ref = list() - for info in genome_objs_info: - try: - assembly_objs_ref.append(info[10]["Assembly Object"]) - except KeyError: - genome_ref = obj_info_to_upa(info) - raise ValueError( - f"Genome object {genome_ref} does not have an assembly object linked to it" - ) + _check_obj_type(workspace_id, load_id, uploaded_objs_info, {loader_common_names.OBJECTS_NAME_GENOME}) + genome_objs_info = uploaded_objs_info + assembly_objs_ref = list() + for info in genome_objs_info: + try: + assembly_objs_ref.append(info[10]["Assembly Object"]) + except KeyError: + genome_ref = obj_info_to_upa(info) + raise ValueError( + f"Genome object {genome_ref} does not have an assembly object linked to it" + ) - assembly_objs_spec = [{"ref": ref} for ref in assembly_objs_ref] - assembly_objs_info = ws.get_object_info3({"objects": assembly_objs_spec, "includeMetadata": 1})["infos"] + assembly_objs_spec = [{"ref": ref} for ref in assembly_objs_ref] + assembly_objs_info = ws.get_object_info3({"objects": assembly_objs_spec, "includeMetadata": 1})["infos"] return assembly_objs_info, genome_objs_info @@ -534,7 +507,6 @@ def _query_workspace_with_load_id_mass( load_id: str, obj_names: list[str], batch_size: int = _WS_MAX_BATCH_SIZE, - assembly_objs_only: bool = True, ) -> tuple[list[Any], list[Any]]: uploaded_assembly_objs_info, uploaded_genome_objs_info = [], [] 
@@ -544,8 +516,7 @@ def _query_workspace_with_load_id_mass( uploaded_genome_objs_info_batch) = _query_workspace_with_load_id(ws, workspace_id, load_id, - obj_names[i:i + batch_size], - assembly_objs_only=assembly_objs_only) + obj_names[i:i + batch_size]) uploaded_assembly_objs_info.extend(uploaded_assembly_objs_info_batch) uploaded_genome_objs_info.extend(uploaded_genome_objs_info_batch) @@ -578,17 +549,16 @@ def _post_process( """ Update the uploaded.yaml file in the genome directory with the object name and upa info. - If genome_tuple and genome_upa are provided, the function will also: + The function will also: Create a standard entry in sourcedata/workspace for each object. Hardlink to the original object file in sourcedata to avoid duplicating the file. Creates a softlink from new_dir in collectionssource to the contents of target_dir in sourcedata. """ - if upload_result.is_genome: - _process_genome_objects(collections_source_dir, - source_data_dir, - upload_result, - ) + _process_genome_objects(collections_source_dir, + source_data_dir, + upload_result, + ) # Update the 'uploaded.yaml' file, serving as a marker to indicate the successful upload of the object. # Ensure that this operation is the final step in the post-processing workflow @@ -632,23 +602,6 @@ def _process_genome_objects( loader_helper.create_softlink_between_dirs(new_dir, ws_source_data_dir) -def _process_batch_upload( - obj_tuples: list[WSObjTuple], - workspace_id: int, - load_id: str, - asu_client: AssemblyUtil, - gfu_client: GenomeFileUtil, - job_data_dir: str, - upload_assembly_only: bool = True, -) -> list[UploadResult]: - if upload_assembly_only: - upload_results = _upload_assemblies_to_workspace(asu_client, workspace_id, load_id, obj_tuples) - else: - upload_results = _upload_genomes_to_workspace(gfu_client, workspace_id, load_id, obj_tuples, job_data_dir) - - return upload_results - - def _download_assembly_fasta_file( asu_client: AssemblyUtil, genome_obj_info: list[Any], @@ -665,7 +618,7 @@ def _download_assembly_fasta_file( raise ValueError(f"Genome object {genome_obj_info[1]} does not have an assembly object linked to it") containerized_assembly_file = Path(asu_client.get_assembly_as_fasta({"ref": assembly_upa})["path"]) # remove prefix of the assembly_path - relative_assembly_path = containerized_assembly_file.relative_to(_JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER) + relative_assembly_path = containerized_assembly_file.relative_to(_JOB_DIR_IN_CONTAINER) local_assembly_file = Path(job_data_dir) / relative_assembly_path loader_helper.create_hardlink_between_files(coll_src_assembly_path, @@ -709,7 +662,6 @@ def _process_failed_uploads( obj_tuples: list[WSObjTuple], asu_client: AssemblyUtil, job_data_dir: str, - upload_assembly_only: bool = True ) -> list[UploadResult]: # figure out uploads that succeeded @@ -720,38 +672,28 @@ def _process_failed_uploads( uploaded_genome_obj_infos) = _query_workspace_with_load_id_mass(ws, workspace_id, load_id, - list(name2tuple.keys()), - assembly_objs_only=upload_assembly_only) - - if upload_assembly_only: - batch_uploaded_assembly_tuples = [name2tuple[info[1]] for info in uploaded_assembly_obj_infos] + list(name2tuple.keys()) + ) - for assembly_obj_info, assembly_tuple in zip(uploaded_assembly_obj_infos, batch_uploaded_assembly_tuples): - upload_result = UploadResult( - assembly_obj_info=assembly_obj_info, - assembly_tuple=assembly_tuple - ) - upload_results.append(upload_result) - else: - batch_uploaded_genome_tuples = [name2tuple[info[1]] for info in uploaded_genome_obj_infos] - 
batch_assembly_tuples = _build_assembly_tuples(batch_uploaded_genome_tuples, - uploaded_genome_obj_infos, - asu_client, - job_data_dir) - for (genome_obj_info, - assembly_obj_info, - genome_tuple, - assembly_tuple) in zip(uploaded_genome_obj_infos, - uploaded_assembly_obj_infos, - batch_uploaded_genome_tuples, - batch_assembly_tuples): - upload_result = UploadResult( - genome_obj_info=genome_obj_info, - assembly_obj_info=assembly_obj_info, - genome_tuple=genome_tuple, - assembly_tuple=assembly_tuple, - ) - upload_results.append(upload_result) + batch_uploaded_genome_tuples = [name2tuple[info[1]] for info in uploaded_genome_obj_infos] + batch_assembly_tuples = _build_assembly_tuples(batch_uploaded_genome_tuples, + uploaded_genome_obj_infos, + asu_client, + job_data_dir) + for (genome_obj_info, + assembly_obj_info, + genome_tuple, + assembly_tuple) in zip(uploaded_genome_obj_infos, + uploaded_assembly_obj_infos, + batch_uploaded_genome_tuples, + batch_assembly_tuples): + upload_result = UploadResult( + genome_obj_info=genome_obj_info, + assembly_obj_info=assembly_obj_info, + genome_tuple=genome_tuple, + assembly_tuple=assembly_tuple, + ) + upload_results.append(upload_result) return upload_results @@ -768,7 +710,6 @@ def _upload_objects_in_parallel( asu_client: AssemblyUtil, gfu_client: GenomeFileUtil, job_data_dir: str, - upload_assembly_only: bool = True, ) -> int: """ Upload objects to the target workspace in parallel using multiprocessing. @@ -785,7 +726,6 @@ def _upload_objects_in_parallel( asu_client: AssemblyUtil client gfu_client: GenomeFileUtil client job_data_dir: the job directory to store object files - upload_assembly_only: upload assembly only if True, otherwise upload genome only Returns: number of object files have been successfully uploaded from wait_to_upload_objs @@ -798,13 +738,12 @@ def _upload_objects_in_parallel( upload_results = list() for obj_tuples in _gen(wait_to_upload_objs, batch_size): try: - upload_results = _process_batch_upload(obj_tuples, - workspace_id, - load_id, - asu_client, - gfu_client, - job_data_dir, - upload_assembly_only=upload_assembly_only) + upload_results = _upload_genomes_to_workspace(gfu_client, + workspace_id, + load_id, + obj_tuples, + job_data_dir) + except Exception as e: traceback.print_exc() uploaded_fail = True @@ -814,8 +753,7 @@ def _upload_objects_in_parallel( load_id, obj_tuples, asu_client, - job_data_dir, - upload_assembly_only=upload_assembly_only) + job_data_dir) except Exception as e: print( f"WARNING: There are inconsistencies between " @@ -851,7 +789,7 @@ def _upload_objects_in_parallel( def _dict2tuple_list(objs_dict: dict[str, str]) -> list[WSObjTuple]: ws_object_tuple_list = [ WSObjTuple( - i[0], i[1], os.path.join(_JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER, i[0]) + i[0], i[1], os.path.join(_JOB_DIR_IN_CONTAINER, i[0]) ) for i in objs_dict.items() ] @@ -880,8 +818,7 @@ def main(): source_version = getattr(args, loader_common_names.SOURCE_VER_ARG_NAME) root_dir = getattr(args, loader_common_names.ROOT_DIR_ARG_NAME) token_filepath = args.token_filepath - create_assembly_only = args.create_assembly_only - upload_file_ext = args.upload_file_ext or _get_default_file_ext(create_assembly_only) + upload_file_ext = args.upload_file_ext batch_size = args.batch_size cbs_max_tasks = args.cbs_max_tasks au_service_ver = args.au_service_ver @@ -953,7 +890,6 @@ def main(): load_id, collection_source_dir, upload_file_ext, - create_assembly_only=create_assembly_only ) # set up workspace client @@ -968,7 +904,7 @@ def main(): wait_to_upload_tuples, 
AssemblyUtil(conf.callback_url, service_ver=au_service_ver, token=conf.token), conf.job_data_dir, - upload_assembly_only=create_assembly_only) + ) # fix inconsistencies between the workspace and the local yaml files if upload_results: @@ -982,7 +918,7 @@ def main(): source_dir, upload_result ) - obj_name = upload_result.genome_tuple.obj_name if upload_result.is_genome else upload_result.assembly_tuple.obj_name + obj_name = upload_result.genome_tuple.obj_name wait_to_upload_objs.pop(obj_name) print("Recovery process completed ...") @@ -1016,7 +952,6 @@ def main(): AssemblyUtil(conf.callback_url, service_ver=au_service_ver, token=conf.token), GenomeFileUtil(conf.callback_url, service_ver=gfu_service_ver, token=conf.token), conf.job_data_dir, - upload_assembly_only=create_assembly_only ) upload_time = (time.time() - start) / 60 diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index b416a3381..dcd6f9e3e 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -8,7 +8,6 @@ import pytest -from src.clients.AssemblyUtilClient import AssemblyUtil from src.clients.GenomeFileUtilClient import GenomeFileUtil from src.clients.workspaceClient import Workspace from src.common.common_helper import obj_info_to_upa @@ -131,8 +130,8 @@ class Params(NamedTuple): tmp_dir: str sourcedata_dir: str collection_source_dir: str - assembly_dirs: list[str] - target_files: list[str] + genome_dirs: list[str] + assembly_files: list[str] genbank_files: list[str] @@ -148,24 +147,24 @@ def setup_and_teardown(): sourcedata_dir.mkdir(parents=True) collection_source_dir.mkdir() - assembly_dirs, target_files, genbank_files = list(), list(), list() + genome_dirs, assembly_files, genbank_files = list(), list(), list() for assembly_dir_name, assembly_name, genome_name in zip(ASSEMBLY_DIR_NAMES, ASSEMBLY_NAMES, GEMOME_NAMES): target_dir_path = sourcedata_dir.joinpath(assembly_dir_name) target_dir_path.mkdir() - target_file_path = target_dir_path.joinpath(assembly_name) - target_file_path.touch() + assembly_file_path = target_dir_path.joinpath(assembly_name) + assembly_file_path.touch() genbank_file_path = target_dir_path.joinpath(genome_name) genbank_file_path.touch() new_dir_path = collection_source_dir.joinpath(assembly_dir_name) os.symlink( target_dir_path.resolve(), new_dir_path.resolve(), target_is_directory=True ) - assembly_dirs.append(str(new_dir_path)) - target_files.append(str(target_file_path)) + genome_dirs.append(str(new_dir_path)) + assembly_files.append(str(assembly_file_path)) genbank_files.append(str(genbank_file_path)) yield Params( - tmp_dir, sourcedata_dir, collection_source_dir, assembly_dirs, target_files, genbank_files + tmp_dir, sourcedata_dir, collection_source_dir, genome_dirs, assembly_files, genbank_files ) shutil.rmtree(tmp_dir, ignore_errors=True) @@ -173,31 +172,30 @@ def setup_and_teardown(): def test_get_yaml_file_path(setup_and_teardown): params = setup_and_teardown - assembly_dir = params.assembly_dirs[0] - yaml_path = workspace_uploader._get_yaml_file_path(assembly_dir) + genome_dir = params.genome_dirs[0] + yaml_path = workspace_uploader._get_yaml_file_path(genome_dir) - expected_yaml_path = os.path.join(assembly_dir, workspace_uploader._UPLOADED_YAML) + expected_yaml_path = os.path.join(genome_dir, workspace_uploader._UPLOADED_YAML) assert expected_yaml_path == yaml_path assert os.path.exists(yaml_path) def 
test_get_source_file(setup_and_teardown): params = setup_and_teardown - assembly_dir, assembly_name = params.assembly_dirs[0], ASSEMBLY_NAMES[0] - file_path = workspace_uploader._get_source_file(assembly_dir, assembly_name) + genome_dir, genome_name = params.genome_dirs[0], GEMOME_NAMES[0] + file_path = workspace_uploader._get_source_file(genome_dir, genome_name) - expected_target_file_path = os.path.abspath(params.target_files[0]) + expected_target_file_path = os.path.abspath(params.genbank_files[0]) assert expected_target_file_path == file_path def test_read_upload_status_yaml_file(setup_and_teardown): params = setup_and_teardown - assembly_dir = params.assembly_dirs[0] - assembly_name = ASSEMBLY_NAMES[0] + genome_dir = params.genome_dirs[0] - # test empty yaml file in assembly_dir + # test empty yaml file in genome_dir data, uploaded = workspace_uploader._read_upload_status_yaml_file( - "CI", 12345, "214", assembly_dir + "CI", 12345, "214", genome_dir ) expected_data = { @@ -209,26 +207,34 @@ def test_read_upload_status_yaml_file(setup_and_teardown): def test_update_upload_status_yaml_file(setup_and_teardown): params = setup_and_teardown - assembly_dir = params.assembly_dirs[0] + genome_dir = params.genome_dirs[0] assembly_name = ASSEMBLY_NAMES[0] assembly_tuple = workspace_uploader.WSObjTuple( - assembly_name, assembly_dir, "/path/to/file/in/AssembilyUtil" + assembly_name, genome_dir, "/path/to/file/in/AssembilyUtil" + ) + + genome_name = GEMOME_NAMES[0] + genome_tuple = workspace_uploader.WSObjTuple( + genome_name, genome_dir, "/path/to/file/in/GenomeFileUtil" ) - upload_result = UploadResult(assembly_tuple=assembly_tuple, assembly_obj_info=ASSEMBLY_OBJ_INFOS[0]) + upload_result = UploadResult(assembly_tuple=assembly_tuple, + assembly_obj_info=ASSEMBLY_OBJ_INFOS[0], + genome_tuple=genome_tuple, + genome_obj_info=GENOME_OBJ_INFOS[0]) workspace_uploader._update_upload_status_yaml_file( "CI", 12345, "214", upload_result ) data, uploaded = workspace_uploader._read_upload_status_yaml_file( - "CI", 12345, "214", assembly_dir, + "CI", 12345, "214", genome_dir, ) expected_data = { "CI": {12345: {"214": {"assembly_upa": "72231_60_1", "assembly_filename": assembly_name, - "genome_upa": None, - "genome_filename": None}}}} + "genome_upa": "72231_61_1", + "genome_filename": genome_name}}}} assert uploaded assert expected_data == data @@ -241,104 +247,77 @@ def test_update_upload_status_yaml_file(setup_and_teardown): def test_fetch_objects_to_upload(setup_and_teardown): params = setup_and_teardown - assembly_dirs = params.assembly_dirs + genome_dirs = params.genome_dirs collection_source_dir = params.collection_source_dir - count, wait_to_upload_assemblies = workspace_uploader._fetch_objects_to_upload( + count, wait_to_upload_genomes = workspace_uploader._fetch_objects_to_upload( "CI", 12345, "214", collection_source_dir, - workspace_uploader._UPLOAD_ASSEMBLY_FILE_EXT, + workspace_uploader._UPLOAD_GENOME_FILE_EXT, ) - expected_count = len(ASSEMBLY_NAMES) - expected_wait_to_upload_assemblies = { - assembly_name: assembly_dir - for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) + expected_count = len(GEMOME_NAMES) + expected_wait_to_upload_genomes = { + genome_name: genome_dir + for genome_name, genome_dir in zip(GEMOME_NAMES, genome_dirs) } assert expected_count == count - assert expected_wait_to_upload_assemblies == wait_to_upload_assemblies + assert expected_wait_to_upload_genomes == wait_to_upload_genomes - # let's assume these two assemly files are uploaded successfully + # 
let's assume these two genome files are uploaded successfully # Each UPLOADED_YAML file is also updated with the upa assigned from the workspace service - # Both assemnly files will be skipped in the next fetch_assemblies_to_upload call - upas = ["12345_58_1", "12345_58_2"] - for assembly_name, assembly_dir, upa in zip(ASSEMBLY_NAMES, assembly_dirs, upas): + # Both genome files will be skipped in the next fetch_assemblies_to_upload call + for genome_name, assembly_name, genome_dir in zip(GEMOME_NAMES, ASSEMBLY_NAMES, genome_dirs): assembly_tuple = workspace_uploader.WSObjTuple( - assembly_name, assembly_dir, "/path/to/file/in/AssembilyUtil" + assembly_name, genome_dir, "/path/to/file/in/AssembilyUtil" ) - upload_result = UploadResult(assembly_tuple=assembly_tuple, assembly_obj_info=ASSEMBLY_OBJ_INFOS[0]) + genome_tuple = workspace_uploader.WSObjTuple( + genome_name, genome_dir, "/path/to/file/in/GenomeFileUtil" + ) + upload_result = UploadResult(assembly_tuple=assembly_tuple, + assembly_obj_info=ASSEMBLY_OBJ_INFOS[0], + genome_tuple=genome_tuple, + genome_obj_info=GENOME_OBJ_INFOS[0]) workspace_uploader._update_upload_status_yaml_file( "CI", 12345, "214", upload_result ) ( new_count, - new_wait_to_upload_assemblies, + new_wait_to_upload_genomes, ) = workspace_uploader._fetch_objects_to_upload( "CI", 12345, "214", collection_source_dir, - workspace_uploader._UPLOAD_ASSEMBLY_FILE_EXT, + workspace_uploader._UPLOAD_GENOME_FILE_EXT, ) assert expected_count == new_count - assert {} == new_wait_to_upload_assemblies + assert {} == new_wait_to_upload_genomes def test_prepare_skd_job_dir_to_upload(setup_and_teardown): params = setup_and_teardown - assembly_dirs = params.assembly_dirs - wait_to_upload_assemblies = { - assembly_name: assembly_dir - for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) + genome_dirs = params.genome_dirs + wait_to_upload_genomes = { + genome_name: genome_dir + for genome_name, genome_dir in zip(GEMOME_NAMES, genome_dirs) } conf = Mock() job_dir = loader_helper.make_job_dir(params.tmp_dir, "kbase") conf.job_data_dir = loader_helper.make_job_data_dir(job_dir) data_dir = workspace_uploader._prepare_skd_job_dir_to_upload( - conf, wait_to_upload_assemblies - ) - - assert sorted(os.listdir(data_dir)) == sorted(ASSEMBLY_NAMES) - for assembly_name, src_file in zip(ASSEMBLY_NAMES, params.target_files): - assert os.path.samefile(src_file, os.path.join(data_dir, assembly_name)) - - -def test_post_process_with_assembly_only(setup_and_teardown): - params = setup_and_teardown - - host_assembly_dir = params.assembly_dirs[0] - assembly_name = ASSEMBLY_NAMES[0] - - assembly_tuple = workspace_uploader.WSObjTuple( - assembly_name, host_assembly_dir, "/path/to/file/in/AssembilyUtil" - ) - - workspace_uploader._post_process( - "CI", - 88888, - "214", - params.collection_source_dir, - params.sourcedata_dir, - UploadResult(assembly_tuple=assembly_tuple, assembly_obj_info=ASSEMBLY_OBJ_INFOS[0]) + conf, wait_to_upload_genomes ) - data, uploaded = workspace_uploader._read_upload_status_yaml_file( - "CI", 88888, "214", host_assembly_dir - ) - expected_data = { - "CI": {88888: {"214": {"assembly_upa": "72231_60_1", - "assembly_filename": assembly_name, - "genome_upa": None, - "genome_filename": None}}}} - - assert uploaded - assert expected_data == data + assert sorted(os.listdir(data_dir)) == sorted(GEMOME_NAMES) + for genome_name, src_file in zip(GEMOME_NAMES, params.genbank_files): + assert os.path.samefile(src_file, os.path.join(data_dir, genome_name)) def 
test_post_process_with_genome(setup_and_teardown): @@ -348,7 +327,7 @@ def test_post_process_with_genome(setup_and_teardown): source_dir = params.sourcedata_dir container_dir = Path(params.tmp_dir) / "kb/module/work/tmp" container_dir.mkdir(parents=True) - assembly_coll_src_dir = params.assembly_dirs[1] + genome_coll_src_dir = params.genome_dirs[1] assembly_obj_info = ASSEMBLY_OBJ_INFOS[1] genome_obj_info = GENOME_OBJ_INFOS[1] @@ -356,8 +335,8 @@ def test_post_process_with_genome(setup_and_teardown): assembly_name = assembly_obj_info[1] genome_name = genome_obj_info[1] - assembly_tuple = workspace_uploader.WSObjTuple(assembly_name, assembly_coll_src_dir, container_dir / assembly_name) - genome_tuple = workspace_uploader.WSObjTuple(genome_name, assembly_coll_src_dir, container_dir / genome_name) + assembly_tuple = workspace_uploader.WSObjTuple(assembly_name, genome_coll_src_dir, container_dir / assembly_name) + genome_tuple = workspace_uploader.WSObjTuple(genome_name, genome_coll_src_dir, container_dir / genome_name) assembly_upa = obj_info_to_upa(assembly_obj_info, underscore_sep=True) genome_upa = obj_info_to_upa(genome_obj_info, underscore_sep=True) @@ -375,7 +354,7 @@ def test_post_process_with_genome(setup_and_teardown): ) data, uploaded = workspace_uploader._read_upload_status_yaml_file( - "CI", 88888, "214", assembly_coll_src_dir + "CI", 88888, "214", genome_coll_src_dir ) expected_data = {'CI': {88888: {'214': {'assembly_filename': assembly_name, @@ -389,7 +368,7 @@ def test_post_process_with_genome(setup_and_teardown): # check softlink assert os.readlink(os.path.join(collections_source_dir, assembly_upa)) == os.path.join(source_dir, assembly_upa) # check hardlink - assembly_src_file = params.target_files[1] + assembly_src_file = params.assembly_files[1] assembly_dest_file = source_dir / assembly_upa / f"{assembly_upa}.fna.gz" assert os.path.samefile(assembly_src_file, assembly_dest_file) # check metadata file @@ -409,37 +388,6 @@ def test_post_process_with_genome(setup_and_teardown): assert data == expected_metadata -def test_upload_assembly_to_workspace(setup_and_teardown): - params = setup_and_teardown - assembly_name = ASSEMBLY_NAMES[0] - host_assembly_dir = params.assembly_dirs[0] - - asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) - asu.save_assemblies_from_fastas.return_value = { - "results": [{"upa": "12345/1/58", - "object_info": [1, 'assembly_1', 'KBaseGenomeAnnotations.Assembly-6', 'time', 58, 'user', 12345, 'wsname', 'md5', 78, {'foo': 'bar'}]}] - } - assembly_tuple = workspace_uploader.WSObjTuple( - assembly_name, host_assembly_dir, "/path/to/file/in/AssembilyUtil" - ) - upload_results = workspace_uploader._upload_assemblies_to_workspace( - asu, 12345, "214", [assembly_tuple] - ) - assert [r.assembly_upa for r in upload_results] == ["12345_1_58"] - asu.save_assemblies_from_fastas.assert_called_once_with( - { - "workspace_id": 12345, - "inputs": [ - { - "file": assembly_tuple.container_internal_file, - "assembly_name": assembly_tuple.obj_name, - "object_metadata": {"load_id": "214"}, - } - ] - } - ) - - def test_upload_genome_to_workspace(setup_and_teardown): params = setup_and_teardown genome_name = GEMOME_NAMES[0] @@ -448,7 +396,7 @@ def test_upload_genome_to_workspace(setup_and_teardown): genome_coll_src_dir.mkdir(parents=True) container_dir = Path(params.tmp_dir) / "kb/module/work/tmp" container_dir.mkdir(parents=True) - shutil.copy(params.target_files[0], container_dir) + shutil.copy(params.assembly_files[0], container_dir) genome_container_file = 
container_dir / genome_name # the existence of this file is not checked in the test @@ -465,7 +413,7 @@ def test_upload_genome_to_workspace(setup_and_teardown): genome_name, genome_coll_src_dir, genome_container_file ) - with patch.object(workspace_uploader, '_JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER', new=container_dir): + with patch.object(workspace_uploader, '_JOB_DIR_IN_CONTAINER', new=container_dir): upload_results = workspace_uploader._upload_genomes_to_workspace( gfu, 12345, "214", [genome_tuple], container_dir ) @@ -505,29 +453,29 @@ def test_upload_genome_to_workspace(setup_and_teardown): def test_generator(setup_and_teardown): params = setup_and_teardown - assembly_dirs = params.assembly_dirs - wait_to_upload_assemblies = { - assembly_name: assembly_dir - for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) + genome_dirs = params.genome_dirs + wait_to_upload_genomes = { + genome_name: genome_dir + for genome_name, genome_dir in zip(GEMOME_NAMES, genome_dirs) } - assemblyTuple_list = list(workspace_uploader._gen(wait_to_upload_assemblies, 1)) - expected_assemblyTuple_list = [ + genomeTuple_list = list(workspace_uploader._gen(wait_to_upload_genomes, 1)) + expected_genomeTuple_list = [ [ workspace_uploader.WSObjTuple( - "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", - assembly_dirs[0], - "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", + "GCF_000979855.1_gtlEnvA5udCFS_genomic.gbff.gz", + genome_dirs[0], + "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.gbff.gz", ) ], [ workspace_uploader.WSObjTuple( - "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", - assembly_dirs[1], - "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", + "GCF_000979175.1_gtlEnvA5udCFS_genomic.gbff.gz", + genome_dirs[1], + "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.gbff.gz", ) ], ] - assert assemblyTuple_list == expected_assemblyTuple_list + assert genomeTuple_list == expected_genomeTuple_list def test_upload_genome_files_in_parallel(setup_and_teardown): @@ -536,7 +484,7 @@ def test_upload_genome_files_in_parallel(setup_and_teardown): sourcedata_dir = params.sourcedata_dir # Assembly files are normally produced by the GFU. 
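    # (GFU = GenomeFileUtil; its genbanks_to_genomes call writes the generated assembly FASTA into the shared job data directory.)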
# This test fakes that process by placing them in the container job directory in the setup below - assembly_files = params.target_files + assembly_files = params.assembly_files genbank_files = params.genbank_files genome_ids = ["GCF_000979115.1", "GCF_000979555.1"] genome_collection_source_dirs = list() @@ -580,7 +528,7 @@ def test_upload_genome_files_in_parallel(setup_and_teardown): } gfu.genbanks_to_genomes.return_value = genbanks_to_genomes_results - with patch.object(workspace_uploader, '_JOB_DIR_IN_ASSEMBLYUTIL_CONTAINER', new=container_dir): + with patch.object(workspace_uploader, '_JOB_DIR_IN_CONTAINER', new=container_dir): uploaded_count = workspace_uploader._upload_objects_in_parallel( ws=ws, upload_env_key="CI", @@ -593,7 +541,6 @@ def test_upload_genome_files_in_parallel(setup_and_teardown): asu_client=Mock(), gfu_client=gfu, job_data_dir=container_dir, - upload_assembly_only=False, ) assert uploaded_count == 2 @@ -672,130 +619,6 @@ def test_upload_genome_files_in_parallel(setup_and_teardown): assert data == expected_metadata -def test_upload_assembly_files_in_parallel(setup_and_teardown): - params = setup_and_teardown - assembly_dirs = params.assembly_dirs - - wait_to_upload_assemblies = { - assembly_name: assembly_dir - for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) - } - - # ws.get_object_info3() is unused in this test case - ws = create_autospec(Workspace, spec_set=True, instance=True) - asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) - asu.save_assemblies_from_fastas.return_value = { - "results": [ - {"upa": "12345/58/1", - "object_info": [58, 'assembly_1', 'KBaseGenomeAnnotations.Assembly-6', 'time', 1, 'user', 12345, 'wsname', 'md5', 78, {'foo': 'bar'}]}, - {"upa": "12345/60/1", - "object_info": [60, 'assembly_2', 'KBaseGenomeAnnotations.Assembly-10', 'time', 1, 'user', 12345, 'wsname', 'md5', 78, {}]} - ] - } - - uploaded_count = workspace_uploader._upload_objects_in_parallel( - ws, - "CI", - 12345, - "214", - params.collection_source_dir, - wait_to_upload_assemblies, - 2, - params.sourcedata_dir, - asu_client=asu, - gfu_client=Mock(), - job_data_dir='path/to/job_data_dir' - ) - - assert uploaded_count == 2 - - # assert that no interactions occurred with ws - ws.get_object_info3.assert_not_called() - - # assert that asu was called correctly - asu.save_assemblies_from_fastas.assert_called_once_with( - { - "workspace_id": 12345, - "inputs": [ - { - "file": "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", - "assembly_name": "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", - "object_metadata": {"load_id": "214"}, - }, - { - "file": "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", - "assembly_name": "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", - "object_metadata": {"load_id": "214"}, - } - ] - } - ) - - -def test_fail_upload_assembly_files_in_parallel(setup_and_teardown): - params = setup_and_teardown - assembly_dirs = params.assembly_dirs - - wait_to_upload_assemblies = { - assembly_name: assembly_dir - for assembly_name, assembly_dir in zip(ASSEMBLY_NAMES, assembly_dirs) - } - - ws = create_autospec(Workspace, spec_set=True, instance=True) - asu = create_autospec(AssemblyUtil, spec_set=True, instance=True) - asu.save_assemblies_from_fastas.side_effect = Exception("Illegal character in object name") - ws.get_object_info3.return_value = { - 'infos': [None, None], 'paths': [None, None] - } - - uploaded_count = workspace_uploader._upload_objects_in_parallel( - ws=ws, - upload_env_key="CI", - 
workspace_id=12345, - load_id="214", - collections_source_dir=params.sourcedata_dir, - wait_to_upload_objs=wait_to_upload_assemblies, - batch_size=2, - source_data_dir=params.sourcedata_dir, - asu_client=asu, - gfu_client=Mock(), - job_data_dir='path/to/job_data_dir' - ) - - assert uploaded_count == 0 - - # assert that asu was called correctly - asu.save_assemblies_from_fastas.assert_called_once_with( - { - "workspace_id": 12345, - "inputs": [ - { - "file": "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", - "assembly_name": "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", - "object_metadata": {"load_id": "214"}, - }, - { - "file": "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", - "assembly_name": "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", - "object_metadata": {"load_id": "214"}, - } - ] - } - ) - - # assert that ws was called correctly - ws.get_object_info3.assert_called_once_with( - { - "objects": [ - {"wsid": 12345, "name": "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz"}, - {"wsid": 12345, "name": "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz"} - ], - "ignoreErrors": 1, - "includeMetadata": 1 - } - ) - - def test_fail_query_workspace_with_load_id_mass(setup_and_teardown): ws = create_autospec(Workspace, spec_set=True, instance=True) with pytest.raises( @@ -813,84 +636,8 @@ def test_fail_query_workspace_with_load_id_mass(setup_and_teardown): ws.get_object_info3.assert_not_called() -def test_query_workspace_with_load_id_mass_assembly(setup_and_teardown): - # test with assembly objects only - ws = create_autospec(Workspace, spec_set=True, instance=True) - mock_object_info = { - 'infos': [ - [ - 1086, - 'GCF_000980105.1_gtlEnvA5udCFS_genomic.fna.gz', - 'KBaseGenomeAnnotations.Assembly-6.3', - '2024-01-18T23:12:44+0000', - 18, - 'sijiex', - 69046, - 'sijiex:narrative_1688077625427', - 'aaa726d2b976e27e729ac288812e81f6', - 71823, - { - 'GC content': '0.41571', - 'Size': '4079204', - 'N Contigs': '260', - 'MD5': '8aa6b1244e18c4f93bb3307902bd3a4d', - "load_id": "998" - } - ], - [ - 1068, - 'GCF_000979375.1_gtlEnvA5udCFS_genomic.fna.gz', - 'KBaseGenomeAnnotations.Assembly-6.3', - '2024-01-18T23:12:35+0000', - 18, - 'sijiex', - 69046, - 'sijiex:narrative_1688077625427', - '866033827fd54569c953e8b3dd58d0aa', - 38242, - { - 'GC content': '0.41526', - 'Size': '4092300', - 'N Contigs': '136', - 'MD5': '1e007bad0811a6d6e09a882d3bf802ab', - "load_id": "998" - } - ], - None], - 'paths': [['69046/1086/18'], ['69046/1068/18'], None] - } - ws.get_object_info3.return_value = mock_object_info - - assembly_objs_info, genome_objs_info = workspace_uploader._query_workspace_with_load_id_mass( - ws, - 69046, - "998", - [ - "GCF_000980105.1_gtlEnvA5udCFS_genomic.fna.gz", - "GCF_000979375.1_gtlEnvA5udCFS_genomic.fna.gz", - "aloha", - ] - ) - - assert assembly_objs_info == mock_object_info['infos'][:2] - assert genome_objs_info == [] - - # assert that ws was called correctly - ws.get_object_info3.assert_called_once_with( - { - "objects": [ - {"wsid": 69046, "name": "GCF_000980105.1_gtlEnvA5udCFS_genomic.fna.gz"}, - {"wsid": 69046, "name": "GCF_000979375.1_gtlEnvA5udCFS_genomic.fna.gz"}, - {"wsid": 69046, "name": "aloha"} - ], - "ignoreErrors": 1, - "includeMetadata": 1 - } - ) - - def test_query_workspace_with_load_id_mass_genome(setup_and_teardown): - # Test case 1: Valid scenario - test with genome objects and assembly_objs_only is set to False + # Test case 1: Valid scenario - test with genome objects ws = create_autospec(Workspace, spec_set=True, instance=True) load_id = 
"998" assembly_objs_response = { @@ -916,7 +663,6 @@ def test_query_workspace_with_load_id_mass_genome(setup_and_teardown): 69046, load_id, ["genome_1", "genome_2",], - assembly_objs_only=False ) assert assembly_objs_info == assembly_objs_response['infos'] assert genome_objs_info == genome_objs_response['infos'] @@ -951,7 +697,6 @@ def test_query_workspace_with_load_id_mass_genome_fail(setup_and_teardown): 69046, load_id, ["genome_1"], - assembly_objs_only=False ) assert "Genome object 42/1/75 does not have an assembly object linked to it" == str(excinfo.value) From 639d36f9ce163579801f3b3ba20f415372ea56ea Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 09:22:21 -0500 Subject: [PATCH 10/50] increase eggnog chunk size --- src/loaders/jobs/taskfarmer/task_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 01ae1af4f..2fbed7f21 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -53,7 +53,7 @@ # estimated execution time (in minutes) for each tool to process a chunk of data TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65}, - 'eggnog': {'chunk_size': 500, 'exe_time': 65}, # TODO: update this value after performance testing + 'eggnog': {'chunk_size': 1000, 'exe_time': 65}, # TODO: update this value after performance testing 'default': {'chunk_size': 5000, 'exe_time': 60}} NODE_TIME_LIMIT = 5 # hours # TODO: automatically calculate this based on tool execution time and NODE_THREADS MAX_NODE_NUM = 100 # maximum number of nodes to use From f440b434eb8fb8e4883701fdd2a70f1e1f772a0c Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 09:26:08 -0500 Subject: [PATCH 11/50] add more notes --- src/loaders/compute_tools/eggnog/eggnog.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index a335f04e2..14942677a 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -1,5 +1,9 @@ """ Run eggNOG tool on a set of faa files. + +This tool serves a distinct purpose separate from collection tools; instead, it is suited for CDM work. +Therefore, the parser program is not compatible with data generated by this tool. 
+ """ import json from pathlib import Path From 1a121598d8b806dce8d38c650e7d171fbfe84e7d Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 09:35:26 -0500 Subject: [PATCH 12/50] update genome_dirs in test --- .../loaders/workspace_uploader/workspace_uploader_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index dcd6f9e3e..9b4cb2a15 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -15,7 +15,7 @@ from src.loaders.workspace_uploader import workspace_uploader from src.loaders.workspace_uploader.upload_result import UploadResult -ASSEMBLY_DIR_NAMES = ["GCF_000979855.1", "GCF_000979175.1"] +GENOME_DIR_NAMES = ["GCF_000979855.1", "GCF_000979175.1"] ASSEMBLY_NAMES = [ "GCF_000979855.1_gtlEnvA5udCFS_genomic.fna.gz", "GCF_000979175.1_gtlEnvA5udCFS_genomic.fna.gz", @@ -148,14 +148,14 @@ def setup_and_teardown(): collection_source_dir.mkdir() genome_dirs, assembly_files, genbank_files = list(), list(), list() - for assembly_dir_name, assembly_name, genome_name in zip(ASSEMBLY_DIR_NAMES, ASSEMBLY_NAMES, GEMOME_NAMES): - target_dir_path = sourcedata_dir.joinpath(assembly_dir_name) + for genome_dir_name, assembly_name, genome_name in zip(GENOME_DIR_NAMES, ASSEMBLY_NAMES, GEMOME_NAMES): + target_dir_path = sourcedata_dir.joinpath(genome_dir_name) target_dir_path.mkdir() assembly_file_path = target_dir_path.joinpath(assembly_name) assembly_file_path.touch() genbank_file_path = target_dir_path.joinpath(genome_name) genbank_file_path.touch() - new_dir_path = collection_source_dir.joinpath(assembly_dir_name) + new_dir_path = collection_source_dir.joinpath(genome_dir_name) os.symlink( target_dir_path.resolve(), new_dir_path.resolve(), target_is_directory=True ) From 5999ba1fe4690493ebb833e8e3875fc3542f6dbb Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 14:39:28 -0500 Subject: [PATCH 13/50] save output files to collectiondata dir --- src/loaders/compute_tools/eggnog/eggnog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 14942677a..2895d36bf 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -30,8 +30,8 @@ def _run_eggnog_single( # RUN eggNOG for a single genome command = ['emapper.py', '-i', source_file, # Input file. - '-o', source_file, # Output prefix. - # Save result file to source file directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files. + '-o', output_dir / source_file.name, # Output prefix. + # Save result file to collectiondata directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files. 
'--itype', f'{INPUT_TYPE}', '--cpu', f'{THREADS}', '--excel', From 6bc10db0e3b514f7cd07702a125d155c25fd3402 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 14:43:46 -0500 Subject: [PATCH 14/50] update tool name --- src/loaders/compute_tools/eggnog/eggnog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 2895d36bf..639cc5d7b 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -50,7 +50,7 @@ def _run_eggnog_single( def main(): - runner = ToolRunner("eggNOG") + runner = ToolRunner("eggnog") runner.parallel_single_execution(_run_eggnog_single, unzip=True) From a8330bf69d65a13adcfe05ecc8669af91dc72278 Mon Sep 17 00:00:00 2001 From: Tianhao Gu Date: Thu, 14 Mar 2024 16:28:26 -0500 Subject: [PATCH 15/50] Update src/loaders/common/loader_common_names.py Co-authored-by: MrCreosote --- src/loaders/common/loader_common_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index 1255045fc..4c80b5d7b 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -94,7 +94,7 @@ # The metadata file name created during the Mash run MASH_METADATA = 'mash_run_metadata.json' -# The metadata file name created during the Mash run +# The metadata file name created during the Eggnog run EGGNOG_METADATA = 'eggnog_run_metadata.json' # The fatal error file created if a data file cannot be successfully processed From 17438b56de8e22162779bf4d63e92f9955ed6ae0 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 18:15:27 -0500 Subject: [PATCH 16/50] add readme file --- src/loaders/compute_tools/eggnog/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 src/loaders/compute_tools/eggnog/README.md diff --git a/src/loaders/compute_tools/eggnog/README.md b/src/loaders/compute_tools/eggnog/README.md new file mode 100644 index 000000000..7f0464ef7 --- /dev/null +++ b/src/loaders/compute_tools/eggnog/README.md @@ -0,0 +1,10 @@ + +# eggNOG tool + +## Overview +The eggNOG tool is designed to utilize the collections infrastructure for execution and storage of result data. + +This tool is exclusively intended for use with the CDM project. + +The Collections parser program ([parse_tool_results.py](../../genome_collection/parse_tool_results.py)) will skip parsing the result files generated by this tool, as the result data is +specifically tailored for the CDM project. \ No newline at end of file From e03bfd9a9eee5a6130d3e09e9dfeb9ca59e29710 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 18:38:18 -0500 Subject: [PATCH 17/50] remove _upload_assemblies_to_workspace --- .../workspace_uploader/workspace_uploader.py | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 4f5c6d174..d95a01baa 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -292,43 +292,6 @@ def _upload_genomes_to_workspace( return upload_results -def _upload_assemblies_to_workspace( - asu: AssemblyUtil, - workspace_id: int, - load_id: str, - ws_obj_tuples: list[WSObjTuple], -) -> list[UploadResult]: - """ - Upload assembly files to the target workspace in batch. 
The bulk method fails - and an error will be thrown if any of the assembly files in batch fails to upload. - The order of elements in the returned list corresponds to the order of `ws_obj_tuples`. - """ - inputs = [ - { - "file": obj_tuple.container_internal_file, - "assembly_name": obj_tuple.obj_name, - "object_metadata": {"load_id": load_id}, - } - for obj_tuple in ws_obj_tuples - ] - - assembly_ref = asu.save_assemblies_from_fastas( - {"workspace_id": workspace_id, "inputs": inputs} - ) - - upload_results = [] - for result_dict, obj_tuple in zip(assembly_ref["results"], ws_obj_tuples): - assembly_obj_info = result_dict["object_info"] - upload_result = UploadResult( - assembly_obj_info=assembly_obj_info, - assembly_tuple=obj_tuple, - ) - - upload_results.append(upload_result) - - return upload_results - - def _read_upload_status_yaml_file( upload_env_key: str, workspace_id: int, From b203fd20024e992d640910ad382292c52515bc46 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 14 Mar 2024 22:13:58 -0500 Subject: [PATCH 18/50] lower CPU to 4 --- src/loaders/compute_tools/eggnog/eggnog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 639cc5d7b..8ff5fcac2 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -12,7 +12,7 @@ from src.loaders.compute_tools.tool_common import ToolRunner, run_command INPUT_TYPE = 'proteins' -THREADS = 16 +THREADS = 4 def _run_eggnog_single( From 1337c37b1b4e2ef3ce716089e4b67e142c746d7b Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 12:42:05 -0500 Subject: [PATCH 19/50] increase threads to 8 --- src/loaders/compute_tools/eggnog/eggnog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 8ff5fcac2..c278a2809 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -12,7 +12,7 @@ from src.loaders.compute_tools.tool_common import ToolRunner, run_command INPUT_TYPE = 'proteins' -THREADS = 4 +THREADS = 8 def _run_eggnog_single( From 7b8a4917e05b5c465d33bd9839766c4c470cce3d Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 13:38:15 -0500 Subject: [PATCH 20/50] pass program threads to tools --- src/loaders/compute_tools/eggnog/eggnog.py | 4 ++-- src/loaders/compute_tools/eggnog/versions.yaml | 6 ++++++ src/loaders/compute_tools/mash/mash.py | 2 ++ src/loaders/compute_tools/mash/versions.yaml | 6 +++++- src/loaders/compute_tools/microtrait/microtrait.py | 8 +++++++- src/loaders/compute_tools/microtrait/versions.yaml | 6 +++++- src/loaders/compute_tools/tool_common.py | 1 + 7 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index c278a2809..9ae48ae4b 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -12,7 +12,6 @@ from src.loaders.compute_tools.tool_common import ToolRunner, run_command INPUT_TYPE = 'proteins' -THREADS = 8 def _run_eggnog_single( @@ -20,6 +19,7 @@ def _run_eggnog_single( data_id: str, source_file: Path, output_dir: Path, + program_threads: int, debug: bool) -> None: metadata_file = output_dir / EGGNOG_METADATA @@ -33,7 +33,7 @@ def _run_eggnog_single( '-o', output_dir / source_file.name, # Output prefix. 
# Save result file to collectiondata directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files. '--itype', f'{INPUT_TYPE}', - '--cpu', f'{THREADS}', + '--cpu', program_threads, '--excel', '--sensmode', 'fast', '--dmnd_iterate', 'no', diff --git a/src/loaders/compute_tools/eggnog/versions.yaml b/src/loaders/compute_tools/eggnog/versions.yaml index ece3f5da3..782f09efc 100644 --- a/src/loaders/compute_tools/eggnog/versions.yaml +++ b/src/loaders/compute_tools/eggnog/versions.yaml @@ -1,4 +1,10 @@ versions: - version: 0.1.0 date: 2024-03-13 + reference_db_version: 5.0.2 + + - version: 0.1.1 + date: 2024-03-15 + notes: | + - add ability to specify thread number for execution reference_db_version: 5.0.2 \ No newline at end of file diff --git a/src/loaders/compute_tools/mash/mash.py b/src/loaders/compute_tools/mash/mash.py index 32c45c2ac..a21ee12d2 100644 --- a/src/loaders/compute_tools/mash/mash.py +++ b/src/loaders/compute_tools/mash/mash.py @@ -16,6 +16,7 @@ def _run_mash_single( data_id: str, source_file: Path, output_dir: Path, + program_threads: int, debug: bool, kmer_size: int = KMER_SIZE, sketch_size: int = SKETCH_SIZE) -> None: @@ -25,6 +26,7 @@ def _run_mash_single( # Save result file to source file directory. The suffix '.msh' will be appended. '-k', f'{kmer_size}', '-s', f'{sketch_size}', + '-p', program_threads, source_file] run_command(command, output_dir if debug else None) diff --git a/src/loaders/compute_tools/mash/versions.yaml b/src/loaders/compute_tools/mash/versions.yaml index 2818a86fe..ad8d3ef57 100644 --- a/src/loaders/compute_tools/mash/versions.yaml +++ b/src/loaders/compute_tools/mash/versions.yaml @@ -2,4 +2,8 @@ versions: - version: 0.1.0 date: 2023-07-18 - version: 0.1.1 - date: 2023-07-19 \ No newline at end of file + date: 2023-07-19 + - version: 0.1.2 + date: 2024-03-15 + notes: | + - add ability to specify thread number for execution \ No newline at end of file diff --git a/src/loaders/compute_tools/microtrait/microtrait.py b/src/loaders/compute_tools/microtrait/microtrait.py index 57f81c021..6877bc2d7 100644 --- a/src/loaders/compute_tools/microtrait/microtrait.py +++ b/src/loaders/compute_tools/microtrait/microtrait.py @@ -186,7 +186,13 @@ def _process_trait_counts( return heatmap_row, cells_meta, traits_meta -def _run_microtrait(tool_safe_data_id: str, data_id: str, fna_file: Path, genome_dir: Path, debug: bool): +def _run_microtrait( + tool_safe_data_id: str, + data_id: str, + fna_file: Path, + genome_dir: Path, + program_threads: int, + debug: bool): # run microtrait.extract_traits on the genome file # https://github.com/ukaraoz/microtrait diff --git a/src/loaders/compute_tools/microtrait/versions.yaml b/src/loaders/compute_tools/microtrait/versions.yaml index f2180a8ad..083f830b1 100644 --- a/src/loaders/compute_tools/microtrait/versions.yaml +++ b/src/loaders/compute_tools/microtrait/versions.yaml @@ -12,4 +12,8 @@ versions: - version: 0.1.3 date: 2023-10-16 notes: | - - fix cells value data type \ No newline at end of file + - fix cells value data type + - version: 0.1.4 + date: 2024-03-15 + notes: | + - add ability to specify thread number for execution \ No newline at end of file diff --git a/src/loaders/compute_tools/tool_common.py b/src/loaders/compute_tools/tool_common.py index 566f2df3d..b98ee54d3 100644 --- a/src/loaders/compute_tools/tool_common.py +++ b/src/loaders/compute_tools/tool_common.py @@ -298,6 +298,7 @@ def parallel_single_execution(self, tool_callable: Callable[[str, str, Path, Pat 
meta.get(loader_common_names.META_UNCOMPRESSED_FILE, meta[loader_common_names.META_SOURCE_FILE]), output_dir, + self._program_threads, self._debug)) try: From ffcc686461abb1948c0cd8c3a8a4d3f9b86e1e3a Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 15:48:46 -0500 Subject: [PATCH 21/50] fix command line --- src/loaders/compute_tools/eggnog/eggnog.py | 2 +- src/loaders/compute_tools/mash/mash.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 9ae48ae4b..f61b8a944 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -33,7 +33,7 @@ def _run_eggnog_single( '-o', output_dir / source_file.name, # Output prefix. # Save result file to collectiondata directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files. '--itype', f'{INPUT_TYPE}', - '--cpu', program_threads, + '--cpu', f'{program_threads}', '--excel', '--sensmode', 'fast', '--dmnd_iterate', 'no', diff --git a/src/loaders/compute_tools/mash/mash.py b/src/loaders/compute_tools/mash/mash.py index a21ee12d2..82d72ddd4 100644 --- a/src/loaders/compute_tools/mash/mash.py +++ b/src/loaders/compute_tools/mash/mash.py @@ -26,7 +26,7 @@ def _run_mash_single( # Save result file to source file directory. The suffix '.msh' will be appended. '-k', f'{kmer_size}', '-s', f'{sketch_size}', - '-p', program_threads, + '-p', f'{program_threads}', source_file] run_command(command, output_dir if debug else None) From bc63b8e0fff07d3a4690509724ae355fffd4614a Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 15:55:44 -0500 Subject: [PATCH 22/50] update arg type --- src/loaders/compute_tools/tool_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/loaders/compute_tools/tool_common.py b/src/loaders/compute_tools/tool_common.py index b98ee54d3..97a905691 100644 --- a/src/loaders/compute_tools/tool_common.py +++ b/src/loaders/compute_tools/tool_common.py @@ -246,7 +246,7 @@ def _get_data_ids(self): data_ids = all_data_ids return list(set(data_ids)) - def parallel_single_execution(self, tool_callable: Callable[[str, str, Path, Path, bool], None], unzip=False): + def parallel_single_execution(self, tool_callable: Callable[[str, str, Path, Path, int, bool], None], unzip=False): """ Run a tool by a single data file, storing the results in a single batch directory with the individual runs stored in directories by the data ID. @@ -379,7 +379,7 @@ def _execute( self, threads: int, tool_callable: Callable[..., None], - args: List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], + args: List[Tuple[Dict[str, GenomeTuple], Path, int, bool]] | List[Tuple[str, str, Path, Path, int, bool]], start: datetime.datetime, total: bool, ): From 7ec8b0756d60af72998d736f91fc7d81af6f6ec3 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 18:37:57 -0500 Subject: [PATCH 23/50] add comments to microtrait script --- src/loaders/compute_tools/microtrait/microtrait.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/loaders/compute_tools/microtrait/microtrait.py b/src/loaders/compute_tools/microtrait/microtrait.py index 6877bc2d7..77aba3262 100644 --- a/src/loaders/compute_tools/microtrait/microtrait.py +++ b/src/loaders/compute_tools/microtrait/microtrait.py @@ -204,6 +204,10 @@ def _run_microtrait( # object returned by the # extract_traits function. 
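# For reference, a minimal sketch of the per-data-file callable shape that
# ToolRunner.parallel_single_execution() dispatches to after this change, matching the
# (str, str, Path, Path, int, bool) parameters used by the eggNOG and Mash scripts; the
# tool name and its '--threads' flag below are hypothetical:
#
#     def _run_mytool_single(
#             tool_safe_data_id: str,
#             data_id: str,
#             source_file: Path,
#             output_dir: Path,
#             program_threads: int,
#             debug: bool,
#     ) -> None:
#         run_command(['mytool', '--threads', f'{program_threads}', str(source_file)],
#                     output_dir if debug else None)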
+ # programe_threads is not used in this function, but it is kept for consistency with another tools (e.g., eggnog, mash) + # since extract_traits function doesn't take the number of threads as an argument + # https://github.com/ukaraoz/microtrait/blob/master/R/extract_traits.R#L22-L26 + # Load the R script as an R function r_script = """ library(microtrait) From 8e4a516de880b626842541ff7ff30a645fdd6e07 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 18:43:11 -0500 Subject: [PATCH 24/50] remove microtrait release --- src/loaders/compute_tools/microtrait/versions.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/loaders/compute_tools/microtrait/versions.yaml b/src/loaders/compute_tools/microtrait/versions.yaml index 083f830b1..f2180a8ad 100644 --- a/src/loaders/compute_tools/microtrait/versions.yaml +++ b/src/loaders/compute_tools/microtrait/versions.yaml @@ -12,8 +12,4 @@ versions: - version: 0.1.3 date: 2023-10-16 notes: | - - fix cells value data type - - version: 0.1.4 - date: 2024-03-15 - notes: | - - add ability to specify thread number for execution \ No newline at end of file + - fix cells value data type \ No newline at end of file From 385326e6d557bf29785860773b2d07b134c10003 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 19:59:42 -0500 Subject: [PATCH 25/50] refactor task_generator --- src/loaders/jobs/taskfarmer/task_generator.py | 59 +++++++++++++++---- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 2fbed7f21..51cbd3ec9 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -52,14 +52,13 @@ TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog'] # estimated execution time (in minutes) for each tool to process a chunk of data -TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65}, - 'eggnog': {'chunk_size': 1000, 'exe_time': 65}, # TODO: update this value after performance testing +TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65, 'tasks_per_node': 4}, + 'eggnog': {'chunk_size': 100, 'exe_time': 15, 'node_time_limit': 0.5}, # Memory intensive tool - reserve more nodes with less node reservation time 'default': {'chunk_size': 5000, 'exe_time': 60}} NODE_TIME_LIMIT = 5 # hours # TODO: automatically calculate this based on tool execution time and NODE_THREADS MAX_NODE_NUM = 100 # maximum number of nodes to use -# The THREADS variable controls the number of parallel tasks per node -# TODO: make this configurable based on tool used. At present, we have set the value to 4 for optimal performance with GTDB-Tk. -NODE_THREADS = 4 +# Used as THREADS variable in the batch script which controls the number of parallel tasks per node +TASKS_PER_NODE = 1 REGISTRY = 'ghcr.io/kbase/collections' VERSION_FILE = 'versions.yaml' @@ -205,8 +204,16 @@ def _create_task_list( program_threads: number of threads to use per task For instance, if "threads" is set to 128 and "program_threads" to 32, then each task will run 4 batches in parallel. - We have chosen to use "taskfarmer" for parallelization, which means that "threads" and "program_threads" should - have the same value. This ensures that parallelization only happens between tasks, and not within them. 
+ For tools capable of processing a batch of genomes, such as GTDB-TK and checkm2, + we have chosen to utilize the THREADS variable within Taskfarmer batch script (submit_taskfarmer.sl) for parallelization, + which means that "threads" and "program_threads" should have the same value. + This ensures that parallelization only happens between tasks, and not within them. + + For tools that can only handle genomes individually, such as microtrait, mash and eggNOG, + we still use the THREADS variable for task parallelization. However, within each task, we further parallelize tool + execution using the 'threads' variable. For instance, with 'threads' set to 16 and 'tasks_per_node' set to 4, + each node will concurrently execute 4 tasks, with each task executing 16 parallel tool operations. + TODO: make threads/program_threads configurable based on tool used. However, for the time being, we have set these parameters to 32 and , since this value has produced the highest throughput in our experiments. @@ -259,9 +266,9 @@ def _cal_node_num(tool, n_jobs): """ tool_exe_time = TASK_META.get(tool, TASK_META['default'])['exe_time'] - jobs_per_node = NODE_TIME_LIMIT * 60 // tool_exe_time + max_jobs_per_node = max(_get_node_time_limit(tool) * 60 // tool_exe_time, 1) * _get_note_task_count(tool) - num_nodes = math.ceil(n_jobs / jobs_per_node) + num_nodes = math.ceil(n_jobs / max_jobs_per_node) if num_nodes > MAX_NODE_NUM: raise ValueError(f"The number of nodes required ({num_nodes}) is greater than the maximum " @@ -270,6 +277,36 @@ def _cal_node_num(tool, n_jobs): return num_nodes +def _get_note_task_count(tool: str) -> int: + """ + Get the number of parallel tasks to run on a node + """ + + return TASK_META.get(tool, TASK_META['default']).get('tasks_per_node', TASKS_PER_NODE) + + +def _get_node_time_limit(tool: str) -> float: + """ + Get the time limit for the node we reserved for the task + + By default, we set the time limit to NODE_TIME_LIMIT (5 hours). + If in TASK_META, the tool has a different time limit (node_time_limit), we will use that. 
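# Worked example of the node sizing above, using the TASK_META values in this file (the
# chunk counts are illustrative):
#   gtdb_tk: max(5 * 60 // 65, 1) * 4 tasks per node = 4 * 4 = 16 chunks per node,
#            so 100 chunks -> math.ceil(100 / 16) = 7 nodes
#   eggnog:  max(0.5 * 60 // 15, 1) * 1 task per node = 2 * 1 = 2 chunks per node,
#            so 100 chunks -> math.ceil(100 / 2) = 50 nodes
# The 0.5 hour eggnog node limit is rendered as a '00:30:00' Slurm time string when the
# batch script is generated below.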
+ """ + + return TASK_META.get(tool, TASK_META['default']).get('node_time_limit', NODE_TIME_LIMIT) + + +def _float_to_time(float_hours: float) -> str: + """ + Convert a floating point number of hours to a time string in the format HH:MM:SS + """ + hours = int(float_hours) + decimal_part = float_hours - hours + minutes = int(decimal_part * 60) + seconds = int(decimal_part * 3600) % 60 + return f"{hours:02d}:{minutes:02d}:{seconds:02d}" + + def _create_batch_script(job_dir, task_list_file, n_jobs, tool): """ Create the batch script (submit_taskfarmer.sl) for job submission @@ -281,13 +318,13 @@ def _create_batch_script(job_dir, task_list_file, n_jobs, tool): #SBATCH -N {node_num + 1} -c 256 #SBATCH -q regular -#SBATCH --time={NODE_TIME_LIMIT}:00:00 +#SBATCH --time={_float_to_time(_get_node_time_limit(tool))} #SBATCH -C cpu module load taskfarmer cd {job_dir} -export THREADS={NODE_THREADS} +export THREADS={_get_note_task_count(tool)} runcommands.sh {task_list_file}''' From 1a3954780e8dd95f72f16d7783ce38a84038baea Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 20:08:38 -0500 Subject: [PATCH 26/50] fix a comment --- src/loaders/workspace_uploader/upload_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/workspace_uploader/upload_result.py b/src/loaders/workspace_uploader/upload_result.py index 566091b59..dee2f8044 100644 --- a/src/loaders/workspace_uploader/upload_result.py +++ b/src/loaders/workspace_uploader/upload_result.py @@ -18,7 +18,7 @@ WSObjTuple is a named tuple that contains the following fields: - obj_name: the name of the object (in many cases, also serves as the file name) - obj_coll_src_dir: the directory of the associated file in the collection source directory -- container_internal_file: the associated file in the docker container +- container_internal_file: the associated file path in the docker container """ From 11f0327cadae1a648d6c1451ebb63d72b363e728 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 15 Mar 2024 21:41:55 -0500 Subject: [PATCH 27/50] fix a typo --- src/loaders/jobs/taskfarmer/task_generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 51cbd3ec9..f35c457ec 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -266,7 +266,7 @@ def _cal_node_num(tool, n_jobs): """ tool_exe_time = TASK_META.get(tool, TASK_META['default'])['exe_time'] - max_jobs_per_node = max(_get_node_time_limit(tool) * 60 // tool_exe_time, 1) * _get_note_task_count(tool) + max_jobs_per_node = max(_get_node_time_limit(tool) * 60 // tool_exe_time, 1) * _get_node_task_count(tool) num_nodes = math.ceil(n_jobs / max_jobs_per_node) @@ -277,7 +277,7 @@ def _cal_node_num(tool, n_jobs): return num_nodes -def _get_note_task_count(tool: str) -> int: +def _get_node_task_count(tool: str) -> int: """ Get the number of parallel tasks to run on a node """ @@ -324,7 +324,7 @@ def _create_batch_script(job_dir, task_list_file, n_jobs, tool): module load taskfarmer cd {job_dir} -export THREADS={_get_note_task_count(tool)} +export THREADS={_get_node_task_count(tool)} runcommands.sh {task_list_file}''' From 811c23f609efe1c7d71bff0929f9d37973e48023 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 11:21:07 -0500 Subject: [PATCH 28/50] reduce main() length --- .../workspace_uploader/workspace_uploader.py | 249 ++++++++++++------ 1 file changed, 167 
insertions(+), 82 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index d95a01baa..18a62b968 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -61,6 +61,7 @@ import argparse import os import shutil +import subprocess import time import traceback import uuid @@ -772,6 +773,148 @@ def _gen( yield obj_tuple_list[idx: idx + batch_size] +def _check_existing_uploads_and_recovery( + ws: Workspace, + env: str, + workspace_id: int, + load_id: str, + collections_source_dir: str, + source_dir: str, + wait_to_upload_objs: dict[str, str], + asu_client: AssemblyUtil, + job_data_dir: str, +) -> list[str]: + """ + Process existing uploads, perform post-processing, and handle recovery. + """ + obj_names_processed = [] + + wait_to_upload_tuples = _dict2tuple_list(wait_to_upload_objs) + upload_results = _process_failed_uploads( + ws, + workspace_id, + load_id, + wait_to_upload_tuples, + asu_client, + job_data_dir, + ) + + # Fix inconsistencies between the workspace and the local yaml files + if upload_results: + print("Start failure recovery process ...") + for upload_result in upload_results: + _post_process( + env, + workspace_id, + load_id, + collections_source_dir, + source_dir, + upload_result + ) + obj_names_processed.append(upload_result.genome_tuple.obj_name) + + print("Recovery process completed ...") + + return obj_names_processed + + +def _cleanup_resources( + conf: Conf, + proc: subprocess.Popen, + job_dir: str, + keep_job_dir: bool): + """ + Cleanup resources including stopping callback server, terminating Podman service, and removing job directory. + """ + + # Stop callback server if it is on + if conf: + conf.stop_callback_server() + + # Stop Podman service if it is on + if proc: + proc.terminate() + + # Remove job directory if needed + if not keep_job_dir: + shutil.rmtree(job_dir) + + +def _setup_and_start_services( + job_dir, + kb_base_url, + token_filepath, + cbs_max_tasks, + catalog_admin +) -> tuple[subprocess.Popen | None, Conf | None]: + """ + Setup and start services including Podman service and callback server. + """ + + # Setup container.conf file for the callback server logs if needed + if loader_helper.is_config_modification_required(): + if click.confirm( + f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" + f"needs to be modified to allow for container logging.\n" + f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. 
Do so now?\n" + ): + loader_helper.setup_callback_server_logs() + else: + print("Permission denied and exiting ...") + return None, None + + # Start podman service + uid = os.getuid() + proc = loader_helper.start_podman_service(uid) + + # Set up configuration for uploader + conf = Conf( + job_dir=job_dir, + kb_base_url=kb_base_url, + token_filepath=token_filepath, + max_callback_server_tasks=cbs_max_tasks, + catalog_admin=catalog_admin, + ) + + return proc, conf + + +def _validate_arguments( + workspace_id: int, + batch_size: int, + cbs_max_tasks: int, +): + if workspace_id <= 0: + raise ValueError("workspace_id needs to be > 0") + if batch_size <= 0: + raise ValueError("batch_size needs to be > 0") + if cbs_max_tasks <= 0: + raise ValueError("cbs_max_tasks needs to be > 0") + + +def _prepare_directories( + root_dir: str, + username: str, + kbase_collection: str, + source_version: str, + env: str, + workspace_id: int, +) -> tuple[str, str, str, str]: + """ + Prepare directories used for the uploader. + """ + job_dir = loader_helper.make_job_dir(root_dir, username) + collection_source_dir = loader_helper.make_collection_source_dir( + root_dir, loader_common_names.DEFAULT_ENV, kbase_collection, source_version + ) + collections_source_dir = loader_helper.make_collection_source_dir( + root_dir, env, kbase_collection, source_version + ) + source_dir = loader_helper.make_sourcedata_ws_dir(root_dir, env, workspace_id) + + return job_dir, collection_source_dir, collections_source_dir, source_dir + + def main(): parser = _get_parser() args = parser.parse_args() @@ -788,64 +931,28 @@ def main(): gfu_service_ver = args.gfu_service_ver keep_job_dir = args.keep_job_dir catalog_admin = args.as_catalog_admin - load_id = args.load_id - if not load_id: - print("load_id is not provided. Generating a load_id ...") - load_id = uuid.uuid4().hex + load_id = args.load_id or uuid.uuid4().hex print( f"load_id is {load_id}.\n" f"Please keep using this load version until the load is complete!" ) - env = args.env kb_base_url = loader_common_names.KB_BASE_URL_MAP[env] - if workspace_id <= 0: - parser.error(f"workspace_id needs to be > 0") - if batch_size <= 0: - parser.error(f"batch_size needs to be > 0") - if cbs_max_tasks <= 0: - parser.error(f"cbs_max_tasks needs to be > 0") + _validate_arguments(workspace_id, batch_size, cbs_max_tasks) - uid = os.getuid() username = os.getlogin() - - job_dir = loader_helper.make_job_dir(root_dir, username) - collection_source_dir = loader_helper.make_collection_source_dir( - root_dir, loader_common_names.DEFAULT_ENV, kbase_collection, source_version - ) - collections_source_dir = loader_helper.make_collection_source_dir( - root_dir, env, kbase_collection, source_version + job_dir, collection_source_dir, collections_source_dir, source_dir = _prepare_directories( + root_dir, username, kbase_collection, source_version, env, workspace_id ) - source_dir = loader_helper.make_sourcedata_ws_dir(root_dir, env, workspace_id) proc = None conf = None - try: - # setup container.conf file for the callback server logs if needed - if loader_helper.is_config_modification_required(): - if click.confirm( - f"The config file at {loader_common_names.CONTAINERS_CONF_PATH}\n" - f"needs to be modified to allow for container logging.\n" - f"Params 'seccomp_profile' and 'log_driver' will be added/updated under section [containers]. 
Do so now?\n" - ): - loader_helper.setup_callback_server_logs() - else: - print("Permission denied and exiting ...") - return - - # start podman service - proc = loader_helper.start_podman_service(uid) - - # set up conf for uploader, start callback server, and upload objects to workspace - conf = Conf( - job_dir=job_dir, - kb_base_url=kb_base_url, - token_filepath=token_filepath, - max_callback_server_tasks=cbs_max_tasks, - catalog_admin=catalog_admin, - ) + proc, conf = _setup_and_start_services(job_dir, kb_base_url, token_filepath, cbs_max_tasks, catalog_admin) + if not proc or not conf: + print("Failed to start services. Exiting ...") + return count, wait_to_upload_objs = _fetch_objects_to_upload( env, @@ -858,38 +965,27 @@ def main(): # set up workspace client ws_url = os.path.join(kb_base_url, "ws") ws = Workspace(ws_url, token=conf.token) + asu_client = AssemblyUtil(conf.callback_url, service_ver=au_service_ver, token=conf.token) + gfu_client = GenomeFileUtil(conf.callback_url, service_ver=gfu_service_ver, token=conf.token) - wait_to_upload_tuples = _dict2tuple_list(wait_to_upload_objs) - upload_results = _process_failed_uploads( + # check if the objects are already uploaded to the workspace + obj_names_processed = _check_existing_uploads_and_recovery( ws, + env, workspace_id, load_id, - wait_to_upload_tuples, - AssemblyUtil(conf.callback_url, service_ver=au_service_ver, token=conf.token), + collections_source_dir, + source_dir, + wait_to_upload_objs, + asu_client, conf.job_data_dir, ) - # fix inconsistencies between the workspace and the local yaml files - if upload_results: - print("Start failure recovery process ...") - for upload_result in upload_results: - _post_process( - env, - workspace_id, - load_id, - collections_source_dir, - source_dir, - upload_result - ) - obj_name = upload_result.genome_tuple.obj_name - wait_to_upload_objs.pop(obj_name) - - print("Recovery process completed ...") + for obj_name in obj_names_processed: + wait_to_upload_objs.pop(obj_name) if not wait_to_upload_objs: - print( - f"All {count} files already exist in workspace {workspace_id}" - ) + print(f"All {count} files already exist in workspace {workspace_id}") return wtus_len = len(wait_to_upload_objs) @@ -897,9 +993,7 @@ def main(): print(f"Detected {count - wtus_len} object files already exist in workspace") _prepare_skd_job_dir_to_upload(conf, wait_to_upload_objs) - print( - f"{wtus_len} objects in {conf.job_data_dir} are ready to upload to workspace {workspace_id}" - ) + print(f"{wtus_len} objects in {conf.job_data_dir} are ready to upload to workspace {workspace_id}") start = time.time() @@ -912,8 +1006,8 @@ def main(): wait_to_upload_objs, batch_size, source_dir, - AssemblyUtil(conf.callback_url, service_ver=au_service_ver, token=conf.token), - GenomeFileUtil(conf.callback_url, service_ver=gfu_service_ver, token=conf.token), + asu_client, + gfu_client, conf.job_data_dir, ) @@ -926,16 +1020,7 @@ def main(): ) finally: - # stop callback server if it is on - if conf: - conf.stop_callback_server() - - # stop podman service if it is on - if proc: - proc.terminate() - - if not keep_job_dir: - shutil.rmtree(job_dir) + _cleanup_resources(conf, proc, job_dir, keep_job_dir) if __name__ == "__main__": From 1fb30aa5b554317bd4135bbbf77fc6b1cde27476 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 13:23:11 -0500 Subject: [PATCH 29/50] rename collection_source_dir/collections_source_dir --- .../workspace_uploader/workspace_uploader.py | 54 +++++++++---------- 1 file changed, 25 insertions(+), 
29 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 18a62b968..1d56db893 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -368,16 +368,16 @@ def _fetch_objects_to_upload( upload_env_key: str, workspace_id: int, load_id: str, - collection_source_dir: str, + ncbi_coll_src_dir: str, upload_file_ext: list[str], ) -> tuple[int, dict[str, str]]: count = 0 wait_to_upload_objs = dict() obj_dirs = [ - os.path.join(collection_source_dir, d) - for d in os.listdir(collection_source_dir) - if os.path.isdir(os.path.join(collection_source_dir, d)) + os.path.join(ncbi_coll_src_dir, d) + for d in os.listdir(ncbi_coll_src_dir) + if os.path.isdir(os.path.join(ncbi_coll_src_dir, d)) ] for obj_dir in obj_dirs: @@ -506,7 +506,7 @@ def _post_process( upload_env_key: str, workspace_id: int, load_id: str, - collections_source_dir: str, + ws_coll_src_dir: str, source_data_dir: str, upload_result: UploadResult, ) -> None: @@ -519,7 +519,7 @@ def _post_process( Creates a softlink from new_dir in collectionssource to the contents of target_dir in sourcedata. """ - _process_genome_objects(collections_source_dir, + _process_genome_objects(ws_coll_src_dir, source_data_dir, upload_result, ) @@ -535,7 +535,7 @@ def _post_process( def _process_genome_objects( - collections_source_dir: str, + ws_coll_src_dir: str, source_data_dir: str, upload_result: UploadResult, ) -> None: @@ -562,7 +562,7 @@ def _process_genome_objects( upload_result.genome_obj_info) # create a softlink from new_dir in collectionssource to the contents of target_dir in sourcedata - new_dir = os.path.join(collections_source_dir, assembly_upa) + new_dir = os.path.join(ws_coll_src_dir, assembly_upa) loader_helper.create_softlink_between_dirs(new_dir, ws_source_data_dir) @@ -667,7 +667,7 @@ def _upload_objects_in_parallel( upload_env_key: str, workspace_id: int, load_id: str, - collections_source_dir: str, + ws_coll_src_dir: str, wait_to_upload_objs: dict[str, str], batch_size: int, source_data_dir: str, @@ -683,7 +683,7 @@ def _upload_objects_in_parallel( upload_env_key: environment variable key in uploaded.yaml file workspace_id: target workspace id load_id: load id - collections_source_dir: a directory in collectionssource that creates new directories linking to sourcedata. i.e. /root_dir/collectionssource + ws_coll_src_dir: a directory in collectionssource representing workspace that creates new directories linking to sourcedata. i.e. /root_dir/collectionssource/// wait_to_upload_objs: a dictionary that maps object file name to object directory batch_size: a number of files to upload per batch source_data_dir: directory for all source data. i.e. 
/root_dir/sourcedata @@ -731,7 +731,7 @@ def _upload_objects_in_parallel( upload_env_key, workspace_id, load_id, - collections_source_dir, + ws_coll_src_dir, source_data_dir, upload_result ) @@ -778,7 +778,7 @@ def _check_existing_uploads_and_recovery( env: str, workspace_id: int, load_id: str, - collections_source_dir: str, + ws_coll_src_dir: str, source_dir: str, wait_to_upload_objs: dict[str, str], asu_client: AssemblyUtil, @@ -807,7 +807,7 @@ def _check_existing_uploads_and_recovery( env, workspace_id, load_id, - collections_source_dir, + ws_coll_src_dir, source_dir, upload_result ) @@ -883,13 +883,14 @@ def _validate_arguments( workspace_id: int, batch_size: int, cbs_max_tasks: int, + parser: argparse.ArgumentParser ): if workspace_id <= 0: - raise ValueError("workspace_id needs to be > 0") + parser.error(f"workspace_id needs to be > 0") if batch_size <= 0: - raise ValueError("batch_size needs to be > 0") + parser.error(f"batch_size needs to be > 0") if cbs_max_tasks <= 0: - raise ValueError("cbs_max_tasks needs to be > 0") + parser.error(f"cbs_max_tasks needs to be > 0") def _prepare_directories( @@ -904,15 +905,15 @@ def _prepare_directories( Prepare directories used for the uploader. """ job_dir = loader_helper.make_job_dir(root_dir, username) - collection_source_dir = loader_helper.make_collection_source_dir( + ncbi_coll_src_dir = loader_helper.make_collection_source_dir( root_dir, loader_common_names.DEFAULT_ENV, kbase_collection, source_version ) - collections_source_dir = loader_helper.make_collection_source_dir( + ws_coll_src_dir = loader_helper.make_collection_source_dir( root_dir, env, kbase_collection, source_version ) source_dir = loader_helper.make_sourcedata_ws_dir(root_dir, env, workspace_id) - return job_dir, collection_source_dir, collections_source_dir, source_dir + return job_dir, ncbi_coll_src_dir, ws_coll_src_dir, source_dir def main(): @@ -939,10 +940,10 @@ def main(): env = args.env kb_base_url = loader_common_names.KB_BASE_URL_MAP[env] - _validate_arguments(workspace_id, batch_size, cbs_max_tasks) + _validate_arguments(workspace_id, batch_size, cbs_max_tasks, parser) username = os.getlogin() - job_dir, collection_source_dir, collections_source_dir, source_dir = _prepare_directories( + job_dir, ncbi_coll_src_dir, ws_coll_src_dir, source_dir = _prepare_directories( root_dir, username, kbase_collection, source_version, env, workspace_id ) @@ -955,12 +956,7 @@ def main(): return count, wait_to_upload_objs = _fetch_objects_to_upload( - env, - workspace_id, - load_id, - collection_source_dir, - upload_file_ext, - ) + env, workspace_id, load_id, ncbi_coll_src_dir, upload_file_ext) # set up workspace client ws_url = os.path.join(kb_base_url, "ws") @@ -974,7 +970,7 @@ def main(): env, workspace_id, load_id, - collections_source_dir, + ws_coll_src_dir, source_dir, wait_to_upload_objs, asu_client, @@ -1002,7 +998,7 @@ def main(): env, workspace_id, load_id, - collections_source_dir, + ws_coll_src_dir, wait_to_upload_objs, batch_size, source_dir, From ea38ae78fdc7b1357c912d99eff1f2fe7d4abf7f Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 13:31:05 -0500 Subject: [PATCH 30/50] fix test --- test/src/loaders/workspace_uploader/workspace_uploader_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 9b4cb2a15..17ab97294 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ 
b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -534,7 +534,7 @@ def test_upload_genome_files_in_parallel(setup_and_teardown): upload_env_key="CI", workspace_id=72231, load_id="214", - collections_source_dir=collection_source_dir, + ws_coll_src_dir=collection_source_dir, wait_to_upload_objs=wait_to_upload_genomes, batch_size=2, source_data_dir=sourcedata_dir, From 76c4124b6c8ce0374ac73649c2aa760e275ae151 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 13:44:20 -0500 Subject: [PATCH 31/50] further reduce main() length --- .../workspace_uploader/workspace_uploader.py | 101 ++++++++++++------ 1 file changed, 70 insertions(+), 31 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 1d56db893..9457251b9 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -916,6 +916,71 @@ def _prepare_directories( return job_dir, ncbi_coll_src_dir, ws_coll_src_dir, source_dir +def _create_ws_clients( + kb_base_url, + callback_url, + token, + au_service_ver, + gfu_service_ver +) -> tuple[Workspace, AssemblyUtil, GenomeFileUtil]: + """ + Create workspace clients. + """ + ws_url = os.path.join(kb_base_url, "ws") + ws = Workspace(ws_url, token=token) + asu_client = AssemblyUtil(callback_url, service_ver=au_service_ver, token=token) + gfu_client = GenomeFileUtil(callback_url, service_ver=gfu_service_ver, token=token) + + return ws, asu_client, gfu_client + + +def _fetch_objs_to_upload( + env: str, + ws: Workspace, + workspace_id: int, + load_id: str, + ncbi_coll_src_dir: str, + ws_coll_src_dir: str, + source_dir: str, + upload_file_ext: list[str], + asu_client: AssemblyUtil, + job_data_dir: str +): + """ + Fetch objects to be uploaded to the workspace. + + Check if the objects are already uploaded to the workspace and perform recovery if needed. + """ + count, wait_to_upload_objs = _fetch_objects_to_upload( + env, workspace_id, load_id, ncbi_coll_src_dir, upload_file_ext) + + # check if the objects are already uploaded to the workspace + obj_names_processed = _check_existing_uploads_and_recovery( + ws, + env, + workspace_id, + load_id, + ws_coll_src_dir, + source_dir, + wait_to_upload_objs, + asu_client, + job_data_dir, + ) + + for obj_name in obj_names_processed: + wait_to_upload_objs.pop(obj_name) + + if not wait_to_upload_objs: + print(f"All {count} files already exist in workspace {workspace_id}") + return wait_to_upload_objs, count + + wtus_len = len(wait_to_upload_objs) + print(f"Originally planned to upload {count} object files") + print(f"Detected {count - wtus_len} object files already exist in workspace") + + return wait_to_upload_objs, count + + def main(): parser = _get_parser() args = parser.parse_args() @@ -955,41 +1020,15 @@ def main(): print("Failed to start services. 
Exiting ...") return - count, wait_to_upload_objs = _fetch_objects_to_upload( - env, workspace_id, load_id, ncbi_coll_src_dir, upload_file_ext) - - # set up workspace client - ws_url = os.path.join(kb_base_url, "ws") - ws = Workspace(ws_url, token=conf.token) - asu_client = AssemblyUtil(conf.callback_url, service_ver=au_service_ver, token=conf.token) - gfu_client = GenomeFileUtil(conf.callback_url, service_ver=gfu_service_ver, token=conf.token) + ws, asu_client, gfu_client = _create_ws_clients( + kb_base_url, conf.callback_url, conf.token, au_service_ver, gfu_service_ver) - # check if the objects are already uploaded to the workspace - obj_names_processed = _check_existing_uploads_and_recovery( - ws, - env, - workspace_id, - load_id, - ws_coll_src_dir, - source_dir, - wait_to_upload_objs, - asu_client, - conf.job_data_dir, + wait_to_upload_objs, count = _fetch_objs_to_upload( + env, ws, workspace_id, load_id, ncbi_coll_src_dir, ws_coll_src_dir, source_dir, upload_file_ext, asu_client, conf.job_data_dir ) - for obj_name in obj_names_processed: - wait_to_upload_objs.pop(obj_name) - - if not wait_to_upload_objs: - print(f"All {count} files already exist in workspace {workspace_id}") - return - - wtus_len = len(wait_to_upload_objs) - print(f"Originally planned to upload {count} object files") - print(f"Detected {count - wtus_len} object files already exist in workspace") - _prepare_skd_job_dir_to_upload(conf, wait_to_upload_objs) - print(f"{wtus_len} objects in {conf.job_data_dir} are ready to upload to workspace {workspace_id}") + print(f"{len(wait_to_upload_objs)} objects in {conf.job_data_dir} are ready to upload to workspace {workspace_id}") start = time.time() From 912120c6a8032ae51f7c4ec67993e60e54c7ea77 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 13:51:17 -0500 Subject: [PATCH 32/50] further reduce main() length --- .../workspace_uploader/workspace_uploader.py | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 9457251b9..60acf5074 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -879,12 +879,30 @@ def _setup_and_start_services( return proc, conf -def _validate_arguments( - workspace_id: int, - batch_size: int, - cbs_max_tasks: int, - parser: argparse.ArgumentParser -): +def _get_parser_args(): + parser = _get_parser() + args = parser.parse_args() + + workspace_id = args.workspace_id + kbase_collection = getattr(args, loader_common_names.KBASE_COLLECTION_ARG_NAME) + source_version = getattr(args, loader_common_names.SOURCE_VER_ARG_NAME) + root_dir = getattr(args, loader_common_names.ROOT_DIR_ARG_NAME) + token_filepath = args.token_filepath + upload_file_ext = args.upload_file_ext + batch_size = args.batch_size + cbs_max_tasks = args.cbs_max_tasks + au_service_ver = args.au_service_ver + gfu_service_ver = args.gfu_service_ver + keep_job_dir = args.keep_job_dir + catalog_admin = args.as_catalog_admin + load_id = args.load_id or uuid.uuid4().hex + print( + f"load_id is {load_id}.\n" + f"Please keep using this load version until the load is complete!" 
+ ) + env = args.env + kb_base_url = loader_common_names.KB_BASE_URL_MAP[env] + if workspace_id <= 0: parser.error(f"workspace_id needs to be > 0") if batch_size <= 0: @@ -892,6 +910,10 @@ def _validate_arguments( if cbs_max_tasks <= 0: parser.error(f"cbs_max_tasks needs to be > 0") + return (workspace_id, kbase_collection, source_version, root_dir, token_filepath, + upload_file_ext, batch_size, cbs_max_tasks, au_service_ver, gfu_service_ver, + keep_job_dir, catalog_admin, load_id, env, kb_base_url) + def _prepare_directories( root_dir: str, @@ -982,30 +1004,10 @@ def _fetch_objs_to_upload( def main(): - parser = _get_parser() - args = parser.parse_args() - - workspace_id = args.workspace_id - kbase_collection = getattr(args, loader_common_names.KBASE_COLLECTION_ARG_NAME) - source_version = getattr(args, loader_common_names.SOURCE_VER_ARG_NAME) - root_dir = getattr(args, loader_common_names.ROOT_DIR_ARG_NAME) - token_filepath = args.token_filepath - upload_file_ext = args.upload_file_ext - batch_size = args.batch_size - cbs_max_tasks = args.cbs_max_tasks - au_service_ver = args.au_service_ver - gfu_service_ver = args.gfu_service_ver - keep_job_dir = args.keep_job_dir - catalog_admin = args.as_catalog_admin - load_id = args.load_id or uuid.uuid4().hex - print( - f"load_id is {load_id}.\n" - f"Please keep using this load version until the load is complete!" - ) - env = args.env - kb_base_url = loader_common_names.KB_BASE_URL_MAP[env] - _validate_arguments(workspace_id, batch_size, cbs_max_tasks, parser) + (workspace_id, kbase_collection, source_version, root_dir, token_filepath, + upload_file_ext, batch_size, cbs_max_tasks, au_service_ver, gfu_service_ver, + keep_job_dir, catalog_admin, load_id, env, kb_base_url) = _get_parser_args() username = os.getlogin() job_dir, ncbi_coll_src_dir, ws_coll_src_dir, source_dir = _prepare_directories( From fe17bfa7749adcc4f9751c241f975eb2e99f12e0 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 13:58:15 -0500 Subject: [PATCH 33/50] update var name --- src/loaders/jobs/taskfarmer/task_generator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index f35c457ec..ad4ea6358 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -42,7 +42,7 @@ To reflect this, we have set the "threads" and "program_threads" parameters in "_create_task_list" to 32, indicating that each batch will use 32 cores. We have also set the execution time for GTDB-Tk to 65 minutes, with some additional buffer time. To allow for a sufficient number of batches to be run within a given time limit, -we have set the "NODE_TIME_LIMIT" to 5 hours. With these settings, we expect to be able to process up to 16 batches, +we have set the "NODE_TIME_LIMIT_DEFAULT" to 5 hours. With these settings, we expect to be able to process up to 16 batches, or 16,000 genomes, per node within the 5-hour time limit. We plan to make these parameters configurable based on the specific tool being used. 
After conducting performance tests, we found that utilizing 32 cores per batch and running 4 batches in parallel per NERSC node resulted in optimal performance, despite each node having a total of @@ -55,10 +55,10 @@ TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65, 'tasks_per_node': 4}, 'eggnog': {'chunk_size': 100, 'exe_time': 15, 'node_time_limit': 0.5}, # Memory intensive tool - reserve more nodes with less node reservation time 'default': {'chunk_size': 5000, 'exe_time': 60}} -NODE_TIME_LIMIT = 5 # hours # TODO: automatically calculate this based on tool execution time and NODE_THREADS +NODE_TIME_LIMIT_DEFAULT = 5 # hours # TODO: automatically calculate this based on tool execution time and NODE_THREADS MAX_NODE_NUM = 100 # maximum number of nodes to use # Used as THREADS variable in the batch script which controls the number of parallel tasks per node -TASKS_PER_NODE = 1 +TASKS_PER_NODE_DEFAULT = 1 REGISTRY = 'ghcr.io/kbase/collections' VERSION_FILE = 'versions.yaml' @@ -282,7 +282,7 @@ def _get_node_task_count(tool: str) -> int: Get the number of parallel tasks to run on a node """ - return TASK_META.get(tool, TASK_META['default']).get('tasks_per_node', TASKS_PER_NODE) + return TASK_META.get(tool, TASK_META['default']).get('tasks_per_node', TASKS_PER_NODE_DEFAULT) def _get_node_time_limit(tool: str) -> float: @@ -293,7 +293,7 @@ def _get_node_time_limit(tool: str) -> float: If in TASK_META, the tool has a different time limit (node_time_limit), we will use that. """ - return TASK_META.get(tool, TASK_META['default']).get('node_time_limit', NODE_TIME_LIMIT) + return TASK_META.get(tool, TASK_META['default']).get('node_time_limit', NODE_TIME_LIMIT_DEFAULT) def _float_to_time(float_hours: float) -> str: From 7a53aa75803db88f96a8fd48ec6929b65acbd0d8 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 16:58:27 -0500 Subject: [PATCH 34/50] rename ncbi_coll_src_dir --- .../workspace_uploader/workspace_uploader.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 60acf5074..299083edc 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -368,16 +368,16 @@ def _fetch_objects_to_upload( upload_env_key: str, workspace_id: int, load_id: str, - ncbi_coll_src_dir: str, + external_coll_src_dir: str, upload_file_ext: list[str], ) -> tuple[int, dict[str, str]]: count = 0 wait_to_upload_objs = dict() obj_dirs = [ - os.path.join(ncbi_coll_src_dir, d) - for d in os.listdir(ncbi_coll_src_dir) - if os.path.isdir(os.path.join(ncbi_coll_src_dir, d)) + os.path.join(external_coll_src_dir, d) + for d in os.listdir(external_coll_src_dir) + if os.path.isdir(os.path.join(external_coll_src_dir, d)) ] for obj_dir in obj_dirs: @@ -927,7 +927,7 @@ def _prepare_directories( Prepare directories used for the uploader. 
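# Indicative layout of the four paths prepared here; the exact structure comes from the
# loader_helper.make_* helpers, so the descriptions below are assumptions for illustration:
#   job_dir               - per-user job directory under root_dir
#   external_coll_src_dir - collectionssource directory for the default (external/NCBI) source
#   ws_coll_src_dir       - collectionssource directory for the target KBase environment
#   source_dir            - sourcedata directory keyed by the target workspace id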
""" job_dir = loader_helper.make_job_dir(root_dir, username) - ncbi_coll_src_dir = loader_helper.make_collection_source_dir( + external_coll_src_dir = loader_helper.make_collection_source_dir( root_dir, loader_common_names.DEFAULT_ENV, kbase_collection, source_version ) ws_coll_src_dir = loader_helper.make_collection_source_dir( @@ -935,7 +935,7 @@ def _prepare_directories( ) source_dir = loader_helper.make_sourcedata_ws_dir(root_dir, env, workspace_id) - return job_dir, ncbi_coll_src_dir, ws_coll_src_dir, source_dir + return job_dir, external_coll_src_dir, ws_coll_src_dir, source_dir def _create_ws_clients( @@ -961,7 +961,7 @@ def _fetch_objs_to_upload( ws: Workspace, workspace_id: int, load_id: str, - ncbi_coll_src_dir: str, + external_coll_src_dir: str, ws_coll_src_dir: str, source_dir: str, upload_file_ext: list[str], @@ -974,7 +974,7 @@ def _fetch_objs_to_upload( Check if the objects are already uploaded to the workspace and perform recovery if needed. """ count, wait_to_upload_objs = _fetch_objects_to_upload( - env, workspace_id, load_id, ncbi_coll_src_dir, upload_file_ext) + env, workspace_id, load_id, external_coll_src_dir, upload_file_ext) # check if the objects are already uploaded to the workspace obj_names_processed = _check_existing_uploads_and_recovery( @@ -1010,7 +1010,7 @@ def main(): keep_job_dir, catalog_admin, load_id, env, kb_base_url) = _get_parser_args() username = os.getlogin() - job_dir, ncbi_coll_src_dir, ws_coll_src_dir, source_dir = _prepare_directories( + job_dir, external_coll_src_dir, ws_coll_src_dir, source_dir = _prepare_directories( root_dir, username, kbase_collection, source_version, env, workspace_id ) @@ -1026,7 +1026,7 @@ def main(): kb_base_url, conf.callback_url, conf.token, au_service_ver, gfu_service_ver) wait_to_upload_objs, count = _fetch_objs_to_upload( - env, ws, workspace_id, load_id, ncbi_coll_src_dir, ws_coll_src_dir, source_dir, upload_file_ext, asu_client, conf.job_data_dir + env, ws, workspace_id, load_id, external_coll_src_dir, ws_coll_src_dir, source_dir, upload_file_ext, asu_client, conf.job_data_dir ) _prepare_skd_job_dir_to_upload(conf, wait_to_upload_objs) From 414435b4f3bfe84f146bd1f4068ea4197f953301 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 18:28:49 -0500 Subject: [PATCH 35/50] update var name --- .../workspace_uploader/workspace_uploader.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index 299083edc..aa8961b49 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -938,7 +938,7 @@ def _prepare_directories( return job_dir, external_coll_src_dir, ws_coll_src_dir, source_dir -def _create_ws_clients( +def _create_kbase_clients( kb_base_url, callback_url, token, @@ -967,7 +967,7 @@ def _fetch_objs_to_upload( upload_file_ext: list[str], asu_client: AssemblyUtil, job_data_dir: str -): +) -> dict[str, str]: """ Fetch objects to be uploaded to the workspace. 
@@ -994,13 +994,13 @@ def _fetch_objs_to_upload( if not wait_to_upload_objs: print(f"All {count} files already exist in workspace {workspace_id}") - return wait_to_upload_objs, count + return wait_to_upload_objs wtus_len = len(wait_to_upload_objs) print(f"Originally planned to upload {count} object files") print(f"Detected {count - wtus_len} object files already exist in workspace") - return wait_to_upload_objs, count + return wait_to_upload_objs def main(): @@ -1022,12 +1022,14 @@ def main(): print("Failed to start services. Exiting ...") return - ws, asu_client, gfu_client = _create_ws_clients( + ws, asu_client, gfu_client = _create_kbase_clients( kb_base_url, conf.callback_url, conf.token, au_service_ver, gfu_service_ver) - wait_to_upload_objs, count = _fetch_objs_to_upload( + wait_to_upload_objs = _fetch_objs_to_upload( env, ws, workspace_id, load_id, external_coll_src_dir, ws_coll_src_dir, source_dir, upload_file_ext, asu_client, conf.job_data_dir ) + if not wait_to_upload_objs: + return _prepare_skd_job_dir_to_upload(conf, wait_to_upload_objs) print(f"{len(wait_to_upload_objs)} objects in {conf.job_data_dir} are ready to upload to workspace {workspace_id}") From e3cc212e9457b1699f28c6eee931b3f0ef18ae75 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 18:36:21 -0500 Subject: [PATCH 36/50] update some comments --- src/loaders/jobs/taskfarmer/task_generator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index ad4ea6358..d5ee7fe7d 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -205,8 +205,9 @@ def _create_task_list( For instance, if "threads" is set to 128 and "program_threads" to 32, then each task will run 4 batches in parallel. For tools capable of processing a batch of genomes, such as GTDB-TK and checkm2, - we have chosen to utilize the THREADS variable within Taskfarmer batch script (submit_taskfarmer.sl) for parallelization, - which means that "threads" and "program_threads" should have the same value. + we have chosen to utilize the THREADS variable within Taskfarmer batch script (submit_taskfarmer.sl) for parallelization. + The 'program_threads' is configured to match the CPU count per execution. To ensure each thread handles only one + task at a time, both 'threads' and 'program_threads' should be set to the same value. This ensures that parallelization only happens between tasks, and not within them. For tools that can only handle genomes individually, such as microtrait, mash and eggNOG, From 58248c65f8ad0c5e9d9799993baab1c25a7a624c Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 19:06:44 -0500 Subject: [PATCH 37/50] update some comments --- src/loaders/jobs/taskfarmer/task_generator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index d5ee7fe7d..a0ef7a210 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -207,8 +207,10 @@ def _create_task_list( For tools capable of processing a batch of genomes, such as GTDB-TK and checkm2, we have chosen to utilize the THREADS variable within Taskfarmer batch script (submit_taskfarmer.sl) for parallelization. The 'program_threads' is configured to match the CPU count per execution. 
To ensure each thread handles only one - task at a time, both 'threads' and 'program_threads' should be set to the same value. - This ensures that parallelization only happens between tasks, and not within them. + task at a time, both 'threads' and 'program_threads' should be set to the same value. Due to the fact the underlying + program utilizes 'threads'/'program_threads' to determine the quantity of parallel execution on a thread. + Setting 'threads' and 'program_threads' the same same number ensures that parallelization only happens between tasks, + and not within tasks. For tools that can only handle genomes individually, such as microtrait, mash and eggNOG, we still use the THREADS variable for task parallelization. However, within each task, we further parallelize tool From bda78e154665ae664638851fbe9088c409010646 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 19:16:04 -0500 Subject: [PATCH 38/50] refactor WSObjTuple --- .../workspace_uploader/upload_result.py | 3 ++- .../workspace_uploader/workspace_uploader.py | 27 +++++++++++++------ .../workspace_uploader_test.py | 17 +++++++----- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/loaders/workspace_uploader/upload_result.py b/src/loaders/workspace_uploader/upload_result.py index dee2f8044..6089c12a2 100644 --- a/src/loaders/workspace_uploader/upload_result.py +++ b/src/loaders/workspace_uploader/upload_result.py @@ -12,11 +12,12 @@ WSObjTuple = namedtuple( "WSObjTuple", - ["obj_name", "obj_coll_src_dir", "container_internal_file"], + ["obj_name", "obj_file_name", "obj_coll_src_dir", "container_internal_file"], ) """ WSObjTuple is a named tuple that contains the following fields: - obj_name: the name of the object (in many cases, also serves as the file name) +- obj_file_name: the file name of the associated file - obj_coll_src_dir: the directory of the associated file in the collection source directory - container_internal_file: the associated file path in the docker container """ diff --git a/src/loaders/workspace_uploader/workspace_uploader.py b/src/loaders/workspace_uploader/workspace_uploader.py index aa8961b49..b598bc23c 100644 --- a/src/loaders/workspace_uploader/workspace_uploader.py +++ b/src/loaders/workspace_uploader/workspace_uploader.py @@ -279,7 +279,12 @@ def _upload_genomes_to_workspace( loader_helper.create_hardlink_between_files(collection_source_data_dir / fasta_file_name, local_assembly_path) - assembly_tuple = WSObjTuple(fasta_file_name, collection_source_data_dir, container_assembly_path) + assembly_tuple = WSObjTuple( + obj_name=assembly_obj_info[1], + obj_file_name=fasta_file_name, + obj_coll_src_dir=collection_source_data_dir, + container_internal_file=container_assembly_path, + ) upload_result = UploadResult( genome_obj_info=genome_obj_info, @@ -354,9 +359,9 @@ def _update_upload_status_yaml_file( data[upload_env_key][workspace_id][load_id] = { _KEY_ASSEMBLY_UPA: upload_result.assembly_upa, - _KEY_ASSEMBLY_FILENAME: upload_result.assembly_tuple.obj_name, + _KEY_ASSEMBLY_FILENAME: upload_result.assembly_tuple.obj_file_name, _KEY_GENOME_UPA: upload_result.genome_upa, - _KEY_GENOME_FILENAME: upload_result.genome_tuple.obj_name, + _KEY_GENOME_FILENAME: upload_result.genome_tuple.obj_file_name, } file_path = _get_yaml_file_path(obj_tuple.obj_coll_src_dir) @@ -545,7 +550,7 @@ def _process_genome_objects( # create hardlink for the FASTA file from upload data collection source directory (e.g. GTDB) # to the corresponding workspace object directory. 
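# Illustrative use of the widened tuple (the literal values are made up): for an assembly
# object created during a genome upload the workspace object name and the on-disk FASTA
# name differ, e.g.
#     WSObjTuple(
#         obj_name='some_genome_assembly',            # workspace object name from obj_info
#         obj_file_name='some_genome_genomic.fna.gz', # FASTA file name on disk
#         obj_coll_src_dir=collection_source_data_dir,
#         container_internal_file=container_assembly_path,
#     )
# whereas for a GenBank genome tuple obj_name and obj_file_name hold the same file name.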
assembly_tuple, assembly_upa = upload_result.assembly_tuple, upload_result.assembly_upa - coll_src_assembly = Path(_get_source_file(assembly_tuple.obj_coll_src_dir, assembly_tuple.obj_name)) + coll_src_assembly = Path(_get_source_file(assembly_tuple.obj_coll_src_dir, assembly_tuple.obj_file_name)) ws_source_data_dir = os.path.join(source_data_dir, assembly_upa) os.makedirs(ws_source_data_dir, exist_ok=True) @@ -611,9 +616,12 @@ def _build_assembly_tuples( coll_src_assembly_path, job_data_dir) - assembly_tuple = WSObjTuple(assembly_file, - genome_tuple.obj_coll_src_dir, - None) # container_internal_file is fine to be None here as it's not used during failure recovery + assembly_tuple = WSObjTuple( + obj_name=genome_tuple.obj_name + "_assembly", # aligns with the Assembly object name generated by GFU. Given that the assembly object name isn't utilized during failure recovery, there's no need to invoke a workspace call to retrieve it. + obj_file_name=assembly_file, + obj_coll_src_dir=genome_tuple.obj_coll_src_dir, + container_internal_file=None, # container_internal_file is fine to be None here as it's not used during failure recovery + ) assembly_tuples.append(assembly_tuple) return assembly_tuples @@ -753,7 +761,10 @@ def _upload_objects_in_parallel( def _dict2tuple_list(objs_dict: dict[str, str]) -> list[WSObjTuple]: ws_object_tuple_list = [ WSObjTuple( - i[0], i[1], os.path.join(_JOB_DIR_IN_CONTAINER, i[0]) + obj_name=i[0], + obj_file_name=i[0], + obj_coll_src_dir=i[1], + container_internal_file=os.path.join(_JOB_DIR_IN_CONTAINER, i[0]), ) for i in objs_dict.items() ] diff --git a/test/src/loaders/workspace_uploader/workspace_uploader_test.py b/test/src/loaders/workspace_uploader/workspace_uploader_test.py index 17ab97294..5d8bc0051 100644 --- a/test/src/loaders/workspace_uploader/workspace_uploader_test.py +++ b/test/src/loaders/workspace_uploader/workspace_uploader_test.py @@ -210,12 +210,12 @@ def test_update_upload_status_yaml_file(setup_and_teardown): genome_dir = params.genome_dirs[0] assembly_name = ASSEMBLY_NAMES[0] assembly_tuple = workspace_uploader.WSObjTuple( - assembly_name, genome_dir, "/path/to/file/in/AssembilyUtil" + assembly_name, assembly_name, genome_dir, "/path/to/file/in/AssembilyUtil" ) genome_name = GEMOME_NAMES[0] genome_tuple = workspace_uploader.WSObjTuple( - genome_name, genome_dir, "/path/to/file/in/GenomeFileUtil" + genome_name, genome_name, genome_dir, "/path/to/file/in/GenomeFileUtil" ) upload_result = UploadResult(assembly_tuple=assembly_tuple, @@ -272,10 +272,10 @@ def test_fetch_objects_to_upload(setup_and_teardown): # Both genome files will be skipped in the next fetch_assemblies_to_upload call for genome_name, assembly_name, genome_dir in zip(GEMOME_NAMES, ASSEMBLY_NAMES, genome_dirs): assembly_tuple = workspace_uploader.WSObjTuple( - assembly_name, genome_dir, "/path/to/file/in/AssembilyUtil" + assembly_name, assembly_name, genome_dir, "/path/to/file/in/AssembilyUtil" ) genome_tuple = workspace_uploader.WSObjTuple( - genome_name, genome_dir, "/path/to/file/in/GenomeFileUtil" + genome_name, genome_name, genome_dir, "/path/to/file/in/GenomeFileUtil" ) upload_result = UploadResult(assembly_tuple=assembly_tuple, assembly_obj_info=ASSEMBLY_OBJ_INFOS[0], @@ -335,8 +335,8 @@ def test_post_process_with_genome(setup_and_teardown): assembly_name = assembly_obj_info[1] genome_name = genome_obj_info[1] - assembly_tuple = workspace_uploader.WSObjTuple(assembly_name, genome_coll_src_dir, container_dir / assembly_name) - genome_tuple = 
workspace_uploader.WSObjTuple(genome_name, genome_coll_src_dir, container_dir / genome_name) + assembly_tuple = workspace_uploader.WSObjTuple(assembly_name, assembly_name, genome_coll_src_dir, container_dir / assembly_name) + genome_tuple = workspace_uploader.WSObjTuple(genome_name, genome_name, genome_coll_src_dir, container_dir / genome_name) assembly_upa = obj_info_to_upa(assembly_obj_info, underscore_sep=True) genome_upa = obj_info_to_upa(genome_obj_info, underscore_sep=True) @@ -410,7 +410,7 @@ def test_upload_genome_to_workspace(setup_and_teardown): "assembly_info": assembly_obj_info, "genome_info": genome_obj_info}]} genome_tuple = workspace_uploader.WSObjTuple( - genome_name, genome_coll_src_dir, genome_container_file + genome_name, genome_name, genome_coll_src_dir, genome_container_file ) with patch.object(workspace_uploader, '_JOB_DIR_IN_CONTAINER', new=container_dir): @@ -420,6 +420,7 @@ def test_upload_genome_to_workspace(setup_and_teardown): expected_assembly_tuple = workspace_uploader.WSObjTuple( obj_name=assembly_name, + obj_file_name=assembly_name, obj_coll_src_dir=genome_coll_src_dir, container_internal_file=container_dir / assembly_name) @@ -462,6 +463,7 @@ def test_generator(setup_and_teardown): expected_genomeTuple_list = [ [ workspace_uploader.WSObjTuple( + "GCF_000979855.1_gtlEnvA5udCFS_genomic.gbff.gz", "GCF_000979855.1_gtlEnvA5udCFS_genomic.gbff.gz", genome_dirs[0], "/kb/module/work/tmp/GCF_000979855.1_gtlEnvA5udCFS_genomic.gbff.gz", @@ -469,6 +471,7 @@ def test_generator(setup_and_teardown): ], [ workspace_uploader.WSObjTuple( + "GCF_000979175.1_gtlEnvA5udCFS_genomic.gbff.gz", "GCF_000979175.1_gtlEnvA5udCFS_genomic.gbff.gz", genome_dirs[1], "/kb/module/work/tmp/GCF_000979175.1_gtlEnvA5udCFS_genomic.gbff.gz", From b7fad238a31c15e943e087fc09ef5ed01e8ee3a9 Mon Sep 17 00:00:00 2001 From: Tianhao Gu Date: Mon, 18 Mar 2024 19:22:26 -0500 Subject: [PATCH 39/50] Update src/loaders/jobs/taskfarmer/task_generator.py Co-authored-by: MrCreosote --- src/loaders/jobs/taskfarmer/task_generator.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index a0ef7a210..3cbfd8b0d 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -205,17 +205,23 @@ def _create_task_list( For instance, if "threads" is set to 128 and "program_threads" to 32, then each task will run 4 batches in parallel. For tools capable of processing a batch of genomes, such as GTDB-TK and checkm2, - we have chosen to utilize the THREADS variable within Taskfarmer batch script (submit_taskfarmer.sl) for parallelization. - The 'program_threads' is configured to match the CPU count per execution. To ensure each thread handles only one - task at a time, both 'threads' and 'program_threads' should be set to the same value. Due to the fact the underlying - program utilizes 'threads'/'program_threads' to determine the quantity of parallel execution on a thread. - Setting 'threads' and 'program_threads' the same same number ensures that parallelization only happens between tasks, - and not within tasks. + we have chosen to utilize the Slurm THREADS variable within Taskfarmer batch script (submit_taskfarmer.sl) for tool parallelization. + From here on we will call this value 'tasks' rather than 'threads' to avoid confusion with operating system + threads. + It determines the number of simultaneous tool executions per node. 
+ 'program_threads' is configured to match the CPU count per task execution. To ensure each task handles only one + tool execution at a time, both 'threads' and 'program_threads' should be set to the same value, due to the fact the task generator + program utilizes 'threads' divided 'program_threads' to determine the quantity of parallel execution in a task. + Setting 'threads' and 'program_threads' the same same number ensures that parallelization pf tool executions + only happens between tasks, and not within tasks. + Each tool execution is specified to use 'threads' operating system threads. For tools that can only handle genomes individually, such as microtrait, mash and eggNOG, we still use the THREADS variable for task parallelization. However, within each task, we further parallelize tool execution using the 'threads' variable. For instance, with 'threads' set to 16 and 'tasks_per_node' set to 4, each node will concurrently execute 4 tasks, with each task executing 16 parallel tool operations. + + TODO: how does program_threads affect single genome tools? TODO: make threads/program_threads configurable based on tool used. However, for the time being, we have set From accb66572f1d68a86386e04d68c4527443a0d46a Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 20:41:01 -0500 Subject: [PATCH 40/50] add more comments --- src/loaders/jobs/taskfarmer/task_generator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 3cbfd8b0d..80325bbde 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -212,7 +212,7 @@ def _create_task_list( 'program_threads' is configured to match the CPU count per task execution. To ensure each task handles only one tool execution at a time, both 'threads' and 'program_threads' should be set to the same value, due to the fact the task generator program utilizes 'threads' divided 'program_threads' to determine the quantity of parallel execution in a task. - Setting 'threads' and 'program_threads' the same same number ensures that parallelization pf tool executions + Setting 'threads' and 'program_threads' the same same number ensures that parallelization of tool executions only happens between tasks, and not within tasks. Each tool execution is specified to use 'threads' operating system threads. @@ -222,10 +222,8 @@ def _create_task_list( each node will concurrently execute 4 tasks, with each task executing 16 parallel tool operations. TODO: how does program_threads affect single genome tools? - - TODO: make threads/program_threads configurable based on tool used. However, for the time being, we have set - these parameters to 32 and , since this value has produced the highest throughput in our experiments. + these parameters to 32 since this value has produced the highest throughput for GTDB-TK in our experiments. 
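
To make the batching arithmetic described in this docstring concrete, here is a small self-contained sketch; the helper name and the max(1, ...) guard are illustrative and are not part of task_generator.py.

    def parallel_runs_per_task(threads: int, program_threads: int) -> int:
        # The task generator derives the number of concurrent tool invocations
        # per TaskFarmer task from threads divided by program_threads.
        return max(1, threads // program_threads)

    # Batch tools (GTDB-Tk, CheckM2): the two values are kept equal, so each
    # task runs a single tool invocation and parallelism comes only from the
    # number of tasks per node.
    assert parallel_runs_per_task(32, 32) == 1

    # If the values diverge, parallelism also appears inside a task,
    # e.g. threads=128 with program_threads=32 gives 4 concurrent batches.
    assert parallel_runs_per_task(128, 32) == 4

    # Per-genome tools (microtrait, Mash, eggNOG) with threads=16 and
    # tasks_per_node=4: each node runs 4 tasks, each with 16 parallel tool
    # invocations, i.e. 64 concurrent tool runs per node.
    print(4 * 16)  # 64
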
""" source_data_dir = make_collection_source_dir(root_dir, env, kbase_collection, source_ver) genome_ids = [path for path in os.listdir(source_data_dir) if From 36fc02542da71f45344e14a55133a7a7568f4f6e Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 20:43:01 -0500 Subject: [PATCH 41/50] remove todo item --- src/loaders/workspace_uploader/upload_result.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/loaders/workspace_uploader/upload_result.py b/src/loaders/workspace_uploader/upload_result.py index 6089c12a2..2a0596ffe 100644 --- a/src/loaders/workspace_uploader/upload_result.py +++ b/src/loaders/workspace_uploader/upload_result.py @@ -4,12 +4,6 @@ from src.common.common_helper import obj_info_to_upa -# TODO: this struct assumes obj_name is the file name, -# but not true for downloaded fasta file associated with Assembly object -# This is okay for now as we are currently focusing solely on Genome/Assembly uploads and not a combination of both. -# i.g. assembly object name: 'GCF_000979555.1_gtlEnvA5udCFS_genomic.gbff.gz_assembly' -# downloaded fasta file name: 'GCF_000979555.1_gtlEnvA5udCFS_genomic.gbff.gz_assembly.fasta' - WSObjTuple = namedtuple( "WSObjTuple", ["obj_name", "obj_file_name", "obj_coll_src_dir", "container_internal_file"], From 8f81ec2e00d8fb80da0dd0cb1de81d620ce847d9 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 18 Mar 2024 20:43:33 -0500 Subject: [PATCH 42/50] update a comment --- src/loaders/workspace_uploader/upload_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/workspace_uploader/upload_result.py b/src/loaders/workspace_uploader/upload_result.py index 2a0596ffe..b381737a5 100644 --- a/src/loaders/workspace_uploader/upload_result.py +++ b/src/loaders/workspace_uploader/upload_result.py @@ -10,7 +10,7 @@ ) """ WSObjTuple is a named tuple that contains the following fields: -- obj_name: the name of the object (in many cases, also serves as the file name) +- obj_name: the name of the object - obj_file_name: the file name of the associated file - obj_coll_src_dir: the directory of the associated file in the collection source directory - container_internal_file: the associated file path in the docker container From 98406d5b7f713bbc541e927d356b96f90cfe71ba Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Tue, 19 Mar 2024 09:51:44 -0500 Subject: [PATCH 43/50] update a comment --- src/loaders/jobs/taskfarmer/task_generator.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 80325bbde..cfa5b6ae2 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -205,16 +205,17 @@ def _create_task_list( For instance, if "threads" is set to 128 and "program_threads" to 32, then each task will run 4 batches in parallel. For tools capable of processing a batch of genomes, such as GTDB-TK and checkm2, - we have chosen to utilize the Slurm THREADS variable within Taskfarmer batch script (submit_taskfarmer.sl) for tool parallelization. + we have chosen to utilize the Slurm THREADS variable within Taskfarmer batch script (submit_taskfarmer.sl) for tool + parallelization. From here on we will call this value 'tasks' rather than 'threads' to avoid confusion with operating system - threads. - It determines the number of simultaneous tool executions per node. + threads. It determines the number of simultaneous tool executions per node. 
'program_threads' is configured to match the CPU count per task execution. To ensure each task handles only one - tool execution at a time, both 'threads' and 'program_threads' should be set to the same value, due to the fact the task generator - program utilizes 'threads' divided 'program_threads' to determine the quantity of parallel execution in a task. + tool execution at a time, both 'threads' and 'program_threads' should be set to the same value, due to the fact the + task generator program utilizes 'threads' divided 'program_threads' to determine the quantity of parallel execution + in a task. Setting 'threads' and 'program_threads' the same same number ensures that parallelization of tool executions only happens between tasks, and not within tasks. - Each tool execution is specified to use 'threads' operating system threads. + Each tool execution is specified to use 'program_threads' operating system threads. For tools that can only handle genomes individually, such as microtrait, mash and eggNOG, we still use the THREADS variable for task parallelization. However, within each task, we further parallelize tool From cc0b88d4d45bf00e3001f8f37b2e903765c1af73 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Tue, 19 Mar 2024 09:54:20 -0500 Subject: [PATCH 44/50] update a comment --- src/loaders/jobs/taskfarmer/task_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index cfa5b6ae2..ad174556e 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -221,8 +221,8 @@ def _create_task_list( we still use the THREADS variable for task parallelization. However, within each task, we further parallelize tool execution using the 'threads' variable. For instance, with 'threads' set to 16 and 'tasks_per_node' set to 4, each node will concurrently execute 4 tasks, with each task executing 16 parallel tool operations. - - TODO: how does program_threads affect single genome tools? + 'program_threads' is also utilized to match the operating system threads per tool execution. + TODO: make threads/program_threads configurable based on tool used. However, for the time being, we have set these parameters to 32 since this value has produced the highest throughput for GTDB-TK in our experiments. """ From 44a608e9bd8fd3d5108bf787fe4d4991acc82f75 Mon Sep 17 00:00:00 2001 From: Tianhao Gu Date: Tue, 19 Mar 2024 13:38:06 -0500 Subject: [PATCH 45/50] Update src/loaders/jobs/taskfarmer/task_generator.py Co-authored-by: MrCreosote --- src/loaders/jobs/taskfarmer/task_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index ad174556e..57392d0ce 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -211,7 +211,7 @@ def _create_task_list( threads. It determines the number of simultaneous tool executions per node. 'program_threads' is configured to match the CPU count per task execution. 
To ensure each task handles only one tool execution at a time, both 'threads' and 'program_threads' should be set to the same value, due to the fact the - task generator program utilizes 'threads' divided 'program_threads' to determine the quantity of parallel execution + task generator program utilizes 'threads' divided by 'program_threads' to determine the quantity of parallel execution in a task. Setting 'threads' and 'program_threads' the same same number ensures that parallelization of tool executions only happens between tasks, and not within tasks. From 151958712f4cf8416fe02e169939fcebe67b2bc0 Mon Sep 17 00:00:00 2001 From: Tianhao Gu Date: Tue, 19 Mar 2024 14:05:31 -0500 Subject: [PATCH 46/50] Update src/loaders/jobs/taskfarmer/task_generator.py Co-authored-by: MrCreosote --- src/loaders/jobs/taskfarmer/task_generator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 57392d0ce..49920285a 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -221,7 +221,9 @@ def _create_task_list( we still use the THREADS variable for task parallelization. However, within each task, we further parallelize tool execution using the 'threads' variable. For instance, with 'threads' set to 16 and 'tasks_per_node' set to 4, each node will concurrently execute 4 tasks, with each task executing 16 parallel tool operations. - 'program_threads' is also utilized to match the operating system threads per tool execution. + Each tool will also use 'program_threads' operating threads, and so in the current example if 'program_threads' + is set to 8, the total number of operating system threads will be 4 * 16 * 8 = 512 threads. This is a significant + difference from the batch tasks described above TODO: make threads/program_threads configurable based on tool used. However, for the time being, we have set these parameters to 32 since this value has produced the highest throughput for GTDB-TK in our experiments. From 47bf209d03640db465b1e3bb76c625b5c63ff983 Mon Sep 17 00:00:00 2001 From: Gavin Date: Tue, 26 Mar 2024 13:28:02 -0700 Subject: [PATCH 47/50] Safe filter fields in AQL The classic mistake of thinking since you control the fields you can just use dot notation. Nope, some sample fields have colons in them which borks things up. --- src/service/filtering/filters.py | 14 +++--- test/src/service/filtering/filters_test.py | 50 +++++++++++----------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/service/filtering/filters.py b/src/service/filtering/filters.py index 4f70573b9..747643acd 100644 --- a/src/service/filtering/filters.py +++ b/src/service/filtering/filters.py @@ -69,7 +69,7 @@ def to_arangosearch_aql(self, identifier: str, var_prefix: str) -> SearchQueryPa Convert the filter to lines of ArangoSearch AQL and bind variables. identifier - the identifier for where the search is to take place, for example - `doc.classification`. This will be inserted verbatim into the search constraint, e.g. + `doc["classification"]`. This will be inserted verbatim into the search constraint, e.g. `f"{identifer} > 47` var_prefix - a prefix to apply to variable names, including bind variables, to prevent collisions between multiple filters. @@ -145,7 +145,7 @@ def to_arangosearch_aql(self, identifier: str, var_prefix: str) -> SearchQueryPa Convert the filter to lines of ArangoSearch AQL and bind variables. 
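
A tiny before/after sketch of the generated AQL may help here; the field name is hypothetical, standing in for the colon-containing sample fields mentioned in the commit message.

    field = "checkm2:completeness"  # hypothetical sample-style key containing colons

    broken = f"FILTER doc.{field} >= @v1_low"
    # -> FILTER doc.checkm2:completeness >= @v1_low
    # Dot access only supports plain attribute names, so this is not parsed
    # as intended once the key contains a colon.

    fixed = f'FILTER doc["{field}"] >= @v1_low'
    # -> FILTER doc["checkm2:completeness"] >= @v1_low
    # Bracket access accepts arbitrary attribute names.

    # For the keep-field null checks the name is passed as a bind variable,
    # so no quoting is needed in the query text at all:
    keep_clause = "FILTER doc[@keep0] != null"
    bind_vars = {"keep0": field}

In the hunks that follow, the filter-field identifier is interpolated as a double-quoted string while the keep fields use @keepN bind variables; in both cases the attribute name is no longer parsed as AQL syntax.
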
identifier - the identifier for where the search is to take place, for example - `doc.classification`. This will be inserted verbatim into the search constraint, e.g. + `doc["classification"]`. This will be inserted verbatim into the search constraint, e.g. `f"{identifer} == true` var_prefix - a prefix to apply to variable names, including bind variables, to prevent collisions between multiple filters. @@ -291,7 +291,7 @@ def to_arangosearch_aql(self, identifier: str, var_prefix: str) -> SearchQueryPa Convert the filter to lines of ArangoSearch AQL and bind variables. identifier - the identifier for where the search is to take place, for example - `doc.classification`. This will be inserted verbatim into the search constraint, e.g. + `doc["classification"]`. This will be inserted verbatim into the search constraint, e.g. `f"{identifer} > 47` var_prefix - a prefix to apply to variable names, including bind variables, to prevent collisions between multiple filters. @@ -374,7 +374,7 @@ def to_arangosearch_aql(self, identifier: str, var_prefix: str) -> SearchQueryPa Convert the filter to lines of ArangoSearch AQL and bind variables. identifier - the identifier for where the search is to take place, for example - `doc.classification`. This will be inserted verbatim into the search constraint, e.g. + `doc["classification"]. This will be inserted verbatim into the search constraint, e.g. `f"{identifer} > 47` var_prefix - a prefix to apply to variable names, including bind variables, to prevent collisions between multiple filters. @@ -548,7 +548,7 @@ def _process_filters(self): "load_ver": self.load_ver, } for i, (field, filter_) in enumerate(self._filters.items(), start=1): - search_part = filter_.to_arangosearch_aql(f"{self.doc_var}.{field}", f"v{i}_") + search_part = filter_.to_arangosearch_aql(f"{self.doc_var}[\"{field}\"]", f"v{i}_") if search_part.variable_assignments: for var, expression in search_part.variable_assignments.items(): var_lines.append(f"LET {var} = {expression}") @@ -583,7 +583,7 @@ def _to_standard_aql(self): aql += f" FILTER {self.doc_var}.{names.FLD_LOAD_VERSION} == @load_ver\n" if self.keep_filter_nulls: for i, k in enumerate(self.keep): - aql += f" FILTER {self.doc_var}.@keep{i} != null\n" + aql += f" FILTER {self.doc_var}[@keep{i}] != null\n" bind_vars[f"keep{i}"] = k matchsel = f"{self.doc_var}.{names.FLD_MATCHES_SELECTIONS}" if self.match_spec.get_subset_filtering_id(): @@ -644,7 +644,7 @@ def _to_arangosearch_aql(self): if self.keep_filter_nulls: for i, k in enumerate(self.keep): aql += f" AND\n" - aql += f" {self.doc_var}.@keep{i} != null\n" + aql += f" {self.doc_var}[@keep{i}] != null\n" bind_vars[f"keep{i}"] = k if self.match_spec.get_subset_filtering_id(): bind_vars["internal_match_id"] = self.match_spec.get_subset_filtering_id() diff --git a/test/src/service/filtering/filters_test.py b/test/src/service/filtering/filters_test.py index 3afeddca5..50d021c6f 100644 --- a/test/src/service/filtering/filters_test.py +++ b/test/src/service/filtering/filters_test.py @@ -280,19 +280,19 @@ def test_filterset_arangosearch_w_defaults(): AND doc.load_ver == @load_ver ) AND ( - IN_RANGE(doc.rangefield, @v1_low, @v1_high, true, true) + IN_RANGE(doc["rangefield"], @v1_low, @v1_high, true, true) AND - ANALYZER(STARTS_WITH(doc.prefixfield, v2_prefixes, LENGTH(v2_prefixes)), "text_en") + ANALYZER(STARTS_WITH(doc["prefixfield"], v2_prefixes, LENGTH(v2_prefixes)), "text_en") AND - doc.rangefield2 > @v3_low + doc["rangefield2"] > @v3_low AND - ANALYZER(v4_prefixes ALL == 
doc.fulltextfield, "text_rs") + ANALYZER(v4_prefixes ALL == doc["fulltextfield"], "text_rs") AND - doc.datefield <= @v5_high + doc["datefield"] <= @v5_high AND - NGRAM_MATCH(doc.ngramfield, @v6_input, 1, "ngram_stuff") + NGRAM_MATCH(doc["ngramfield"], @v6_input, 1, "ngram_stuff") AND - doc.strident == @v7_input + doc["strident"] == @v7_input ) LIMIT @skip, @limit RETURN doc @@ -351,19 +351,19 @@ def test_filterset_arangosearch_w_defaults_count(): AND doc.load_ver == @load_ver ) AND ( - IN_RANGE(doc.rangefield, @v1_low, @v1_high, true, true) + IN_RANGE(doc["rangefield"], @v1_low, @v1_high, true, true) AND - ANALYZER(STARTS_WITH(doc.prefixfield, v2_prefixes, LENGTH(v2_prefixes)), "text_en") + ANALYZER(STARTS_WITH(doc["prefixfield"], v2_prefixes, LENGTH(v2_prefixes)), "text_en") AND - doc.rangefield2 > @v3_low + doc["rangefield2"] > @v3_low AND - ANALYZER(v4_prefixes ALL == doc.fulltextfield, "text_rs") + ANALYZER(v4_prefixes ALL == doc["fulltextfield"], "text_rs") AND - doc.datefield <= @v5_high + doc["datefield"] <= @v5_high AND - NGRAM_MATCH(doc.ngramfield, @v6_input, 1, "ngram_stuff") + NGRAM_MATCH(doc["ngramfield"], @v6_input, 1, "ngram_stuff") AND - doc.strident == @v7_input + doc["strident"] == @v7_input ) RETURN doc ) @@ -435,9 +435,9 @@ def test_filterset_arangosearch_w_all_args(): AND d._mtchsel == @internal_selection_id ) AND ( - IN_RANGE(d.rangefield, @v1_low, @v1_high, true, true) + IN_RANGE(d["rangefield"], @v1_low, @v1_high, true, true) OR - ANALYZER(STARTS_WITH(d.prefixfield, v2_prefixes, LENGTH(v2_prefixes)), "text_en") + ANALYZER(STARTS_WITH(d["prefixfield"], v2_prefixes, LENGTH(v2_prefixes)), "text_en") ) SORT d.@sort @sortdir LIMIT @skip, @limit @@ -479,11 +479,11 @@ def test_filterset_arangosearch_w_keep_filter(): AND doc.load_ver == @load_ver AND - doc.@keep0 != null + doc[@keep0] != null AND - doc.@keep1 != null + doc[@keep1] != null ) AND ( - IN_RANGE(doc.rangefield, @v1_low, @v1_high, true, true) + IN_RANGE(doc["rangefield"], @v1_low, @v1_high, true, true) ) LIMIT @skip, @limit RETURN KEEP(doc, @keep) @@ -562,8 +562,8 @@ def test_filterset_aql_w_keep_filter(): FOR doc IN @@collection FILTER doc.coll == @collid FILTER doc.load_ver == @load_ver - FILTER doc.@keep0 != null - FILTER doc.@keep1 != null + FILTER doc[@keep0] != null + FILTER doc[@keep1] != null LIMIT @skip, @limit RETURN KEEP(doc, @keep) """.strip() + "\n" @@ -609,9 +609,9 @@ def test_filterset_arangosearch_w_all_args_count(): AND d._mtchsel == @internal_selection_id ) AND ( - IN_RANGE(d.rangefield, @v1_low, @v1_high, true, true) + IN_RANGE(d["rangefield"], @v1_low, @v1_high, true, true) OR - ANALYZER(STARTS_WITH(d.prefixfield, v2_prefixes, LENGTH(v2_prefixes)), "text_en") + ANALYZER(STARTS_WITH(d["prefixfield"], v2_prefixes, LENGTH(v2_prefixes)), "text_en") ) RETURN d ) @@ -683,7 +683,7 @@ def test_filterset_arangosearch_sort_asc_and_limit_0(): AND doc.load_ver == @load_ver ) AND ( - IN_RANGE(doc.rangefield, @v1_low, @v1_high, true, true) + IN_RANGE(doc["rangefield"], @v1_low, @v1_high, true, true) ) SORT doc.@sort @sortdir LIMIT @skip, @limit @@ -751,7 +751,7 @@ def test_filterset_arangosearch_w_1_filter_and_0_skip_limit(): AND doc.load_ver == @load_ver ) AND ( - IN_RANGE(doc.shoe_size, @v1_low, @v1_high, true, false) + IN_RANGE(doc["shoe_size"], @v1_low, @v1_high, true, false) ) RETURN doc """.strip() + "\n" From d99d844c79a1d08e6906f862421d90442bd16e6d Mon Sep 17 00:00:00 2001 From: Gavin Date: Tue, 26 Mar 2024 13:40:35 -0700 Subject: [PATCH 48/50] bump version --- RELEASE_NOTES.md | 6 ++++++ 
src/common/version.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 8972f1622..b7a73b040 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,11 @@ # KBase Collections Release Notes +## 0.1.2 + +* Fixed a bug that caused requests with filters to fail for filter keys containing colons. +* Adds eggNOG mappper as an experimental tool in the pipeline. It currently does not integrate + into the collections pipeline proper. + ## 0.1.1 * Fixed a bug in the service manager script that would cause it to fail if the service had never diff --git a/src/common/version.py b/src/common/version.py index 970ff83ba..ed49400ab 100644 --- a/src/common/version.py +++ b/src/common/version.py @@ -2,4 +2,4 @@ The version of the KBase collections software. ''' -VERSION = "0.1.1" +VERSION = "0.1.2" From 363fa1af9725fd7de7fab0e372d229874673a0d5 Mon Sep 17 00:00:00 2001 From: Gavin Date: Tue, 26 Mar 2024 13:45:56 -0700 Subject: [PATCH 49/50] typo --- RELEASE_NOTES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index b7a73b040..6f20195e0 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -3,7 +3,7 @@ ## 0.1.2 * Fixed a bug that caused requests with filters to fail for filter keys containing colons. -* Adds eggNOG mappper as an experimental tool in the pipeline. It currently does not integrate +* Adds eggNOG mapper as an experimental tool in the pipeline. It currently does not integrate into the collections pipeline proper. ## 0.1.1 From cdb5dd8480a8f19013a061764d29d9c7d53b8aec Mon Sep 17 00:00:00 2001 From: Gavin Date: Tue, 26 Mar 2024 14:20:04 -0700 Subject: [PATCH 50/50] Update release notes --- RELEASE_NOTES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 6f20195e0..2f49cb1c3 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -5,6 +5,8 @@ * Fixed a bug that caused requests with filters to fail for filter keys containing colons. * Adds eggNOG mapper as an experimental tool in the pipeline. It currently does not integrate into the collections pipeline proper. +* Pipeline tools now have the number of threads to use for tool invocations in the argument list. +* The `--create_assembly_only` CLI flag was removed from the workspace uploader. ## 0.1.1