diff --git a/src/loaders/compute_tools/checkm2/checkm2.py b/src/loaders/compute_tools/checkm2/checkm2.py index 15acfb407..72b2c0ded 100644 --- a/src/loaders/compute_tools/checkm2/checkm2.py +++ b/src/loaders/compute_tools/checkm2/checkm2.py @@ -5,10 +5,13 @@ from pathlib import Path from typing import Dict +from src.common.storage.field_names import FLD_KBASE_ID from src.loaders.compute_tools.tool_common import ( GenomeTuple, ToolRunner, run_command, + write_fatal_tuples_to_dict, + create_fatal_tuple, ) from src.loaders.compute_tools.tool_result_parser import ( process_genome_attri_result, @@ -47,12 +50,23 @@ def _run_checkm2( ) tool_file_name, genome_id_col = 'quality_report.tsv', 'Name' - process_genome_attri_result(output_dir, - SELECTED_CHECKM2_FEATURES, - genome_id_col, - ids_to_files, - [tool_file_name], - ) + genome_attri_docs = process_genome_attri_result(output_dir, + SELECTED_CHECKM2_FEATURES, + genome_id_col, + ids_to_files, + [tool_file_name], + ) + + # create fatal error file if any genome is missing from the result file + fatal_tuples = [] + error_message = f"The genome is absent in the quality_report.tsv file generated by CheckM2." + success_ids = [genome_attri_doc[FLD_KBASE_ID] for genome_attri_doc in genome_attri_docs] + missing_ids = set(ids_to_files.keys()) - set(success_ids) + print(f"Found {len(missing_ids)} genomes missing from the CheckM2 output file.") + for missing_id in missing_ids: + fatal_tuple = create_fatal_tuple(missing_id, ids_to_files, error_message) + fatal_tuples.append(fatal_tuple) + write_fatal_tuples_to_dict(fatal_tuples, output_dir) def main(): diff --git a/src/loaders/compute_tools/checkm2/versions.yaml b/src/loaders/compute_tools/checkm2/versions.yaml index 877126ff9..eabc8b09a 100644 --- a/src/loaders/compute_tools/checkm2/versions.yaml +++ b/src/loaders/compute_tools/checkm2/versions.yaml @@ -14,4 +14,9 @@ versions: date: 2023-10-24 notes: | - update diamond reference DB default path + reference_db_version: 1.0.1 + - version: 0.1.3 + date: 2023-11-08 + notes: | + - create fatal error file for CheckM2 reference_db_version: 1.0.1 \ No newline at end of file diff --git a/src/loaders/compute_tools/tool_result_parser.py b/src/loaders/compute_tools/tool_result_parser.py index 5cb673328..d915a1704 100644 --- a/src/loaders/compute_tools/tool_result_parser.py +++ b/src/loaders/compute_tools/tool_result_parser.py @@ -23,7 +23,7 @@ def process_genome_attri_result( result_files: List[str], check_file_exists: bool = True, prefix: str = '' -) -> None: +) -> List[Dict]: """ process the output files generated by the tool (checkm2, gtdb-tk, etc) to create a format suitable for importing into ArangoDB @@ -57,6 +57,8 @@ def process_genome_attri_result( output = output_dir / TOOL_GENOME_ATTRI_FILE create_jsonl_files(output, genome_attri_docs) + return genome_attri_docs + def create_jsonl_files( file_path: Path,