Skip to content

Commit

Permalink
Merge pull request #537 from kbase/dev_checkm2_fatal_error_file
Browse files (browse the repository at this point in the history)
RE2022-261: add fatal error file for checkm2
  • Loading branch information
Tianhao-Gu authored Nov 9, 2023
2 parents 274e5c5 + 1be4c9c commit c9ffcf3
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 7 deletions.
26 changes: 20 additions & 6 deletions src/loaders/compute_tools/checkm2/checkm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
from pathlib import Path
from typing import Dict

from src.common.storage.field_names import FLD_KBASE_ID
from src.loaders.compute_tools.tool_common import (
GenomeTuple,
ToolRunner,
run_command,
write_fatal_tuples_to_dict,
create_fatal_tuple,
)
from src.loaders.compute_tools.tool_result_parser import (
process_genome_attri_result,
Expand Down Expand Up @@ -47,12 +50,23 @@ def _run_checkm2(
)

tool_file_name, genome_id_col = 'quality_report.tsv', 'Name'
process_genome_attri_result(output_dir,
SELECTED_CHECKM2_FEATURES,
genome_id_col,
ids_to_files,
[tool_file_name],
)
genome_attri_docs = process_genome_attri_result(output_dir,
SELECTED_CHECKM2_FEATURES,
genome_id_col,
ids_to_files,
[tool_file_name],
)

# create fatal error file if any genome is missing from the result file
fatal_tuples = []
error_message = f"The genome is absent in the quality_report.tsv file generated by CheckM2."
success_ids = [genome_attri_doc[FLD_KBASE_ID] for genome_attri_doc in genome_attri_docs]
missing_ids = set(ids_to_files.keys()) - set(success_ids)
print(f"Found {len(missing_ids)} genomes missing from the CheckM2 output file.")
for missing_id in missing_ids:
fatal_tuple = create_fatal_tuple(missing_id, ids_to_files, error_message)
fatal_tuples.append(fatal_tuple)
write_fatal_tuples_to_dict(fatal_tuples, output_dir)


def main():
Expand Down
5 changes: 5 additions & 0 deletions src/loaders/compute_tools/checkm2/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,9 @@ versions:
date: 2023-10-24
notes: |
- update diamond reference DB default path
reference_db_version: 1.0.1
- version: 0.1.3
date: 2023-11-08
notes: |
- create fatal error file for CheckM2
reference_db_version: 1.0.1
4 changes: 3 additions & 1 deletion src/loaders/compute_tools/tool_result_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def process_genome_attri_result(
result_files: List[str],
check_file_exists: bool = True,
prefix: str = ''
) -> None:
) -> List[Dict]:
"""
Process the output files generated by the tool (checkm2, gtdb-tk, etc.) to create a format suitable for
importing into ArangoDB.
Expand Down Expand Up @@ -57,6 +57,8 @@ def process_genome_attri_result(
output = output_dir / TOOL_GENOME_ATTRI_FILE
create_jsonl_files(output, genome_attri_docs)

return genome_attri_docs


def create_jsonl_files(
file_path: Path,
Expand Down

0 comments on commit c9ffcf3

Please sign in to comment.