Commit

Merge pull request #749 from kbase/dev_fix_metadata_field

fix a typo in metadata field

Tianhao-Gu authored Aug 20, 2024
2 parents 0b9bf50 + 8def914 commit 4c4a7f9
Showing 12 changed files with 200 additions and 63 deletions.
3 changes: 0 additions & 3 deletions src/loaders/common/loader_common_names.py
@@ -94,9 +94,6 @@
# The metadata file name created during the tool's execution
TOOL_METADATA = 'tool_metadata.json'

# Tool metadata file required keys
TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']

# The fatal error file created if a data file cannot be successfully processed
FATAL_ERROR_FILE = "fatal_error.json"

35 changes: 22 additions & 13 deletions src/loaders/compute_tools/checkm2/checkm2.py
@@ -45,10 +45,6 @@ def _run_checkm2(
# checkm2 will clear output_dir before it starts, which will delete any log files
log_dir = output_dir.parent / ("checkm2_log_" + output_dir.parts[-1])
run_command(command, log_dir if debug else None)
end_time = time.time()
print(f"Used {round((end_time - start) / 60, 2)} minutes to execute checkM2 predict "
+ f"for {size} genomes"
)

tool_file_name, genome_id_col = 'quality_report.tsv', 'Name'
genome_attri_docs = process_genome_attri_result(output_dir,
@@ -69,15 +65,28 @@ def _run_checkm2(
fatal_tuples.append(fatal_tuple)
write_fatal_tuples_to_dict(fatal_tuples, output_dir)

metadata = {'tool': 'checkm2',
'version': '1.0.1',
'command': command,
"reference_db": {
"version": None,
"comment": "diamond_db, ver unknown",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)
end_time = time.time()
run_time = end_time - start
print(f"Used {round(run_time / 60, 2)} minutes to execute checkM2 predict "
+ f"for {size} genomes"
)

additional_metadata = {
"reference_db": {
"version": None,
"comment": "diamond_db, ver unknown",
},
'ids_to_files': ids_to_files,
}
create_tool_metadata(
output_dir,
tool_name="checkm2",
version="1.0.1",
command=command,
run_time=round(run_time, 2),
batch_size=size,
additional_metadata=additional_metadata,
)
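For context, every call in this PR targets the reworked create_tool_metadata helper in src/loaders/compute_tools/tool_common.py, whose hunk is not expanded on this page. A minimal sketch of a function compatible with that call shape, using the parameter names seen in this diff but with a purely illustrative body (not the repository's actual implementation):

import json
from pathlib import Path


def create_tool_metadata(
        output_dir: Path,
        tool_name: str,
        version,
        command: list,
        run_time: float,
        batch_size: int,
        additional_metadata: dict = None) -> None:
    # Hypothetical sketch only - the real helper may validate fields or format differently.
    metadata = {
        'tool_name': tool_name,
        'version': version,
        'command': command,
        'run_time': run_time,
        'batch_size': batch_size,
    }
    if additional_metadata:
        metadata.update(additional_metadata)
    # TOOL_METADATA resolves to 'tool_metadata.json' (see loader_common_names.py above)
    with open(output_dir / 'tool_metadata.json', 'w') as outfile:
        # default=str guards against non-serializable values such as Path objects
        json.dump(metadata, outfile, indent=2, default=str)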


def main():
3 changes: 3 additions & 0 deletions src/loaders/compute_tools/checkm2/versions.yaml
@@ -36,6 +36,9 @@ versions:
date: 2024-08-16
notes: |
- Create metadata file after running CheckM2
- Fix a typo for 'tool_name' metadata field
- Add method to ensure 'ids_to_files' is JSON serializable
- Include execution time in metadata
reference_db_version: 1.0.1

#Please keep this reminder at the end of this file
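The note "Add method to ensure 'ids_to_files' is JSON serializable" above refers to the ids_to_files mapping passed through additional_metadata; its values are typically pathlib.Path objects, which json.dump rejects by default. A hedged sketch of such a helper (hypothetical name and body, not the repository's actual code):

def _ensure_json_serializable(ids_to_files: dict) -> dict:
    # Coerce data IDs and file paths (often pathlib.Path) to plain strings so the
    # mapping can be embedded in tool_metadata.json without a serialization error.
    return {str(data_id): str(file_path) for data_id, file_path in ids_to_files.items()}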
36 changes: 24 additions & 12 deletions src/loaders/compute_tools/eggnog/eggnog.py
@@ -5,6 +5,7 @@
Therefore, the parser program is not compatible with data generated by this tool.
"""
import time
from pathlib import Path

from src.loaders.common.loader_common_names import TOOL_METADATA
@@ -20,6 +21,8 @@ def _run_eggnog_single(
output_dir: Path,
threads_per_tool_run: int,
debug: bool) -> None:
start = time.time()
print(f'Start executing EggNog for {data_id}')

metadata_file = output_dir / TOOL_METADATA
if metadata_file.exists():
@@ -30,7 +33,7 @@ def _run_eggnog_single(
command = ['emapper.py',
'-i', source_file, # Input file.
'-o', output_dir / source_file.name, # Output prefix.
# Save result file to collectiondata directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files.
# Save result file to collectiondata directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files.
'--itype', f'{INPUT_TYPE}',
'--cpu', f'{threads_per_tool_run}',
'--excel',
@@ -41,18 +44,27 @@

run_command(command, output_dir if debug else None)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute EggNog for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
'input_type': INPUT_TYPE,
'data_id': data_id,
'tool_name': 'eggnog',
'version': '2.1.12',
'command': command,
"reference_db": {
"version": "5.0.2",
},
}
create_tool_metadata(output_dir, metadata)
additional_metadata = {
'source_file': str(source_file),
'data_id': data_id,
"reference_db": {
"version": "5.0.2",
},
}
create_tool_metadata(
output_dir,
tool_name="eggnog",
version="2.1.12",
command=command,
run_time=round(run_time, 2),
batch_size=1,
additional_metadata=additional_metadata)


def main():
1 change: 1 addition & 0 deletions src/loaders/compute_tools/eggnog/versions.yaml
@@ -27,6 +27,7 @@ versions:
date: 2024-08-16
notes: |
- Create metadata file after running Eggnog
- Include execution time in metadata
reference_db_version: 5.0.2

#Please keep this reminder at the end of this file
33 changes: 20 additions & 13 deletions src/loaders/compute_tools/gtdb_tk/gtdb_tk.py
@@ -97,11 +97,6 @@ def _run_gtdb_tk(
print(f'running {" ".join(command)}')
run_command(command, output_dir / "classify_wf_log" if debug else None)

end_time = time.time()
print(
f'Used {round((end_time - start) / 60, 2)} minutes to execute gtdbtk classify_wf for '
f'{len(ids_to_files)} genomes')

summary_files = find_gtdbtk_summary_files(output_dir)
if not summary_files:
raise ValueError(f"No summary files exist for gtdb-tk in the specified "
@@ -151,14 +146,26 @@ def _run_gtdb_tk(
summary_files,
)

metadata = {'tool': 'gtdb_tk',
'version': '2.3.2',
'command': command,
"reference_db": {
"version": "release214",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)
end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute gtdbtk classify_wf for '
f'{size} genomes')

additional_metadata = {
"reference_db": {
"version": "release214",
},
'ids_to_files': ids_to_files,
}
create_tool_metadata(
output_dir,
tool_name="gtdb_tk",
version="2.3.2",
command=command,
run_time=round(run_time, 2),
batch_size=size,
additional_metadata=additional_metadata, )


def main():
3 changes: 3 additions & 0 deletions src/loaders/compute_tools/gtdb_tk/versions.yaml
@@ -45,6 +45,9 @@ versions:
date: 2024-08-16
notes: |
- Create metadata file after running GTDB-Tk
- Fix a typo for 'tool_name' metadata field
- Add method to ensure 'ids_to_files' is JSON serializable
- Include execution time in metadata
reference_db_version: release214

#Please keep this reminder at the end of this file
43 changes: 32 additions & 11 deletions src/loaders/compute_tools/mash/mash.py
@@ -1,8 +1,10 @@
"""
Run Mash on a set of assemblies.
"""
import time
from pathlib import Path

from src.loaders.common.loader_common_names import TOOL_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata

KMER_SIZE = 19
@@ -18,28 +20,47 @@ def _run_mash_single(
debug: bool,
kmer_size: int = KMER_SIZE,
sketch_size: int = SKETCH_SIZE) -> None:
start = time.time()
print(f'Start executing Mash for {data_id}')

metadata_file = output_dir / TOOL_METADATA
if metadata_file.exists():
print(f"Skipping {source_file} as it has already been processed.")
return

# RUN mash sketch for a single genome
command = ['mash', 'sketch',
'-o', source_file, # Output prefix.
# Save result file to source file directory. The suffix '.msh' will be appended.
# Save result file to source file directory. The suffix '.msh' will be appended.
'-k', f'{kmer_size}',
'-s', f'{sketch_size}',
'-p', f'{threads_per_tool_run}',
source_file]

run_command(command, output_dir if debug else None)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute Mash for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
# Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch)
'sketch_file': str(source_file) + '.msh',
'kmer_size': kmer_size,
'sketch_size': sketch_size,
'data_id': data_id,
'tool_name': 'mash',
'version': '2.0',
'command': command}
create_tool_metadata(output_dir, metadata)
additional_metadata = {
'source_file': str(source_file),
# Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch)
'sketch_file': str(source_file) + '.msh',
'kmer_size': kmer_size,
'sketch_size': sketch_size,
'data_id': data_id,
}
create_tool_metadata(
output_dir,
tool_name="mash",
version="2.0",
command=command,
run_time=round(run_time, 2),
batch_size=1,
additional_metadata=additional_metadata)


def main():
3 changes: 2 additions & 1 deletion src/loaders/compute_tools/mash/versions.yaml
@@ -20,4 +20,5 @@ versions:
- version: 0.1.5
date: 2024-08-16
notes: |
- Create metadata file after running Mash
- Create metadata file after running Mash
- Include execution time in metadata
35 changes: 34 additions & 1 deletion src/loaders/compute_tools/microtrait/microtrait.py
@@ -2,6 +2,7 @@
Runs microtrait on a set of assemblies.
"""
import os
import time
import uuid
from pathlib import Path
from typing import Any
@@ -21,13 +22,14 @@
FIELD_HEATMAP_CATEGORY,
FIELD_HEATMAP_CELL_DETAIL_ENTRY_ID,
FIELD_HEATMAP_CELL_DETAIL_ENTRY_VALUE,
ColumnType,)
ColumnType, )
from src.common.storage.field_names import FLD_KBASE_ID
from src.loaders.common import loader_common_names
from src.loaders.compute_tools.tool_common import (
FatalTuple,
ToolRunner,
write_fatal_tuples_to_dict,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
create_jsonl_files,
@@ -208,6 +210,14 @@ def _run_microtrait(
# since extract_traits function doesn't take the number of threads as an argument
# https://github.com/ukaraoz/microtrait/blob/master/R/extract_traits.R#L22-L26

start = time.time()
print(f'Start executing Microtrait for {data_id}')

metadata_file = genome_dir / loader_common_names.TOOL_METADATA
if metadata_file.exists():
print(f"Skipping {fna_file} as it has already been processed.")
return

# Load the R script as an R function
r_script = """
library(microtrait)
@@ -262,6 +272,29 @@
create_jsonl_files(genome_dir / MICROTRAIT_CELLS, cells_meta)
create_jsonl_files(genome_dir / MICROTRAIT_DATA, heatmap_row)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute Microtrait for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
additional_metadata = {
'source_file': str(fna_file),
'data_id': data_id,
}
create_tool_metadata(
genome_dir,
tool_name="microtrait",
version={
'git_url': 'https://github.com/ukaraoz/microtrait',
'release_tag': 'kb',
},
command=["None - R script"],
run_time=round(run_time, 2),
batch_size=1,
additional_metadata=additional_metadata,
)
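The "Save run info to a metadata file ... for parsing later" comment implies a downstream consumer reads tool_metadata.json back from the genome directory; an illustrative loader (assumed, not part of this diff) could be as simple as:

import json
from pathlib import Path


def load_tool_metadata(genome_dir: Path) -> dict:
    # TOOL_METADATA resolves to 'tool_metadata.json' in loader_common_names.py
    with open(genome_dir / 'tool_metadata.json') as infile:
        return json.load(infile)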


def main():
runner = ToolRunner("microtrait")
7 changes: 6 additions & 1 deletion src/loaders/compute_tools/microtrait/versions.yaml
@@ -22,4 +22,9 @@ versions:
- version: 0.1.5
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- version: 0.1.6
date: 2024-08-16
notes: |
- Create metadata file after running Microtrait
- Include execution time in metadata
