Skip to content

Commit

Permalink
Merge pull request #747 from kbase/dev_add_tool_metadata
Browse files Browse the repository at this point in the history
add tool metadata file
  • Loading branch information
Tianhao-Gu authored Aug 17, 2024
2 parents 02f0fbe + 8ceb7e7 commit bf3733f
Show file tree
Hide file tree
Showing 15 changed files with 107 additions and 25 deletions.
8 changes: 4 additions & 4 deletions src/loaders/common/loader_common_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,11 @@
OBJECTS_NAME_ASSEMBLY = "KBaseGenomeAnnotations.Assembly"
OBJECTS_NAME_GENOME = "KBaseGenomes.Genome"

# The metadata file name created during the Mash run
MASH_METADATA = 'mash_run_metadata.json'
# The metadata file name created during the tool's execution
TOOL_METADATA = 'tool_metadata.json'

# The metadata file name created during the Eggnog run
EGGNOG_METADATA = 'eggnog_run_metadata.json'
# Tool metadata file required keys
TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']

# The fatal error file created if a data file cannot be successfully processed
FATAL_ERROR_FILE = "fatal_error.json"
Expand Down
1 change: 1 addition & 0 deletions src/loaders/compute_tools/checkm2/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:22.11.1

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_checkm2 method is updated
ARG CHECKM2_VER=1.0.1
ENV CONDA_ENV checkm2-$CHECKM2_VER

Expand Down
11 changes: 11 additions & 0 deletions src/loaders/compute_tools/checkm2/checkm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
run_command,
write_fatal_tuples_to_dict,
create_fatal_tuple,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
process_genome_attri_result,
Expand Down Expand Up @@ -68,6 +69,16 @@ def _run_checkm2(
fatal_tuples.append(fatal_tuple)
write_fatal_tuples_to_dict(fatal_tuples, output_dir)

metadata = {'tool': 'checkm2',
'version': '1.0.1',
'command': command,
"reference_db": {
"version": None,
"comment": "diamond_db, ver unknown",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)


def main():
runner = ToolRunner("checkm2", tool_data_id_from_filename=True)
Expand Down
10 changes: 9 additions & 1 deletion src/loaders/compute_tools/checkm2/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,12 @@ versions:
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: 1.0.1
reference_db_version: 1.0.1
- version: 0.1.6
date: 2024-08-16
notes: |
- Create metadata file after running CheckM2
reference_db_version: 1.0.1

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_checkm2 method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/eggnog/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:24.1.2-0

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_eggnog_single method is updated
ENV EGGNOG_VER 2.1.12
ENV CONDA_ENV eggnog-$EGGNOG_VER
ENV PYTHON_VER 3.11
Expand Down
20 changes: 13 additions & 7 deletions src/loaders/compute_tools/eggnog/eggnog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
Therefore, the parser program is not compatible with data generated by this tool.
"""
import json
from pathlib import Path

from src.loaders.common.loader_common_names import EGGNOG_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command
from src.loaders.common.loader_common_names import TOOL_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata

INPUT_TYPE = 'proteins'

Expand All @@ -22,7 +21,7 @@ def _run_eggnog_single(
threads_per_tool_run: int,
debug: bool) -> None:

metadata_file = output_dir / EGGNOG_METADATA
metadata_file = output_dir / TOOL_METADATA
if metadata_file.exists():
print(f"Skipping {source_file} as it has already been processed.")
return
Expand All @@ -44,9 +43,16 @@ def _run_eggnog_single(

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
'input_type': INPUT_TYPE}
with open(metadata_file, 'w') as f:
json.dump(metadata, f, indent=4)
'input_type': INPUT_TYPE,
'data_id': data_id,
'tool_name': 'eggnog',
'version': '2.1.12',
'command': command,
"reference_db": {
"version": "5.0.2",
},
}
create_tool_metadata(output_dir, metadata)


def main():
Expand Down
11 changes: 10 additions & 1 deletion src/loaders/compute_tools/eggnog/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,13 @@ versions:
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: 5.0.2
reference_db_version: 5.0.2

- version: 0.1.4
date: 2024-08-16
notes: |
- Create metadata file after running Eggnog
reference_db_version: 5.0.2

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_eggnog_single method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/gtdb_tk/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:22.11.1

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_gtdb_tk method is updated
ENV GTDB_VER 2.3.2
ENV CONDA_ENV gtdbtk-$GTDB_VER
ENV PYTHON_VER 3.8
Expand Down
10 changes: 10 additions & 0 deletions src/loaders/compute_tools/gtdb_tk/gtdb_tk.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
find_gtdbtk_summary_files,
run_command,
write_fatal_tuples_to_dict,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
process_genome_attri_result,
Expand Down Expand Up @@ -150,6 +151,15 @@ def _run_gtdb_tk(
summary_files,
)

metadata = {'tool': 'gtdb_tk',
'version': '2.3.2',
'command': command,
"reference_db": {
"version": "release214",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)


def main():
runner = ToolRunner("gtdb_tk", suffix_ids=True)
Expand Down
10 changes: 9 additions & 1 deletion src/loaders/compute_tools/gtdb_tk/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,12 @@ versions:
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: release214
reference_db_version: release214
- version: 0.1.7
date: 2024-08-16
notes: |
- Create metadata file after running GTDB-Tk
reference_db_version: release214

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_gtdb_tk method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/mash/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ RUN apt-get update && \

# Set the Mash version argument
# using version 2.0 to maintain compatibility with the homology service
# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_mash_single method is updated
ARG MASH_VER=2.0
ENV CONDA_ENV mash-$MASH_VER

Expand Down
14 changes: 7 additions & 7 deletions src/loaders/compute_tools/mash/mash.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
"""
Run Mash on a set of assemblies.
"""
import json
from pathlib import Path

from src.loaders.common.loader_common_names import MASH_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata

KMER_SIZE = 19
SKETCH_SIZE = 10000
Expand All @@ -32,14 +30,16 @@ def _run_mash_single(
run_command(command, output_dir if debug else None)

# Save run info to a metadata file in the output directory for parsing later
metadata_file = output_dir / MASH_METADATA
metadata = {'source_file': str(source_file),
# Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch)
'sketch_file': str(source_file) + '.msh',
'kmer_size': kmer_size,
'sketch_size': sketch_size}
with open(metadata_file, 'w') as f:
json.dump(metadata, f, indent=4)
'sketch_size': sketch_size,
'data_id': data_id,
'tool_name': 'mash',
'version': '2.0',
'command': command}
create_tool_metadata(output_dir, metadata)


def main():
Expand Down
6 changes: 5 additions & 1 deletion src/loaders/compute_tools/mash/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,8 @@ versions:
- version: 0.1.4
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- version: 0.1.5
date: 2024-08-16
notes: |
- Create metadata file after running Mash
26 changes: 24 additions & 2 deletions src/loaders/compute_tools/tool_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import pandas as pd

from src.loaders.common import loader_common_names
from src.loaders.common.loader_common_names import TOOL_METADATA

# TODO CODE add a common module for saving and loading the metadata shared between the compute
# and parser
Expand Down Expand Up @@ -211,7 +212,8 @@ def _get_data_ids(self):
with self._data_id_file:
df = pd.read_csv(self._data_id_file, sep='\t')
try:
data_ids = df[loader_common_names.DATA_ID_COLUMN_HEADER].astype(str).tolist() # convert to string in case of int directory names
data_ids = df[loader_common_names.DATA_ID_COLUMN_HEADER].astype(
str).tolist() # convert to string in case of int directory names
except KeyError:
raise ValueError(
f"Please ensure {loader_common_names.DATA_ID_COLUMN_HEADER} column exists in the "
Expand Down Expand Up @@ -344,7 +346,8 @@ def parallel_batch_execution(self, tool_callable: Callable[[Dict[str, GenomeTupl
def _execute(
self,
tool_callable: Callable[..., None],
args: Union[List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], List[Tuple[str, str, Path, Path, int, bool]]],
args: Union[
List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], List[Tuple[str, str, Path, Path, int, bool]]],
start: datetime.datetime,
total: bool,
):
Expand Down Expand Up @@ -595,6 +598,25 @@ def create_fatal_tuple(
return fatal_tuple


def create_tool_metadata(output_dir: Path, metadata: Dict[str, str]):
    """
    Save the metadata as a JSON file (named TOOL_METADATA) in the specified output directory.

    Args:
        output_dir (Path): The directory where the metadata file will be saved.
        metadata (Dict[str, str]): A dictionary containing metadata key-value pairs.
            Must include every key listed in TOOL_METADATA_REQUIRED_KEYS.
            (Values may in practice be nested structures; JSON serialization handles them.)

    Raises:
        ValueError: If any required metadata key is missing.
    """
    # Single pass over the required keys: collect what's missing and branch on it,
    # instead of an all() check followed by a second comprehension over the same keys.
    missing_keys = [
        key for key in loader_common_names.TOOL_METADATA_REQUIRED_KEYS
        if key not in metadata
    ]
    if missing_keys:
        raise ValueError(f"Missing required keys in metadata: {missing_keys}")

    metadata_file = output_dir / TOOL_METADATA
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=4)


if __name__ == "__main__":
    # mostly just here to allow easily getting the help info with --help:
    # constructing a ToolRunner triggers its argument parser, so running this
    # module directly prints the shared CLI usage for compute tools.
    ToolRunner("fake_tool")
2 changes: 1 addition & 1 deletion src/loaders/genome_collection/parse_tool_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def _process_mash_tool(root_dir: str,
if data_id in fatal_ids:
continue
data_dir = Path(result_dir, batch_dir, data_id)
with open(data_dir / loader_common_names.MASH_METADATA, 'r') as file:
with open(data_dir / loader_common_names.TOOL_METADATA, 'r') as file:
metadata = json.load(file)

sketch_file = metadata['sketch_file']
Expand Down

0 comments on commit bf3733f

Please sign in to comment.