Commit

Merge pull request #749 from kbase/dev_fix_metadata_field

fix a typo in metadata field

Tianhao-Gu authored Aug 20, 2024
2 parents 0b9bf50 + 8def914 commit 4c4a7f9
Showing 12 changed files with 200 additions and 63 deletions.
3 changes: 0 additions & 3 deletions src/loaders/common/loader_common_names.py
@@ -94,9 +94,6 @@
# The metadata file name created during the tool's execution
TOOL_METADATA = 'tool_metadata.json'

# Tool metadata file required keys
TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']

# The fatal error file created if a data file cannot be successfully processed
FATAL_ERROR_FILE = "fatal_error.json"

35 changes: 22 additions & 13 deletions src/loaders/compute_tools/checkm2/checkm2.py
@@ -45,10 +45,6 @@ def _run_checkm2(
# checkm2 will clear output_dir before it starts, which will delete any log files
log_dir = output_dir.parent / ("checkm2_log_" + output_dir.parts[-1])
run_command(command, log_dir if debug else None)
end_time = time.time()
print(f"Used {round((end_time - start) / 60, 2)} minutes to execute checkM2 predict "
+ f"for {size} genomes"
)

tool_file_name, genome_id_col = 'quality_report.tsv', 'Name'
genome_attri_docs = process_genome_attri_result(output_dir,
@@ -69,15 +65,28 @@ def _run_checkm2(
fatal_tuples.append(fatal_tuple)
write_fatal_tuples_to_dict(fatal_tuples, output_dir)

metadata = {'tool': 'checkm2',
'version': '1.0.1',
'command': command,
"reference_db": {
"version": None,
"comment": "diamond_db, ver unknown",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)
end_time = time.time()
run_time = end_time - start
print(f"Used {round(run_time / 60, 2)} minutes to execute checkM2 predict "
+ f"for {size} genomes"
)

additional_metadata = {
"reference_db": {
"version": None,
"comment": "diamond_db, ver unknown",
},
'ids_to_files': ids_to_files,
}
create_tool_metadata(
output_dir,
tool_name="checkm2",
version="1.0.1",
command=command,
run_time=round(run_time, 2),
batch_size=size,
additional_metadata=additional_metadata,
)
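For context, every call in this PR targets the reworked create_tool_metadata helper in src/loaders/compute_tools/tool_common.py, whose hunk is not expanded on this page. A minimal sketch of a function compatible with that call shape, using the parameter names seen in this diff but with a purely illustrative body (not the repository's actual implementation):

import json
from pathlib import Path


def create_tool_metadata(
        output_dir: Path,
        tool_name: str,
        version,
        command: list,
        run_time: float,
        batch_size: int,
        additional_metadata: dict = None) -> None:
    # Hypothetical sketch only - the real helper may validate fields or format differently.
    metadata = {
        'tool_name': tool_name,
        'version': version,
        'command': command,
        'run_time': run_time,
        'batch_size': batch_size,
    }
    if additional_metadata:
        metadata.update(additional_metadata)
    # TOOL_METADATA resolves to 'tool_metadata.json' (see loader_common_names.py above)
    with open(output_dir / 'tool_metadata.json', 'w') as outfile:
        # default=str guards against non-serializable values such as Path objects
        json.dump(metadata, outfile, indent=2, default=str)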


def main():
3 changes: 3 additions & 0 deletions src/loaders/compute_tools/checkm2/versions.yaml
@@ -36,6 +36,9 @@ versions:
date: 2024-08-16
notes: |
- Create metadata file after running CheckM2
- Fix a typo for 'tool_name' metadata field
- Add method to ensure 'ids_to_files' is JSON serializable
- Include execution time in metadata
reference_db_version: 1.0.1

#Please keep this reminder at the end of this file
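The note "Add method to ensure 'ids_to_files' is JSON serializable" above refers to the ids_to_files mapping passed through additional_metadata; its values are typically pathlib.Path objects, which json.dump rejects by default. A hedged sketch of such a helper (hypothetical name and body, not the repository's actual code):

def _ensure_json_serializable(ids_to_files: dict) -> dict:
    # Coerce data IDs and file paths (often pathlib.Path) to plain strings so the
    # mapping can be embedded in tool_metadata.json without a serialization error.
    return {str(data_id): str(file_path) for data_id, file_path in ids_to_files.items()}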
36 changes: 24 additions & 12 deletions src/loaders/compute_tools/eggnog/eggnog.py
@@ -5,6 +5,7 @@
Therefore, the parser program is not compatible with data generated by this tool.
"""
import time
from pathlib import Path

from src.loaders.common.loader_common_names import TOOL_METADATA
@@ -20,6 +21,8 @@ def _run_eggnog_single(
output_dir: Path,
threads_per_tool_run: int,
debug: bool) -> None:
start = time.time()
print(f'Start executing EggNog for {data_id}')

metadata_file = output_dir / TOOL_METADATA
if metadata_file.exists():
@@ -30,7 +33,7 @@ def _run_eggnog_single(
command = ['emapper.py',
'-i', source_file, # Input file.
'-o', output_dir / source_file.name, # Output prefix.
# Save result file to collectiondata directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files.
# Save result file to collectiondata directory. Expecting 'emapper.annotations', 'emapper.hits' and 'emapper.seed_orthologs' files.
'--itype', f'{INPUT_TYPE}',
'--cpu', f'{threads_per_tool_run}',
'--excel',
@@ -41,18 +44,27 @@

run_command(command, output_dir if debug else None)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute EggNog for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
'input_type': INPUT_TYPE,
'data_id': data_id,
'tool_name': 'eggnog',
'version': '2.1.12',
'command': command,
"reference_db": {
"version": "5.0.2",
},
}
create_tool_metadata(output_dir, metadata)
additional_metadata = {
'source_file': str(source_file),
'data_id': data_id,
"reference_db": {
"version": "5.0.2",
},
}
create_tool_metadata(
output_dir,
tool_name="eggnog",
version="2.1.12",
command=command,
run_time=round(run_time, 2),
batch_size=1,
additional_metadata=additional_metadata)


def main():
1 change: 1 addition & 0 deletions src/loaders/compute_tools/eggnog/versions.yaml
@@ -27,6 +27,7 @@ versions:
date: 2024-08-16
notes: |
- Create metadata file after running Eggnog
- Include execution time in metadata
reference_db_version: 5.0.2

#Please keep this reminder at the end of this file
33 changes: 20 additions & 13 deletions src/loaders/compute_tools/gtdb_tk/gtdb_tk.py
@@ -97,11 +97,6 @@ def _run_gtdb_tk(
print(f'running {" ".join(command)}')
run_command(command, output_dir / "classify_wf_log" if debug else None)

end_time = time.time()
print(
f'Used {round((end_time - start) / 60, 2)} minutes to execute gtdbtk classify_wf for '
f'{len(ids_to_files)} genomes')

summary_files = find_gtdbtk_summary_files(output_dir)
if not summary_files:
raise ValueError(f"No summary files exist for gtdb-tk in the specified "
@@ -151,14 +146,26 @@ def _run_gtdb_tk(
summary_files,
)

metadata = {'tool': 'gtdb_tk',
'version': '2.3.2',
'command': command,
"reference_db": {
"version": "release214",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)
end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute gtdbtk classify_wf for '
f'{size} genomes')

additional_metadata = {
"reference_db": {
"version": "release214",
},
'ids_to_files': ids_to_files,
}
create_tool_metadata(
output_dir,
tool_name="gtdb_tk",
version="2.3.2",
command=command,
run_time=round(run_time, 2),
batch_size=size,
additional_metadata=additional_metadata, )


def main():
3 changes: 3 additions & 0 deletions src/loaders/compute_tools/gtdb_tk/versions.yaml
@@ -45,6 +45,9 @@ versions:
date: 2024-08-16
notes: |
- Create metadata file after running GTDB-Tk
- Fix a typo for 'tool_name' metadata field
- Add method to ensure 'ids_to_files' is JSON serializable
- Include execution time in metadata
reference_db_version: release214

#Please keep this reminder at the end of this file
43 changes: 32 additions & 11 deletions src/loaders/compute_tools/mash/mash.py
@@ -1,8 +1,10 @@
"""
Run Mash on a set of assemblies.
"""
import time
from pathlib import Path

from src.loaders.common.loader_common_names import TOOL_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata

KMER_SIZE = 19
@@ -18,28 +20,47 @@ def _run_mash_single(
debug: bool,
kmer_size: int = KMER_SIZE,
sketch_size: int = SKETCH_SIZE) -> None:
start = time.time()
print(f'Start executing Mash for {data_id}')

metadata_file = output_dir / TOOL_METADATA
if metadata_file.exists():
print(f"Skipping {source_file} as it has already been processed.")
return

# RUN mash sketch for a single genome
command = ['mash', 'sketch',
'-o', source_file, # Output prefix.
# Save result file to source file directory. The suffix '.msh' will be appended.
# Save result file to source file directory. The suffix '.msh' will be appended.
'-k', f'{kmer_size}',
'-s', f'{sketch_size}',
'-p', f'{threads_per_tool_run}',
source_file]

run_command(command, output_dir if debug else None)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute Mash for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
# Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch)
'sketch_file': str(source_file) + '.msh',
'kmer_size': kmer_size,
'sketch_size': sketch_size,
'data_id': data_id,
'tool_name': 'mash',
'version': '2.0',
'command': command}
create_tool_metadata(output_dir, metadata)
additional_metadata = {
'source_file': str(source_file),
# Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch)
'sketch_file': str(source_file) + '.msh',
'kmer_size': kmer_size,
'sketch_size': sketch_size,
'data_id': data_id,
}
create_tool_metadata(
output_dir,
tool_name="mash",
version="2.0",
command=command,
run_time=round(run_time, 2),
batch_size=1,
additional_metadata=additional_metadata)


def main():
3 changes: 2 additions & 1 deletion src/loaders/compute_tools/mash/versions.yaml
@@ -20,4 +20,5 @@ versions:
- version: 0.1.5
date: 2024-08-16
notes: |
- Create metadata file after running Mash
- Create metadata file after running Mash
- Include execution time in metadata
35 changes: 34 additions & 1 deletion src/loaders/compute_tools/microtrait/microtrait.py
@@ -2,6 +2,7 @@
Runs microtrait on a set of assemblies.
"""
import os
import time
import uuid
from pathlib import Path
from typing import Any
@@ -21,13 +22,14 @@
FIELD_HEATMAP_CATEGORY,
FIELD_HEATMAP_CELL_DETAIL_ENTRY_ID,
FIELD_HEATMAP_CELL_DETAIL_ENTRY_VALUE,
ColumnType,)
ColumnType, )
from src.common.storage.field_names import FLD_KBASE_ID
from src.loaders.common import loader_common_names
from src.loaders.compute_tools.tool_common import (
FatalTuple,
ToolRunner,
write_fatal_tuples_to_dict,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
create_jsonl_files,
@@ -208,6 +210,14 @@ def _run_microtrait(
# since extract_traits function doesn't take the number of threads as an argument
# https://github.com/ukaraoz/microtrait/blob/master/R/extract_traits.R#L22-L26

start = time.time()
print(f'Start executing Microtrait for {data_id}')

metadata_file = genome_dir / loader_common_names.TOOL_METADATA
if metadata_file.exists():
print(f"Skipping {fna_file} as it has already been processed.")
return

# Load the R script as an R function
r_script = """
library(microtrait)
@@ -262,6 +272,29 @@
create_jsonl_files(genome_dir / MICROTRAIT_CELLS, cells_meta)
create_jsonl_files(genome_dir / MICROTRAIT_DATA, heatmap_row)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute Microtrait for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
additional_metadata = {
'source_file': str(fna_file),
'data_id': data_id,
}
create_tool_metadata(
genome_dir,
tool_name="microtrait",
version={
'git_url': 'https://github.com/ukaraoz/microtrait',
'release_tag': 'kb',
},
command=["None - R script"],
run_time=round(run_time, 2),
batch_size=1,
additional_metadata=additional_metadata,
)
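The "Save run info to a metadata file ... for parsing later" comment implies a downstream consumer reads tool_metadata.json back from the genome directory; an illustrative loader (assumed, not part of this diff) could be as simple as:

import json
from pathlib import Path


def load_tool_metadata(genome_dir: Path) -> dict:
    # TOOL_METADATA resolves to 'tool_metadata.json' in loader_common_names.py
    with open(genome_dir / 'tool_metadata.json') as infile:
        return json.load(infile)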


def main():
runner = ToolRunner("microtrait")
7 changes: 6 additions & 1 deletion src/loaders/compute_tools/microtrait/versions.yaml
@@ -22,4 +22,9 @@ versions:
- version: 0.1.5
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- version: 0.1.6
date: 2024-08-16
notes: |
- Create metadata file after running Microtrait
- Include execution time in metadata
