From 68d20914082d6ddf33f6e1d874c045196f99fab4 Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Thu, 4 Apr 2024 17:13:23 -0400 Subject: [PATCH 1/5] update BUSCO to use v5.7.1 docker image by default; added cpu flag; updated parsing code to account for adjustments to BUSCO final output summary txt file; added docker as String output. tested successfully w miniwdl --- .../advanced_metrics/task_busco.wdl | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tasks/quality_control/advanced_metrics/task_busco.wdl b/tasks/quality_control/advanced_metrics/task_busco.wdl index 23b60edde..0e571bc51 100644 --- a/tasks/quality_control/advanced_metrics/task_busco.wdl +++ b/tasks/quality_control/advanced_metrics/task_busco.wdl @@ -7,7 +7,7 @@ task busco { input { File assembly String samplename - String docker = "us-docker.pkg.dev/general-theiagen/ezlabgva/busco:v5.3.2_cv1" + String docker = "us-docker.pkg.dev/general-theiagen/ezlabgva/busco:v5.7.1_cv1" Int memory = 8 Int cpu = 2 Int disk_size = 100 @@ -25,6 +25,7 @@ task busco { # --auto-lineage-prok looks at only prokaryotic organisms; default busco \ -i ~{assembly} \ + -c ~{cpu} \ -m geno \ -o ~{samplename} \ ~{true='--auto-lineage-euk' false='--auto-lineage-prok' eukaryote} @@ -33,12 +34,17 @@ task busco { if [ -f ~{samplename}/short_summary.specific.*.~{samplename}.txt ]; then # grab the database version and format it according to BUSCO recommendations - cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "dataset is:" | cut -d' ' -f 6,9 | sed 's/,//' | sed 's/ / (/' | sed 's/$/)/' | tee DATABASE + # pull line out of final specific summary file + # cut out the database name and date it was created + # sed is to remove extra comma and to add parentheses around the date and remove all tabs + # finally write to a file called DATABASE + cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "dataset is:" | cut -d' ' -f 6,9 | sed 's/,//; s/ / (/; s/$/)/; s|[\t]||g' | tee DATABASE - # extract the results string - cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "C:" | tee BUSCO_RESULTS + # extract the results string; strip off all tab and space characters; write to a file called BUSCO_RESULTS + cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "C:" | sed 's|[\t]||g; s| ||g' | tee BUSCO_RESULTS - cp ~{samplename}/short_summary.specific.*.~{samplename}.txt ~{samplename}_busco-summary.txt + # rename final output file to predictable name + cp -v ~{samplename}/short_summary.specific.*.~{samplename}.txt ~{samplename}_busco-summary.txt else echo "BUSCO FAILED" | tee BUSCO_RESULTS echo "NA" > DATABASE @@ -46,6 +52,7 @@ task busco { >>> output { String busco_version = read_string("VERSION") + String busco_docker = docker String busco_database = read_string("DATABASE") String busco_results = read_string("BUSCO_RESULTS") File? busco_report = "~{samplename}_busco-summary.txt" From 4cca8f66d71fd84b01d8a2a04cbe58f588c0b3f9 Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Thu, 4 Apr 2024 17:38:26 -0400 Subject: [PATCH 2/5] update CI for task related changes --- .../workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml | 6 +++--- .../workflows/theiaprok/test_wf_theiaprok_illumina_se.yml | 8 +++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index 28d09073a..5088d1cd1 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -56,7 +56,7 @@ - path: miniwdl_run/call-amrfinderplus_task/work/test_amrfinder_virulence.tsv md5sum: a5cc7b0baa9e11c9a31540800d0a740c - path: miniwdl_run/call-busco/command - md5sum: 81ed6d55f143b759289e758bdde4091c + md5sum: 8a4de431119daee0dfc4f2605413b095 - path: miniwdl_run/call-busco/inputs.json - path: miniwdl_run/call-busco/outputs.json - path: miniwdl_run/call-busco/stderr.txt @@ -67,7 +67,7 @@ - path: miniwdl_run/call-busco/work/BUSCO_RESULTS - path: miniwdl_run/call-busco/work/DATABASE - path: miniwdl_run/call-busco/work/VERSION - md5sum: 369fefb1c40f16c2a9c34d4903e9f1e2 + md5sum: 3cfdda0096f0689c9829ed27bdef6b1a - path: miniwdl_run/call-busco/work/_miniwdl_inputs/0/test_contigs.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e - path: miniwdl_run/call-busco/work/busco_downloads/file_versions.tsv @@ -565,7 +565,7 @@ - path: miniwdl_run/wdl/tasks/quality_control/read_filtering/task_bbduk.wdl md5sum: aec6ef024d6dff31723f44290f6b9cf5 - path: miniwdl_run/wdl/tasks/quality_control/advanced_metrics/task_busco.wdl - md5sum: 0f8b3fbca316cb940a61b2c3cdf6ebab + md5sum: 1e2ccdeac9f2137d8cab70f0de452c4c - path: miniwdl_run/wdl/tasks/quality_control/basic_statistics/task_cg_pipeline.wdl md5sum: c8e7ae1ed0fff7b14731228db80c6b30 - path: miniwdl_run/wdl/tasks/quality_control/read_filtering/task_fastp.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index af3c8dc5c..87670b67f 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -56,7 +56,7 @@ - path: miniwdl_run/call-amrfinderplus_task/work/test_amrfinder_virulence.tsv md5sum: a5cc7b0baa9e11c9a31540800d0a740c - path: miniwdl_run/call-busco/command - md5sum: 81ed6d55f143b759289e758bdde4091c + md5sum: 8a4de431119daee0dfc4f2605413b095 - path: miniwdl_run/call-busco/inputs.json - path: miniwdl_run/call-busco/outputs.json - path: miniwdl_run/call-busco/stderr.txt @@ -67,7 +67,7 @@ - path: miniwdl_run/call-busco/work/BUSCO_RESULTS - path: miniwdl_run/call-busco/work/DATABASE - path: miniwdl_run/call-busco/work/VERSION - md5sum: 369fefb1c40f16c2a9c34d4903e9f1e2 + md5sum: 3cfdda0096f0689c9829ed27bdef6b1a - path: miniwdl_run/call-busco/work/_miniwdl_inputs/0/test_contigs.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e - path: miniwdl_run/call-busco/work/busco_downloads/file_versions.tsv @@ -108,8 +108,6 @@ - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/supermatrix.aln.bacteria_odb10.2019-12-16.faa - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/tree.bacteria_odb10.2019-12-16.nwk - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/tree_metadata.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/busco_sequences/single_copy_busco_sequences/84684at2157.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/busco_sequences/single_copy_busco_sequences/84684at2157.fna - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/full_table.tsv - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/101957at2157.out - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/102178at2157.out @@ -532,7 +530,7 @@ - path: miniwdl_run/wdl/tasks/quality_control/read_filtering/task_bbduk.wdl md5sum: aec6ef024d6dff31723f44290f6b9cf5 - path: miniwdl_run/wdl/tasks/quality_control/advanced_metrics/task_busco.wdl - md5sum: 0f8b3fbca316cb940a61b2c3cdf6ebab + md5sum: 1e2ccdeac9f2137d8cab70f0de452c4c - path: miniwdl_run/wdl/tasks/quality_control/basic_statistics/task_cg_pipeline.wdl md5sum: c8e7ae1ed0fff7b14731228db80c6b30 - path: miniwdl_run/wdl/tasks/quality_control/read_filtering/task_fastp.wdl From 83b8b23bc90041d7f8acb84d23783f953544b102 Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Thu, 4 Apr 2024 17:49:49 -0400 Subject: [PATCH 3/5] add String busco_docker output to all theiaprok wfs and theiaeuk_illumina_pe. have not tested yet --- workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl | 1 + workflows/theiaprok/wf_theiaprok_fasta.wdl | 1 + workflows/theiaprok/wf_theiaprok_illumina_pe.wdl | 1 + workflows/theiaprok/wf_theiaprok_illumina_se.wdl | 1 + workflows/theiaprok/wf_theiaprok_ont.wdl | 1 + 5 files changed, 5 insertions(+) diff --git a/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl b/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl index 94fc910ce..a8feb8921 100644 --- a/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl +++ b/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl @@ -257,6 +257,7 @@ workflow theiaeuk_illumina_pe { Float? est_coverage_clean = cg_pipeline_clean.est_coverage # Assembly QC - busco outputs String? busco_version = busco.busco_version + String? busco_docker = busco.busco_docker String? busco_database = busco.busco_database String? busco_results = busco.busco_results File? busco_report = busco.busco_report diff --git a/workflows/theiaprok/wf_theiaprok_fasta.wdl b/workflows/theiaprok/wf_theiaprok_fasta.wdl index fd53361c6..686d39b49 100644 --- a/workflows/theiaprok/wf_theiaprok_fasta.wdl +++ b/workflows/theiaprok/wf_theiaprok_fasta.wdl @@ -422,6 +422,7 @@ workflow theiaprok_fasta { Float quast_gc_percent = quast.gc_percent # Assembly QC - BUSCO outputs String busco_version = busco.busco_version + String busco_docker = busco.busco_docker String busco_database = busco.busco_database String busco_results = busco.busco_results File? busco_report = busco.busco_report diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index 4f0fba2a4..54949c3e0 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -640,6 +640,7 @@ workflow theiaprok_illumina_pe { Float? est_coverage_clean = cg_pipeline_clean.est_coverage # Assembly QC - busco outputs String? busco_version = busco.busco_version + String? busco_docker = busco.busco_docker String? busco_database = busco.busco_database String? busco_results = busco.busco_results File? busco_report = busco.busco_report diff --git a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl index 3fd6fff5d..35864a0d4 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl @@ -588,6 +588,7 @@ workflow theiaprok_illumina_se { Float? est_coverage_clean = cg_pipeline_clean.est_coverage # Assembly QC - busco outputs String? busco_version = busco.busco_version + String? busco_docker = busco.busco_docker String? busco_database = busco.busco_database String? busco_results = busco.busco_results File? busco_report = busco.busco_report diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 9f8818ad8..274f94428 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -542,6 +542,7 @@ workflow theiaprok_ont { Float? est_coverage_clean = nanoplot_clean.est_coverage # Assembly QC - busco outputs String? busco_version = busco.busco_version + String? busco_docker = busco.busco_docker String? busco_database = busco.busco_database String? busco_results = busco.busco_results File? busco_report = busco.busco_report From 85a19c4ff4238bb7dbbba33dbedb1f9dfa09a82c Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Thu, 4 Apr 2024 17:58:38 -0400 Subject: [PATCH 4/5] update CI for theiaprok_illumina_pe and se for wf level change --- tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml | 2 +- tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index 5088d1cd1..f1833780b 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -633,7 +633,7 @@ - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl md5sum: ea141ba65f2948ae2abed7ca791e872b - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl - md5sum: d835ff7f54e03f72d4af6375e262b726 + md5sum: c28f17110081694ac36a7695fdebea76 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: cfb407b32bc9436a0f12e29dc2e3b5a1 - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 87670b67f..abd837fe7 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -596,7 +596,7 @@ - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl md5sum: ea141ba65f2948ae2abed7ca791e872b - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl - md5sum: ae0ad6faa3d9f355d0f1561f62fb2e98 + md5sum: f920448f13018432be074e3ef06ea4ee - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: cfb407b32bc9436a0f12e29dc2e3b5a1 - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl From 984a6a4272ca32690f9fd35ad7e106f65ac7e2f8 Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Tue, 9 Apr 2024 11:34:13 -0400 Subject: [PATCH 5/5] add new optional Int input busco_memory with default of 24 to theiaeuk_illumina_pe workflow to account for higher RAM required for larger genomes --- workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl b/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl index a8feb8921..88eb02b87 100644 --- a/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl +++ b/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl @@ -32,6 +32,7 @@ workflow theiaeuk_illumina_pe { Int trim_min_length = 75 Int trim_quality_min_score = 20 Int trim_window_size = 10 + Int busco_memory = 24 Boolean skip_screen = false File? qc_check_table String? expected_taxon @@ -139,7 +140,8 @@ workflow theiaeuk_illumina_pe { input: assembly = shovill_pe.assembly_fasta, samplename = samplename, - eukaryote = true + eukaryote = true, + memory = busco_memory } if (defined(qc_check_table)) { call qc_check.qc_check_phb as qc_check_task {