diff --git a/tasks/quality_control/task_mummer_ani.wdl b/tasks/quality_control/task_mummer_ani.wdl index 088a985b5..fc048bfb1 100644 --- a/tasks/quality_control/task_mummer_ani.wdl +++ b/tasks/quality_control/task_mummer_ani.wdl @@ -6,12 +6,17 @@ task animummer { String samplename File? ref_genome Float mash_filter = 0.9 - String docker="us-docker.pkg.dev/general-theiagen/staphb/mummer:4.0.0-rgdv2" + # these 2 thresholds were set as they are used by CDC enterics lab/PulseNet for ANI thresholds + Float ani_threshold = 80.0 + Float percent_bases_aligned_threshold = 70.0 + String docker= "us-docker.pkg.dev/general-theiagen/staphb/mummer:4.0.0-rgdv2" + Int cpus = 4 + Int memory = 8 Int disk_size = 100 } command <<< # capture and version - mummer --version | tee MUMMER_VERSION + mummer --version | tee MUMMER_VERSION.txt # set the reference genome # if not defined by user, then use all 43 genomes in RGDv2 @@ -42,18 +47,22 @@ task animummer { echo "~{samplename} did not surpass the minimum mash genetic distance filter, thus ANI was not performed" echo "The output TSV only contains the header line" # set output variables as 0s or descriptive strings - echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED - echo "0.0" > ANI_HIGHEST_PERCENT - echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH + echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt + echo "0.0" > ANI_HIGHEST_PERCENT.txt + echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH.txt # if output TSV has greater than 1 lines, then parse for appropriate outputs else ## parse out highest percentBases aligned - cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED - echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" + cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt + echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt)" + ANI_HIGHEST_PERCENT_BASES_ALIGNED=$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt) ## parse out ANI value using highest percentBases aligned value - grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT - echo "ANI value is: $(cat ANI_HIGHEST_PERCENT)" + grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT.txt + echo "Highest ANI value is: $(cat ANI_HIGHEST_PERCENT.txt)" + # set ANI_HIGHEST_PERCENT as a bash variable (float) + ANI_HIGHEST_PERCENT=$(cat ANI_HIGHEST_PERCENT.txt) + # have to separate out results for ani_top_species match because user-defined reference genome FASTAs will not be named as they are in RGDv2 if [[ -z "~{ref_genome}" ]]; then @@ -63,27 +72,46 @@ task animummer { # cut on periods to pull out genus_species (in future this will inlcude lineages for Listeria and other sub-species designations) # have to create assembly_file_basename bash variable since output TSV does not include full path to assembly file, only filename assembly_file_basename=$(basename ~{assembly}) - grep "$(cat ANI_HIGHEST_PERCENT)" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.' -f 3 | tee ANI_TOP_SPECIES_MATCH - echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH)" + grep "${ANI_HIGHEST_PERCENT}" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.' -f 3 | tee ANI_TOP_SPECIES_MATCH.txt + echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH.txt)" + + # if ANI threshold or percent_bases_aligned_threshold is defined by user (they both are by default), compare to highest ANI value and corresponding percent_bases_aligned value and only output ANI_top_species_match if both thresholds are surpassed + if [[ -n "~{ani_threshold}" || -n "~{percent_bases_aligned_threshold}" ]]; then + echo "Comparing user-defined ANI threshold to highest ANI value..." + # compare ANI_HIGHEST_PERCENT to ani_threshold using awk + if ! awk "BEGIN{ exit ($ANI_HIGHEST_PERCENT < ~{ani_threshold} )}"; then + echo "The highest ANI value $ANI_HIGHEST_PERCENT is less than the user-defined ANI threshold of ~{ani_threshold}" + echo "ANI top species match did not surpass the user-defined ANI threshold of ~{ani_threshold}" > ANI_TOP_SPECIES_MATCH.txt + # else if: compare percent_bases_aligned_threshold to ANI_HIGHEST_PERCENT_BASES_ALIGNED using awk + elif ! awk "BEGIN{ exit (${ANI_HIGHEST_PERCENT_BASES_ALIGNED} < ~{percent_bases_aligned_threshold} )}"; then + echo "The highest ANI percent bases aligned value ${ANI_HIGHEST_PERCENT_BASES_ALIGNED} is less than the user-defined threshold of ~{percent_bases_aligned_threshold}" + # overwrite ANI_TOP_SPECIES_MATCH.txt when percent_bases_aligned threshold is not surpassed + echo "ANI top species match did not surpass the user-defined percent bases aligned threshold of ~{percent_bases_aligned_threshold}" > ANI_TOP_SPECIES_MATCH.txt + else + echo "The highest ANI value ${ANI_HIGHEST_PERCENT} is greater than the user-defined threshold ~{ani_threshold}" + echo "The highest percent bases aligned value ${ANI_HIGHEST_PERCENT_BASES_ALIGNED} is greater than the user-defined threshold ~{percent_bases_aligned_threshold}" + fi + fi else # User specified a reference genome, use fasta filename as output string - basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH + basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH.txt echo "Reference genome used for ANI is: ${REF_GENOME}" fi fi >>> output { - Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT") - Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED") + Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT.txt") + Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt") File ani_output_tsv = "~{samplename}.ani-mummer.out.tsv" - String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH") - String ani_mummer_version = read_string("MUMMER_VERSION") + String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH.txt") + String ani_mummer_version = read_string("MUMMER_VERSION.txt") + String ani_docker = "~{docker}" } runtime { docker: "~{docker}" - memory: "8 GB" - cpu: 4 + memory: "~{memory} GB" + cpu: cpus disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" maxRetries: 3 diff --git a/tasks/utilities/task_broad_terra_tools.wdl b/tasks/utilities/task_broad_terra_tools.wdl index 2ae737a5c..eb819b302 100644 --- a/tasks/utilities/task_broad_terra_tools.wdl +++ b/tasks/utilities/task_broad_terra_tools.wdl @@ -92,6 +92,7 @@ task export_taxon_tables { File? ani_output_tsv String? ani_top_species_match String? ani_mummer_version + String? ani_docker String? kmerfinder_docker File? kmerfinder_results_tsv String? kmerfinder_top_hit @@ -595,6 +596,7 @@ task export_taxon_tables { "ani_output_tsv": "~{ani_output_tsv}", "ani_top_species_match": "~{ani_top_species_match}", "ani_mummer_version": "~{ani_mummer_version}", + "ani_docker": "~{ani_docker}", "kmerfinder_docker": "~{kmerfinder_docker}", "kmerfinder_results_tsv": "~{kmerfinder_results_tsv}", "kmerfinder_top_hit": "~{kmerfinder_top_hit}", diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index 5ecac975e..f113ac6bd 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -574,7 +574,7 @@ - path: miniwdl_run/wdl/tasks/quality_control/task_fastq_scan.wdl contains: ["version", "fastq_scan", "output"] - path: miniwdl_run/wdl/tasks/quality_control/task_mummer_ani.wdl - md5sum: 21e8b1cbe4c8be26d1ac8b8013970166 + md5sum: d50a40d533d51834d0000971dc2c2014 - path: miniwdl_run/wdl/tasks/quality_control/task_ncbi_scrub.wdl contains: ["version", "scrub", "output"] - path: miniwdl_run/wdl/tasks/quality_control/task_qc_check_phb.wdl @@ -632,9 +632,9 @@ - path: miniwdl_run/wdl/tasks/taxon_id/task_midas.wdl md5sum: faacd87946ee3fbdf70f3a15b79ce547 - path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl - md5sum: 43ef050bde1fb8755f38e697a1794918 + md5sum: 26ac9a20c8043a28d373bfe0ca361cc6 - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl - md5sum: 0c87a7279c4870a821c3dc1db9a6a94b + md5sum: ac4971ad992c3e8abee5d3817928a8f2 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: 00bd2489b2a7aa5b88340a940961a857 - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 7bb296e92..a46d0ea6b 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -542,7 +542,7 @@ - path: miniwdl_run/wdl/tasks/quality_control/task_fastq_scan.wdl contains: ["version", "fastq_scan", "output"] - path: miniwdl_run/wdl/tasks/quality_control/task_mummer_ani.wdl - md5sum: 21e8b1cbe4c8be26d1ac8b8013970166 + md5sum: d50a40d533d51834d0000971dc2c2014 - path: miniwdl_run/wdl/tasks/quality_control/task_qc_check_phb.wdl contains: ["version", "qc_check_table", "output"] - path: miniwdl_run/wdl/tasks/quality_control/task_quast.wdl @@ -598,9 +598,9 @@ - path: miniwdl_run/wdl/tasks/taxon_id/task_midas.wdl md5sum: faacd87946ee3fbdf70f3a15b79ce547 - path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl - md5sum: 43ef050bde1fb8755f38e697a1794918 + md5sum: 26ac9a20c8043a28d373bfe0ca361cc6 - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl - md5sum: e1d9e75dae5176ceeb95b88a5d3bbba7 + md5sum: 7303247badeb82119ccef528f7367f89 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: 00bd2489b2a7aa5b88340a940961a857 - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl diff --git a/workflows/theiaprok/wf_theiaprok_fasta.wdl b/workflows/theiaprok/wf_theiaprok_fasta.wdl index 2c4ce9f15..c5fe43576 100644 --- a/workflows/theiaprok/wf_theiaprok_fasta.wdl +++ b/workflows/theiaprok/wf_theiaprok_fasta.wdl @@ -175,6 +175,7 @@ workflow theiaprok_fasta { ani_output_tsv = ani.ani_output_tsv, ani_top_species_match = ani.ani_top_species_match, ani_mummer_version = ani.ani_mummer_version, + ani_docker = ani.ani_docker, kmerfinder_docker = kmerfinder.kmerfinder_docker, kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv, kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit, @@ -427,6 +428,7 @@ workflow theiaprok_fasta { File? ani_output_tsv = ani.ani_output_tsv String? ani_top_species_match = ani.ani_top_species_match String? ani_mummer_version = ani.ani_mummer_version + String? ani_mummer_docker = ani.ani_docker # kmerfinder outputs String? kmerfinder_docker = kmerfinder.kmerfinder_docker File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index 4db88acc1..2d105442a 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -300,6 +300,7 @@ workflow theiaprok_illumina_pe { ani_output_tsv = ani.ani_output_tsv, ani_top_species_match = ani.ani_top_species_match, ani_mummer_version = ani.ani_mummer_version, + ani_docker = ani.ani_docker, kmerfinder_docker = kmerfinder.kmerfinder_docker, kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv, kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit, @@ -621,6 +622,7 @@ workflow theiaprok_illumina_pe { File? ani_output_tsv = ani.ani_output_tsv String? ani_top_species_match = ani.ani_top_species_match String? ani_mummer_version = ani.ani_mummer_version + String? ani_mummer_docker = ani.ani_docker # kmerfinder outputs String? kmerfinder_docker = kmerfinder.kmerfinder_docker File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv diff --git a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl index c8ac53280..8ee18cd09 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl @@ -273,6 +273,7 @@ workflow theiaprok_illumina_se { ani_output_tsv = ani.ani_output_tsv, ani_top_species_match = ani.ani_top_species_match, ani_mummer_version = ani.ani_mummer_version, + ani_docker = ani.ani_docker, kmerfinder_docker = kmerfinder.kmerfinder_docker, kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv, kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit, @@ -573,6 +574,7 @@ workflow theiaprok_illumina_se { File? ani_output_tsv = ani.ani_output_tsv String? ani_top_species_match = ani.ani_top_species_match String? ani_mummer_version = ani.ani_mummer_version + String? ani_mummer_docker = ani.ani_docker # kmerfinder outputs String? kmerfinder_docker = kmerfinder.kmerfinder_docker File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 0bfa87302..049a2bd37 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -271,6 +271,7 @@ workflow theiaprok_ont { ani_output_tsv = ani.ani_output_tsv, ani_top_species_match = ani.ani_top_species_match, ani_mummer_version = ani.ani_mummer_version, + ani_docker = ani.ani_docker, kmerfinder_docker = kmerfinder.kmerfinder_docker, kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv, kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit, @@ -547,6 +548,7 @@ workflow theiaprok_ont { File? ani_output_tsv = ani.ani_output_tsv String? ani_top_species_match = ani.ani_top_species_match String? ani_mummer_version = ani.ani_mummer_version + String? ani_mummer_docker = ani.ani_docker # kmerfinder outputs String? kmerfinder_docker = kmerfinder.kmerfinder_docker File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv