Skip to content

Commit

Permalink
added 2 QC thresholds to ANI task to reduce false positives (#168)
Browse files Browse the repository at this point in the history
* added ani_threshold Float input to animummer task. also exposed cpus and memory. added logic for comparing ANI_HIGHEST_PERCENTAGE to ani_threshold and only outputting the name of the match if the threshold is surpassed. tested successfully in miniwdl.

* fixed awk syntax

* added ani_docker to export_taxon_tables task and illumina pe workflow

* remove "The"

* added ani_docker output to theiaprok_fasta wf

* removed comma that I accidentally added to call block of qc_check_table in theiaprok illumina pe wf

* added ani_docker output to theiaprok illumina se and ONT workflows

* update CI

* update CI

* mummer_ani task: added percent_base_aligned_threshold with 70 as default. default ani_threshold is now 85. Also added logic to only output ani_top_species_match if both thresholds are surpassed.

* clarified messages when 2 thresholds are not passed

* added ani_threshold Float input to animummer task. also exposed cpus and memory. added logic for comparing ANI_HIGHEST_PERCENTAGE to ani_threshold and only outputting the name of the match if the threshold is surpassed. tested successfully in miniwdl.

* fixed awk syntax

* added ani_docker to export_taxon_tables task and illumina pe workflow

* remove "The"

* added ani_docker output to theiaprok_fasta wf

* removed comma that I accidentally added to call block of qc_check_table in theiaprok illumina pe wf

* added ani_docker output to theiaprok illumina se and ONT workflows

* update CI

* mummer_ani task: added percent_base_aligned_threshold with 70 as default. default ani_threshold is now 85. Also added logic to only output ani_top_species_match if both thresholds are surpassed.

* clarified messages when 2 thresholds are not passed

* removed accidental duplication in code introduced during merge conflict resolution (my bad!)

* update CI for theiaprok illumina pe and se

* lower default ani_threshold to 80 (CDC standard) from 85

* update CI
  • Loading branch information
kapsakcj authored Dec 5, 2023
1 parent 203ec3b commit d68a45f
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 24 deletions.
64 changes: 46 additions & 18 deletions tasks/quality_control/task_mummer_ani.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,17 @@ task animummer {
String samplename
File? ref_genome
Float mash_filter = 0.9
String docker="us-docker.pkg.dev/general-theiagen/staphb/mummer:4.0.0-rgdv2"
# these 2 thresholds were set as they are used by CDC enterics lab/PulseNet for ANI thresholds
Float ani_threshold = 80.0
Float percent_bases_aligned_threshold = 70.0
String docker= "us-docker.pkg.dev/general-theiagen/staphb/mummer:4.0.0-rgdv2"
Int cpus = 4
Int memory = 8
Int disk_size = 100
}
command <<<
# capture and version
mummer --version | tee MUMMER_VERSION
mummer --version | tee MUMMER_VERSION.txt

# set the reference genome
# if not defined by user, then use all 43 genomes in RGDv2
Expand Down Expand Up @@ -42,18 +47,22 @@ task animummer {
echo "~{samplename} did not surpass the minimum mash genetic distance filter, thus ANI was not performed"
echo "The output TSV only contains the header line"
# set output variables as 0s or descriptive strings
echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED
echo "0.0" > ANI_HIGHEST_PERCENT
echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH
echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt
echo "0.0" > ANI_HIGHEST_PERCENT.txt
echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH.txt
# if output TSV has greater than 1 lines, then parse for appropriate outputs
else
## parse out highest percentBases aligned
cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED
echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)"
cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt
echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt)"
ANI_HIGHEST_PERCENT_BASES_ALIGNED=$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt)

## parse out ANI value using highest percentBases aligned value
grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT
echo "ANI value is: $(cat ANI_HIGHEST_PERCENT)"
grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT.txt
echo "Highest ANI value is: $(cat ANI_HIGHEST_PERCENT.txt)"
# set ANI_HIGHEST_PERCENT as a bash variable (float)
ANI_HIGHEST_PERCENT=$(cat ANI_HIGHEST_PERCENT.txt)


# have to separate out results for ani_top_species match because user-defined reference genome FASTAs will not be named as they are in RGDv2
if [[ -z "~{ref_genome}" ]]; then
Expand All @@ -63,27 +72,46 @@ task animummer {
# cut on periods to pull out genus_species (in future this will inlcude lineages for Listeria and other sub-species designations)
# have to create assembly_file_basename bash variable since output TSV does not include full path to assembly file, only filename
assembly_file_basename=$(basename ~{assembly})
grep "$(cat ANI_HIGHEST_PERCENT)" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.' -f 3 | tee ANI_TOP_SPECIES_MATCH
echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH)"
grep "${ANI_HIGHEST_PERCENT}" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.' -f 3 | tee ANI_TOP_SPECIES_MATCH.txt
echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH.txt)"

# if ANI threshold or percent_bases_aligned_threshold is defined by user (they both are by default), compare to highest ANI value and corresponding percent_bases_aligned value and only output ANI_top_species_match if both thresholds are surpassed
if [[ -n "~{ani_threshold}" || -n "~{percent_bases_aligned_threshold}" ]]; then
echo "Comparing user-defined ANI threshold to highest ANI value..."
# compare ANI_HIGHEST_PERCENT to ani_threshold using awk
if ! awk "BEGIN{ exit ($ANI_HIGHEST_PERCENT < ~{ani_threshold} )}"; then
echo "The highest ANI value $ANI_HIGHEST_PERCENT is less than the user-defined ANI threshold of ~{ani_threshold}"
echo "ANI top species match did not surpass the user-defined ANI threshold of ~{ani_threshold}" > ANI_TOP_SPECIES_MATCH.txt
# else if: compare percent_bases_aligned_threshold to ANI_HIGHEST_PERCENT_BASES_ALIGNED using awk
elif ! awk "BEGIN{ exit (${ANI_HIGHEST_PERCENT_BASES_ALIGNED} < ~{percent_bases_aligned_threshold} )}"; then
echo "The highest ANI percent bases aligned value ${ANI_HIGHEST_PERCENT_BASES_ALIGNED} is less than the user-defined threshold of ~{percent_bases_aligned_threshold}"
# overwrite ANI_TOP_SPECIES_MATCH.txt when percent_bases_aligned threshold is not surpassed
echo "ANI top species match did not surpass the user-defined percent bases aligned threshold of ~{percent_bases_aligned_threshold}" > ANI_TOP_SPECIES_MATCH.txt
else
echo "The highest ANI value ${ANI_HIGHEST_PERCENT} is greater than the user-defined threshold ~{ani_threshold}"
echo "The highest percent bases aligned value ${ANI_HIGHEST_PERCENT_BASES_ALIGNED} is greater than the user-defined threshold ~{percent_bases_aligned_threshold}"
fi
fi
else
# User specified a reference genome, use fasta filename as output string
basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH
basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH.txt
echo "Reference genome used for ANI is: ${REF_GENOME}"
fi
fi

>>>
output {
Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT")
Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED")
Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT.txt")
Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt")
File ani_output_tsv = "~{samplename}.ani-mummer.out.tsv"
String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH")
String ani_mummer_version = read_string("MUMMER_VERSION")
String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH.txt")
String ani_mummer_version = read_string("MUMMER_VERSION.txt")
String ani_docker = "~{docker}"
}
runtime {
docker: "~{docker}"
memory: "8 GB"
cpu: 4
memory: "~{memory} GB"
cpu: cpus
disks: "local-disk " + disk_size + " SSD"
disk: disk_size + " GB"
maxRetries: 3
Expand Down
2 changes: 2 additions & 0 deletions tasks/utilities/task_broad_terra_tools.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ task export_taxon_tables {
File? ani_output_tsv
String? ani_top_species_match
String? ani_mummer_version
String? ani_docker
String? kmerfinder_docker
File? kmerfinder_results_tsv
String? kmerfinder_top_hit
Expand Down Expand Up @@ -595,6 +596,7 @@ task export_taxon_tables {
"ani_output_tsv": "~{ani_output_tsv}",
"ani_top_species_match": "~{ani_top_species_match}",
"ani_mummer_version": "~{ani_mummer_version}",
"ani_docker": "~{ani_docker}",
"kmerfinder_docker": "~{kmerfinder_docker}",
"kmerfinder_results_tsv": "~{kmerfinder_results_tsv}",
"kmerfinder_top_hit": "~{kmerfinder_top_hit}",
Expand Down
6 changes: 3 additions & 3 deletions tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@
- path: miniwdl_run/wdl/tasks/quality_control/task_fastq_scan.wdl
contains: ["version", "fastq_scan", "output"]
- path: miniwdl_run/wdl/tasks/quality_control/task_mummer_ani.wdl
md5sum: 21e8b1cbe4c8be26d1ac8b8013970166
md5sum: d50a40d533d51834d0000971dc2c2014
- path: miniwdl_run/wdl/tasks/quality_control/task_ncbi_scrub.wdl
contains: ["version", "scrub", "output"]
- path: miniwdl_run/wdl/tasks/quality_control/task_qc_check_phb.wdl
Expand Down Expand Up @@ -632,9 +632,9 @@
- path: miniwdl_run/wdl/tasks/taxon_id/task_midas.wdl
md5sum: faacd87946ee3fbdf70f3a15b79ce547
- path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl
md5sum: 43ef050bde1fb8755f38e697a1794918
md5sum: 26ac9a20c8043a28d373bfe0ca361cc6
- path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl
md5sum: 0c87a7279c4870a821c3dc1db9a6a94b
md5sum: ac4971ad992c3e8abee5d3817928a8f2
- path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl
md5sum: 00bd2489b2a7aa5b88340a940961a857
- path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl
Expand Down
6 changes: 3 additions & 3 deletions tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@
- path: miniwdl_run/wdl/tasks/quality_control/task_fastq_scan.wdl
contains: ["version", "fastq_scan", "output"]
- path: miniwdl_run/wdl/tasks/quality_control/task_mummer_ani.wdl
md5sum: 21e8b1cbe4c8be26d1ac8b8013970166
md5sum: d50a40d533d51834d0000971dc2c2014
- path: miniwdl_run/wdl/tasks/quality_control/task_qc_check_phb.wdl
contains: ["version", "qc_check_table", "output"]
- path: miniwdl_run/wdl/tasks/quality_control/task_quast.wdl
Expand Down Expand Up @@ -598,9 +598,9 @@
- path: miniwdl_run/wdl/tasks/taxon_id/task_midas.wdl
md5sum: faacd87946ee3fbdf70f3a15b79ce547
- path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl
md5sum: 43ef050bde1fb8755f38e697a1794918
md5sum: 26ac9a20c8043a28d373bfe0ca361cc6
- path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl
md5sum: e1d9e75dae5176ceeb95b88a5d3bbba7
md5sum: 7303247badeb82119ccef528f7367f89
- path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl
md5sum: 00bd2489b2a7aa5b88340a940961a857
- path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl
Expand Down
2 changes: 2 additions & 0 deletions workflows/theiaprok/wf_theiaprok_fasta.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ workflow theiaprok_fasta {
ani_output_tsv = ani.ani_output_tsv,
ani_top_species_match = ani.ani_top_species_match,
ani_mummer_version = ani.ani_mummer_version,
ani_docker = ani.ani_docker,
kmerfinder_docker = kmerfinder.kmerfinder_docker,
kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv,
kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit,
Expand Down Expand Up @@ -427,6 +428,7 @@ workflow theiaprok_fasta {
File? ani_output_tsv = ani.ani_output_tsv
String? ani_top_species_match = ani.ani_top_species_match
String? ani_mummer_version = ani.ani_mummer_version
String? ani_mummer_docker = ani.ani_docker
# kmerfinder outputs
String? kmerfinder_docker = kmerfinder.kmerfinder_docker
File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv
Expand Down
2 changes: 2 additions & 0 deletions workflows/theiaprok/wf_theiaprok_illumina_pe.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ workflow theiaprok_illumina_pe {
ani_output_tsv = ani.ani_output_tsv,
ani_top_species_match = ani.ani_top_species_match,
ani_mummer_version = ani.ani_mummer_version,
ani_docker = ani.ani_docker,
kmerfinder_docker = kmerfinder.kmerfinder_docker,
kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv,
kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit,
Expand Down Expand Up @@ -621,6 +622,7 @@ workflow theiaprok_illumina_pe {
File? ani_output_tsv = ani.ani_output_tsv
String? ani_top_species_match = ani.ani_top_species_match
String? ani_mummer_version = ani.ani_mummer_version
String? ani_mummer_docker = ani.ani_docker
# kmerfinder outputs
String? kmerfinder_docker = kmerfinder.kmerfinder_docker
File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv
Expand Down
2 changes: 2 additions & 0 deletions workflows/theiaprok/wf_theiaprok_illumina_se.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ workflow theiaprok_illumina_se {
ani_output_tsv = ani.ani_output_tsv,
ani_top_species_match = ani.ani_top_species_match,
ani_mummer_version = ani.ani_mummer_version,
ani_docker = ani.ani_docker,
kmerfinder_docker = kmerfinder.kmerfinder_docker,
kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv,
kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit,
Expand Down Expand Up @@ -573,6 +574,7 @@ workflow theiaprok_illumina_se {
File? ani_output_tsv = ani.ani_output_tsv
String? ani_top_species_match = ani.ani_top_species_match
String? ani_mummer_version = ani.ani_mummer_version
String? ani_mummer_docker = ani.ani_docker
# kmerfinder outputs
String? kmerfinder_docker = kmerfinder.kmerfinder_docker
File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv
Expand Down
2 changes: 2 additions & 0 deletions workflows/theiaprok/wf_theiaprok_ont.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ workflow theiaprok_ont {
ani_output_tsv = ani.ani_output_tsv,
ani_top_species_match = ani.ani_top_species_match,
ani_mummer_version = ani.ani_mummer_version,
ani_docker = ani.ani_docker,
kmerfinder_docker = kmerfinder.kmerfinder_docker,
kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv,
kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit,
Expand Down Expand Up @@ -547,6 +548,7 @@ workflow theiaprok_ont {
File? ani_output_tsv = ani.ani_output_tsv
String? ani_top_species_match = ani.ani_top_species_match
String? ani_mummer_version = ani.ani_mummer_version
String? ani_mummer_docker = ani.ani_docker
# kmerfinder outputs
String? kmerfinder_docker = kmerfinder.kmerfinder_docker
File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv
Expand Down

0 comments on commit d68a45f

Please sign in to comment.