From b0c2a0e4e7479b681b1290df8bdde760b8127444 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 11 Sep 2023 20:17:39 +0000 Subject: [PATCH 01/23] add to theiaprok --- .../species_typing/task_tb_gene_coverage.wdl | 210 -------- tasks/species_typing/task_tbp_parser.wdl | 55 ++ tasks/species_typing/task_tbprofiler.wdl | 12 +- .../task_tbprofiler_output_parsing.wdl | 484 ------------------ .../theiaprok/wf_theiaprok_illumina_pe.wdl | 9 +- .../theiaprok/wf_theiaprok_illumina_se.wdl | 9 +- workflows/theiaprok/wf_theiaprok_ont.wdl | 11 +- workflows/utilities/wf_merlin_magic.wdl | 46 +- 8 files changed, 108 insertions(+), 728 deletions(-) delete mode 100644 tasks/species_typing/task_tb_gene_coverage.wdl create mode 100644 tasks/species_typing/task_tbp_parser.wdl delete mode 100644 tasks/species_typing/task_tbprofiler_output_parsing.wdl diff --git a/tasks/species_typing/task_tb_gene_coverage.wdl b/tasks/species_typing/task_tb_gene_coverage.wdl deleted file mode 100644 index 212214256..000000000 --- a/tasks/species_typing/task_tb_gene_coverage.wdl +++ /dev/null @@ -1,210 +0,0 @@ -version 1.0 - -task tb_gene_coverage { - input { - File bamfile - File bamindex - String samplename - Int min_depth = 10 - Int disk_size = 100 - String docker = "us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15" - } - command <<< - chr=$(samtools idxstats ~{bamfile} | cut -f 1 | head -1) - - # samtools outputs 3 columns; column 3 is the depth of coverage per nucleotide position, piped to awk to count the positions - # above min_depth, then wc -l counts them all - gyrB=$(samtools depth -J -r "${chr}:5040-7467" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - gyrA=$(samtools depth -J -r "${chr}:7102-10018" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fgd1=$(samtools depth -J -r "${chr}:490583-491993" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - mshA=$(samtools depth -J -r "${chr}:575148-576990" ~{bamfile} | awk -F 
"\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ccsA=$(samtools depth -J -r "${chr}:619691-621065" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rpoB=$(samtools depth -J -r "${chr}:759607-763525" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rpoC=$(samtools depth -J -r "${chr}:763170-767520" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - mmpL5=$(samtools depth -J -r "${chr}:775386-778680" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - mmpS5=$(samtools depth -J -r "${chr}:778277-779105" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - mmpR5=$(samtools depth -J -r "${chr}:778790-779687" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rpsL=$(samtools depth -J -r "${chr}:781360-782134" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rplC=$(samtools depth -J -r "${chr}:800609-801662" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fbiC=$(samtools depth -J -r "${chr}:1302731-1305701" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv1258c=$(samtools depth -J -r "${chr}:1405881-1407540" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - embR=$(samtools depth -J -r "${chr}:1415981-1417547" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - atpE=$(samtools depth -J -r "${chr}:1460845-1461490" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rrs=$(samtools depth -J -r "${chr}:1471646-1473582" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rrl=$(samtools depth -J -r "${chr}:1473458-1476995" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fabG1=$(samtools depth -J -r "${chr}:1673148-1674383" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - inhA=$(samtools depth -J -r "${chr}:1673848-1675211" ~{bamfile} | awk -F "\t" '{if ($3 > 
~{min_depth}) print;}' | wc -l ) - rpsA=$(samtools depth -J -r "${chr}:1833342-1835187" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - tlyA=$(samtools depth -J -r "${chr}:1917740-1918946" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ndh=$(samtools depth -J -r "${chr}:2101451-2103242" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - katG=$(samtools depth -J -r "${chr}:2153689-2156570" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - PPE35=$(samtools depth -J -r "${chr}:2167449-2170812" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv1979c=$(samtools depth -J -r "${chr}:2221519-2223364" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - pncA=$(samtools depth -J -r "${chr}:2288481-2290323" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - kasA=$(samtools depth -J -r "${chr}:2517915-2519565" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - eis=$(samtools depth -J -r "${chr}:2713924-2715586" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ahpC=$(samtools depth -J -r "${chr}:2725912-2726980" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - folC=$(samtools depth -J -r "${chr}:2745935-2747798" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - pepQ=$(samtools depth -J -r "${chr}:2859100-2860618" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ribD=$(samtools depth -J -r "${chr}:2986639-2987815" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv2752c=$(samtools depth -J -r "${chr}:3064315-3066391" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - thyX=$(samtools depth -J -r "${chr}:3066993-3068161" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - thyA=$(samtools depth -J -r "${chr}:3073480-3074671" ~{bamfile} | awk -F "\t" '{if ($3 > 
~{min_depth}) print;}' | wc -l ) - ald=$(samtools depth -J -r "${chr}:3086620-3088135" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fbiD=$(samtools depth -J -r "${chr}:3338918-3339962" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv3083=$(samtools depth -J -r "${chr}:3448304-3450191" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fprA=$(samtools depth -J -r "${chr}:3473807-3475577" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - whiB7=$(samtools depth -J -r "${chr}:3568201-3568879" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv3236c=$(samtools depth -J -r "${chr}:3611759-3613316" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fbiA=$(samtools depth -J -r "${chr}:3640343-3641738" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fbiB=$(samtools depth -J -r "${chr}:3641335-3643081" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - alr=$(samtools depth -J -r "${chr}:3839994-3841620" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rpoA=$(samtools depth -J -r "${chr}:3877264-3878707" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ddn=$(samtools depth -J -r "${chr}:3986644-3987499" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - clpC1=$(samtools depth -J -r "${chr}:4037958-4040904" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - panD=$(samtools depth -J -r "${chr}:4043662-4044481" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - embC=$(samtools depth -J -r "${chr}:4239663-4243347" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - embA=$(samtools depth -J -r "${chr}:4243004-4246717" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - embB=$(samtools depth -J -r "${chr}:4246314-4250010" ~{bamfile} | awk -F "\t" '{if ($3 > 
~{min_depth}) print;}' | wc -l ) - aftB=$(samtools depth -J -r "${chr}:4266753-4269036" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ubiA=$(samtools depth -J -r "${chr}:4268725-4270033" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ethA=$(samtools depth -J -r "${chr}:4325804-4330174" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ethR=$(samtools depth -J -r "${chr}:4327349-4328399" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - whiB6=$(samtools depth -J -r "${chr}:4337971-4338721" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - gid=$(samtools depth -J -r "${chr}:4407328-4408476" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - - # add one to gene lenth to compensate for subtraction - gyrB_pc=$(python3 -c "print ( round ( ($gyrB / 2428 ) * 100, 2 ) )") - gyrA_pc=$(python3 -c "print ( round( ($gyrA / 2917 ) * 100, 2 ) )") - fgd1_pc=$(python3 -c "print ( round( ($fgd1 / 1411 ) * 100, 2 ) )") - mshA_pc=$(python3 -c "print ( round( ($mshA / 1843 ) * 100, 2 ) )") - ccsA_pc=$(python3 -c "print ( round( ($ccsA / 1375 ) * 100, 2 ) )") - rpoB_pc=$(python3 -c "print ( round( ($rpoB / 3919 ) * 100, 2 ) )") - rpoC_pc=$(python3 -c "print ( round( ($rpoC / 4351 ) * 100, 2 ) )") - mmpL5_pc=$(python3 -c "print ( round( ($mmpL5 / 3295 ) * 100, 2 ) )") - mmpS5_pc=$(python3 -c "print ( round( ($mmpS5 / 829 ) * 100, 2 ) )") - mmpR5_pc=$(python3 -c "print ( round( ($mmpR5 / 898 ) * 100, 2 ) )") - rpsL_pc=$(python3 -c "print ( round( ($rpsL / 775 ) * 100, 2 ) )") - rplC_pc=$(python3 -c "print ( round( ($rplC / 1054 ) * 100, 2 ) )") - fbiC_pc=$(python3 -c "print ( round( ($fbiC / 2971 ) * 100, 2 ) )") - Rv1258c_pc=$(python3 -c "print ( round( ($Rv1258c / 1660 ) * 100, 2 ) )") - embR_pc=$(python3 -c "print ( round( ($embR / 1567 ) * 100, 2 ) )") - atpE_pc=$(python3 -c "print ( round( ($atpE / 646 ) * 100, 2 ) )") - rrs_pc=$(python3 -c "print ( 
round( ($rrs / 1937 ) * 100, 2 ) )") - rrl_pc=$(python3 -c "print ( round( ($rrl / 3538 ) * 100, 2 ) )") - fabG1_pc=$(python3 -c "print ( round( ($fabG1 / 1236 ) * 100, 2 ) )") - inhA_pc=$(python3 -c "print ( round( ($inhA / 1364 ) * 100, 2 ) )") - rpsA_pc=$(python3 -c "print ( round( ($rpsA / 1846 ) * 100, 2 ) )") - tlyA_pc=$(python3 -c "print ( round( ($tlyA / 1207 ) * 100, 2 ) )") - ndh_pc=$(python3 -c "print ( round( ($ndh / 1792 ) * 100, 2 ) )") - katG_pc=$(python3 -c "print ( round( ($katG / 2882 ) * 100, 2 ) )") - PPE35_pc=$(python3 -c "print ( round( ($PPE35 / 3364 ) * 100, 2 ) )") - Rv1979c_pc=$(python3 -c "print ( round( ($Rv1979c / 1846 ) * 100, 2 ) )") - pncA_pc=$(python3 -c "print ( round( ($pncA / 1843 ) * 100, 2 ) )") - kasA_pc=$(python3 -c "print ( round( ($kasA / 1651 ) * 100, 2 ) )") - eis_pc=$(python3 -c "print ( round( ($eis / 1663 ) * 100, 2 ) )") - ahpC_pc=$(python3 -c "print ( round( ($ahpC / 1069 ) * 100, 2 ) )") - folC_pc=$(python3 -c "print ( round( ($folC / 1864 ) * 100, 2 ) )") - pepQ_pc=$(python3 -c "print ( round( ($pepQ / 1519 ) * 100, 2 ) )") - ribD_pc=$(python3 -c "print ( round( ($ribD / 1177 ) * 100, 2 ) )") - Rv2752c_pc=$(python3 -c "print ( round( ($Rv2752c / 2077 ) * 100, 2 ) )") - thyX_pc=$(python3 -c "print ( round( ($thyX / 1169 ) * 100, 2 ) )") - thyA_pc=$(python3 -c "print ( round( ($thyA / 1192 ) * 100, 2 ) )") - ald_pc=$(python3 -c "print ( round( ($ald / 1516 ) * 100, 2 ) )") - fbiD_pc=$(python3 -c "print ( round( ($fbiD / 1045 ) * 100, 2 ) )") - Rv3083_pc=$(python3 -c "print ( round( ($Rv3083 / 1888 ) * 100, 2 ) )") - fprA_pc=$(python3 -c "print ( round( ($fprA / 1771 ) * 100, 2 ) )") - whiB7_pc=$(python3 -c "print ( round( ($whiB7 / 679 ) * 100, 2 ) )") - Rv3236c_pc=$(python3 -c "print ( round( ($Rv3236c / 1558 ) * 100, 2 ) )") - fbiA_pc=$(python3 -c "print ( round( ($fbiA / 1396 ) * 100, 2 ) )") - fbiB_pc=$(python3 -c "print ( round( ($fbiB / 1747 ) * 100, 2 ) )") - alr_pc=$(python3 -c "print ( round( ($alr / 1627 ) 
* 100, 2 ) )") - rpoA_pc=$(python3 -c "print ( round( ($rpoA / 1444 ) * 100, 2 ) )") - ddn_pc=$(python3 -c "print ( round( ($ddn / 856 ) * 100, 2 ) )") - clpC1_pc=$(python3 -c "print ( round( ($clpC1 / 2947 ) * 100, 2 ) )") - panD_pc=$(python3 -c "print ( round( ($panD / 820 ) * 100, 2 ) )") - embC_pc=$(python3 -c "print ( round( ($embC / 3685 ) * 100, 2 ) )") - embA_pc=$(python3 -c "print ( round( ($embA / 3714 ) * 100, 2 ) )") - embB_pc=$(python3 -c "print ( round( ($embB / 3697 ) * 100, 2 ) )") - aftB_pc=$(python3 -c "print ( round( ($aftB / 2284 ) * 100, 2 ) )") - ubiA_pc=$(python3 -c "print ( round( ($ubiA / 1309 ) * 100, 2 ) )") - ethA_pc=$(python3 -c "print ( round( ($ethA / 4371 ) * 100, 2 ) )") - ethR_pc=$(python3 -c "print ( round( ($ethR / 1051 ) * 100, 2 ) )") - whiB6_pc=$(python3 -c "print ( round( ($whiB6 / 751 ) * 100, 2 ) )") - gid_pc=$(python3 -c "print ( round( ($gid / 1149 ) * 100, 2 ) )") - - echo -e "#NOTE: THE VALUES BELOW ASSUME TBPROFILER (H37Rv) REFERENCE GENOME" > ~{samplename}.percent_gene_coverage.tsv - echo -e "Gene\tPercent_Coverage" >> ~{samplename}.percent_gene_coverage.tsv - echo -e "gyrB\t"$gyrB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "gyrA\t"$gyrA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fgd1\t"$fgd1_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "mshA\t"$mshA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ccsA\t"$ccsA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpoB\t"$rpoB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpoC\t"$rpoC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "mmpL5\t"$mmpL5_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "mmpS5\t"$mmpS5_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "mmpR5\t"$mmpR5_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpsL\t"$rpsL_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rplC\t"$rplC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fbiC\t"$fbiC_pc >> 
~{samplename}.percent_gene_coverage.tsv - echo -e "Rv1258c\t"$Rv1258c_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "embR\t"$embR_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "atpE\t"$atpE_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rrs\t"$rrs_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rrl\t"$rrl_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fabG1\t"$fabG1_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "inhA\t"$inhA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpsA\t"$rpsA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "tlyA\t"$tlyA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ndh\t"$ndh_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "katG\t"$katG_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "PPE35\t"$PPE35_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "Rv1979c\t"$Rv1979c_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "pncA\t"$pncA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "kasA\t"$kasA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "eis\t"$eis_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ahpC\t"$ahpC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "folC\t"$folC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "pepQ\t"$pepQ_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ribD\t"$ribD_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "Rv2752c\t"$Rv2752c_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "thyX\t"$thyX_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "thyA\t"$thyA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ald\t"$ald_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fbiD\t"$fbiD_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "Rv3083\t"$Rv3083_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fprA\t"$fprA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "whiB7\t"$whiB7_pc >> 
~{samplename}.percent_gene_coverage.tsv - echo -e "Rv3236c\t"$Rv3236c_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fbiA\t"$fbiA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fbiB\t"$fbiB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "alr\t"$alr_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpoA\t"$rpoA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ddn\t"$ddn_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "clpC1\t"$clpC1_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "panD\t"$panD_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "embC\t"$embC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "embA\t"$embA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "embB\t"$embB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "aftB\t"$aftB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ubiA\t"$ubiA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ethA\t"$ethA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ethR\t"$ethR_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "whiB6\t"$whiB6_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "gid\t"$gid_pc >> ~{samplename}.percent_gene_coverage.tsv - - >>> - output { - File tb_resistance_genes_percent_coverage = "~{samplename}.percent_gene_coverage.tsv" - } - runtime { - docker: docker - memory: "8 GB" - cpu: 2 - disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" - preemptible: 0 - maxRetries: 3 - } -} \ No newline at end of file diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl new file mode 100644 index 000000000..ac6283d16 --- /dev/null +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -0,0 +1,55 @@ +version 1.0 + +task tbp_parser { + input { + File tbprofiler_json + File tbprofiler_bam + File tbprofiler_bai + String samplename + + String? sequencing_method + String? 
operator + Int min_depth = 10 + Int coverage_threshold = 100 + Boolean tbp_parser_debug = false + + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.1" + Int disk_size = 100 + Int memory = 4 + Int cpu = 1 + } + command <<< + # get version + python3 tbp-parser/tbp_parser/tbp_parser.py --version | tee VERSION + + # run tbp-parser; each flag literal ends with a space so the flag and its value are passed as separate arguments + python3 tbp-parser/tbp_parser/tbp_parser.py ~{tbprofiler_json} ~{tbprofiler_bam} \ + ~{"--sequencing_method " + sequencing_method} \ + ~{"--operator " + operator} \ + ~{"--min_depth " + min_depth} \ + ~{"--coverage_threshold " + coverage_threshold} \ + --output_prefix ~{samplename} \ + ~{true="--debug" false="--verbose" tbp_parser_debug} + + # get genome percent coverage for the entire reference genome length over min_depth + genome=$(samtools depth -J ~{tbprofiler_bam} | awk -F "\t" '{if ($3 >= ~{min_depth}) print;}' | wc -l ) + python3 -c "print ( ($genome / 4411532 ) * 100 )" | tee GENOME_PC + >>> + output { + File tbp_parser_looker_report_csv = "~{samplename}.looker_report.csv" + File tbp_parser_laboratorian_report_csv = "~{samplename}.laboratorian_report.csv" + File tbp_parser_lims_report_csv = "~{samplename}.lims_report.csv" + File tbp_parser_coverage_report = "~{samplename}.percent_gene_coverage.csv" + Float tbp_parser_genome_percent_coverage = read_float("GENOME_PC") + String tbp_parser_version = read_string("VERSION") + String tbp_parser_docker = docker + } + runtime { + docker: docker + memory: memory + " GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + } +} \ No newline at end of file diff --git a/tasks/species_typing/task_tbprofiler.wdl b/tasks/species_typing/task_tbprofiler.wdl index 1464cc1a1..a3a881243 100644 --- a/tasks/species_typing/task_tbprofiler.wdl +++ b/tasks/species_typing/task_tbprofiler.wdl @@ -9,7 +9,7 @@ task tbprofiler { String tbprofiler_docker_image = "us-docker.pkg.dev/general-theiagen/staphb/tbprofiler:4.4.2" Int disk_size = 100
String mapper = "bwa" - String caller = "bcftools" + String caller = "freebayes" Int min_depth = 10 Float min_af = 0.1 Float min_af_pred = 0.1 @@ -22,7 +22,7 @@ task tbprofiler { date | tee DATE # Print and save version - tb-profiler --version > VERSION && sed -i -e 's/^/TBProfiler version /' VERSION + tb-profiler version > VERSION && sed -i -e 's/^/TBProfiler version /' VERSION if [ -z "~{read2}" ] ; then INPUT_READS="-1 ~{read1}" @@ -89,6 +89,12 @@ task tbprofiler { res_genes.append(tsv_dict[i]) res_genes_string=';'.join(res_genes) Resistance_Genes.write(res_genes_string) + with open ("MEDIAN_COVERAGE", 'wt') as Median_Coverage: + median_coverage=tsv_dict['median_coverage'] + Median_Coverage.write(median_coverage) + with open ("PCT_READS_MAPPED", 'wt') as Pct_Reads_Mapped: + pct_reads_mapped=tsv_dict['pct_reads_mapped'] + Pct_Reads_Mapped.write(pct_reads_mapped) CODE >>> output { @@ -104,6 +110,8 @@ task tbprofiler { String tbprofiler_num_dr_variants = read_string("NUM_DR_VARIANTS") String tbprofiler_num_other_variants = read_string("NUM_OTHER_VARIANTS") String tbprofiler_resistance_genes = read_string("RESISTANCE_GENES") + Int tbprofiler_median_coverage = read_int("MEDIAN_COVERAGE") + Float tbprofiler_pct_reads_mapped = read_float("PCT_READS_MAPPED") } runtime { docker: "~{tbprofiler_docker_image}" diff --git a/tasks/species_typing/task_tbprofiler_output_parsing.wdl b/tasks/species_typing/task_tbprofiler_output_parsing.wdl deleted file mode 100644 index b49739828..000000000 --- a/tasks/species_typing/task_tbprofiler_output_parsing.wdl +++ /dev/null @@ -1,484 +0,0 @@ -version 1.0 - -task tbprofiler_output_parsing { - input { - File json - String output_seq_method_type - String operator - String samplename - Int min_depth = 10 - } - command <<< - python3 < 0: - confidences.append(confidence.array[0]) - else: - confidences.append("No annotation") - if len(frequency) > 0: - frequencies.append(frequency.array[0]) - else: - frequencies.append("1") - return 
confidences, frequencies - - ## Main Parsing Functions ## - - def parse_json_lab_report(json_file): - """ - This function recieved the tbprofiler output json file and - writes the Laboratorian report that includes the following information - per mutation: - - sample_id: inclides sample name - - tbprofiler_gene_name: gene name - - tbprofiler_locus_tag: locus tag - - tbprofiler_variant_substitution_type: variant substitution type (missense_variant, upstream_gene_variant...) - - tbprofiler_variant_substitution_nt: nucleotide substitution (c.1349C>G) - - tbprofiler_variant_substitution_aa: aminoacid substitution (p.Ser450Trp) - - confidence: tbprofiler annotation regarding resistance (Not assoc w R, Uncertain significance...) - - antimicrobial: antimicrobial the mutation is confering resistance to (streptomycin, rifampicin...) - - looker_interpretation: interpretation of resistance for Looker report (R, S, U, R-interim) - - mdl_interpretation: MDL interpretation of resistance (R,S,U) - - depth: depth of coverage at the mutation site (100) - - frequency: frequency of mutation at the site (1) - - read_support: number of reads supporting the mutation (100, depth*frequency) - - rationale: rationale for resistance calling (WHO classification, Expert rule) - - warning: column reserved for warnings such as low depth of coverage - """ - - df_laboratorian = pd.DataFrame(columns = ["sample_id","tbprofiler_gene_name","tbprofiler_locus_tag", - "tbprofiler_variant_substitution_type","tbprofiler_variant_substitution_nt", - "tbprofiler_variant_substitution_aa","confidence","antimicrobial", - "looker_interpretation","mdl_interpretation","depth","frequency", - "read_support","rationale","warning"]) - - row_list = [] - genes_reported = [] - - with open(json_file) as results_json_fh: - results_json = json.load(results_json_fh) - - # reported mutation by tb-profiler, all confering resistance by WHO criteria - for dr_variant in results_json["dr_variants"]: - if "annotation" in dr_variant: 
- try: # sometimes annotation is an empty list - if dr_variant["annotation"][0]["who_confidence"] == "": - confidence = "No WHO annotation" - else: - confidence = dr_variant["annotation"][0]["who_confidence"] - except: - confidence = "No WHO annotation" - else: - confidence = "No WHO annotation" - row = {} - row["sample_id"] = "~{samplename}" - row["tbprofiler_gene_name"] = dr_variant["gene"] - row["tbprofiler_locus_tag"] = dr_variant["locus_tag"] - row["tbprofiler_variant_substitution_type"] = dr_variant["type"] - row["tbprofiler_variant_substitution_nt"] = dr_variant["nucleotide_change"] - row["tbprofiler_variant_substitution_aa"] = dr_variant["protein_change"] if dr_variant["protein_change"] != "" else "NA" - row["confidence"] = confidence - row["antimicrobial"] = ",".join(dr_variant["gene_associated_drugs"]) - row["looker_interpretation"] = decipher_looker(row["confidence"]) - row["mdl_interpretation"] = decipher_MDL(row["confidence"]) - row["depth"] = int(dr_variant["depth"] or 0) - row["frequency"] = dr_variant["freq"] - row["read_support"] = row["depth"]*row["frequency"] - row["rationale"] = "WHO classification" - row["warning"] = "Low depth coverage" if row["depth"] < int('~{min_depth}') else "" - genes_reported.append(dr_variant["gene"]) - row_list.append(row) - - # mutations not reported by tb-profiler - application of expert rules to determine resistance - for other_variant in results_json["other_variants"]: - - # report only mutations that are NOT synonymous - if other_variant["type"] != "synonymous_variant": - - # Expert rule: mutations in katG, pncA, ethA or gid, classify as resistant - if other_variant["gene"] == "katG" or other_variant["gene"] == "pncA" or other_variant["gene"] == "ethA" or other_variant["gene"] == "gid": - if "annotation" in other_variant: - try: # sometimes annotation is an empty list - if other_variant["annotation"][0]["who_confidence"] == "": - confidence = "No WHO annotation" - else: - confidence = 
other_variant["annotation"][0]["who_confidence"] - except: - confidence = "No WHO annotation" - else: - confidence = "No WHO annotation" - row = {} - row["sample_id"] = "~{samplename}" - row["tbprofiler_gene_name"] = other_variant["gene"] - row["tbprofiler_locus_tag"] = other_variant["locus_tag"] - row["tbprofiler_variant_substitution_type"] = other_variant["type"] - row["tbprofiler_variant_substitution_nt"] = other_variant["nucleotide_change"] - row["tbprofiler_variant_substitution_aa"] = other_variant["protein_change"] if other_variant["protein_change"] != "" else "NA" - row["confidence"] = confidence - row["antimicrobial"] = ",".join(other_variant["gene_associated_drugs"]) - row["looker_interpretation"] = decipher_looker(row["confidence"]) - row["mdl_interpretation"] = decipher_MDL(row["confidence"]) - row["depth"] = int(other_variant["depth"] or 0) - row["frequency"] = other_variant["freq"] - row["read_support"] = row["depth"]*row["frequency"] - row["rationale"] = "Resistant based on expert rule" - row["warning"] = "Low depth coverage" if row["depth"] < int('~{min_depth}') else "" - genes_reported.append(other_variant["gene"]) - row_list.append(row) - - # Expert rule: in case mutation occurs between codons 426 and 452 of rpoB gene, classify as resistant - if other_variant["gene"] == "rpoB": - position = get_codon(other_variant["protein_change"]) - row = {} - row["sample_id"] = "~{samplename}" - row["tbprofiler_gene_name"] = other_variant["gene"] - row["tbprofiler_locus_tag"] = other_variant["locus_tag"] - row["tbprofiler_variant_substitution_type"] = other_variant["type"] - row["tbprofiler_variant_substitution_nt"] = other_variant["nucleotide_change"] - row["tbprofiler_variant_substitution_aa"] = other_variant["protein_change"] if other_variant["protein_change"] != "" else "NA" - row["confidence"] = "No WHO annotation" - row["antimicrobial"] = ",".join(other_variant["gene_associated_drugs"]) - row["looker_interpretation"] = decipher_looker(row["confidence"]) - 
row["mdl_interpretation"] = decipher_MDL(row["confidence"]) - row["depth"] = int(other_variant["depth"] or 0) - row["frequency"] = other_variant["freq"] - row["read_support"] = row["depth"]*row["frequency"] - row["rationale"] = "Resistant based on expert rule" if 426 <= position <= 452 else "Uncertain significance based on expert rule" - row["warning"] = "Low depth coverage" if row["depth"] < int('~{min_depth}') else "" - genes_reported.append(other_variant["gene"]) - row_list.append(row) - - for gene, resistance_list in gene_to_resistance.items(): - for resistance in resistance_list: - if gene not in genes_reported: - row = {} - row["sample_id"] = "~{samplename}" - row["tbprofiler_gene_name"] = gene - row["tbprofiler_locus_tag"] = gene_to_locus_tag[gene] - row["tbprofiler_variant_substitution_type"] = "WT" - row["tbprofiler_variant_substitution_nt"] = "NA" - row["tbprofiler_variant_substitution_aa"] = "NA" - row["confidence"] = "NA" - row["antimicrobial"] = resistance - row["looker_interpretation"] = "NA" - row["mdl_interpretation"] = "NA" - row["depth"] = "NA" - row["frequency"] = "NA" - row["read_support"] = "NA" - row["rationale"] = "NA" - row["warning"] = "NA" - row_list.append(row) - - df_laboratorian = df_laboratorian.append(row_list, ignore_index=True) - df_laboratorian.to_csv("tbprofiler_laboratorian_report.csv", index=False) - - def parse_json_lims_report(json_file, formatted_time): - """ - This function recieves the tbprofiler output json file and - writes the LIMS report that includes the following information - per sample: - - MDL sample accession numbers: includes sample name - - M_DST_A01_ID - includes lineage - - The set of information in gene_dict dictionary with target drug resistance information - in layman's terms, and the mutations responsible for the predicted phenotype - - Date of analysis in YYYY-MM-DD HH:SS format - - Operator information - """ - - lineage = get_lineage("~{json}") - mutations = parse_json_mutations("~{json}") - resistance = 
parse_json_resistance("~{json}") - df_lims = pd.DataFrame({"MDL sample accession numbers":"~{samplename}", "M_DST_A01_ID": lineage},index=[0]) - - for antimicrobial, genes in gene_dict.items(): - if antimicrobial_dict[antimicrobial] in resistance.keys(): - df_lims[antimicrobial] = translate(resistance[antimicrobial_dict[antimicrobial]], antimicrobial_dict[antimicrobial]) - else: - df_lims[antimicrobial] = "No resistance to {} detected".format(antimicrobial_dict[antimicrobial]) - for gene_name, gene_id in genes.items(): - if gene_name in mutations.keys(): - df_lims[gene_id] = mutations[gene_name] - else: - df_lims[gene_id] = "No mutations detected" - - df_lims["Analysis date"] = formatted_time - df_lims["Operator"] = "~{operator}" - df_lims.to_csv("tbprofiler_lims_report.csv", index=False) - - def parse_json_looker_report(json_file, current_time): - """ - This function recieves the tbprofiler output json file and - writes the Looker report that includes the following information - per sample: - - sample_id: includes sample name - - for each antimicrobial, indication if its resistant (R) or susceptible (S) - """ - resistance = parse_json_resistance("~{json}") - df_looker = pd.DataFrame({"sample_id":"~{samplename}", "output_seq_method_type": "~{output_seq_method_type}"},index=[0]) - - for antimicrobial in antimicrobial_list: - if antimicrobial in resistance.keys(): - df_looker[antimicrobial] = decipher_looker(resistance[antimicrobial]) - else: - df_looker[antimicrobial] = "S" - - df_looker["analysis_date"] = current_time - df_looker["operator"] = "~{operator}" - - df_looker.to_csv("tbprofiler_looker.csv", index=False) - - ### Report Generation ### - - # get timestamp in YYYY-MM-DD HH:MM format - current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') - - # Laboratorian report generation - parse_json_lab_report("~{json}") - - # LIMS report generation - parse_json_lims_report("~{json}", current_time) - - # LOOKER report generation - 
parse_json_looker_report("~{json}", current_time) - - CODE - >>> - output { - File tbprofiler_looker_csv = "tbprofiler_looker.csv" - File tbprofiler_laboratorian_report_csv = "tbprofiler_laboratorian_report.csv" - File tbprofiler_lims_report_csv = "tbprofiler_lims_report.csv" - } - runtime { - docker: "us-docker.pkg.dev/general-theiagen/theiagen/utility:1.2" - memory: "4 GB" - cpu: 1 - disks: "local-disk " + 10 + " SSD" - disk: 10 + " GB" - maxRetries: 0 - } -} \ No newline at end of file diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index e960a7eb1..c0cf4e819 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -799,10 +799,11 @@ workflow theiaprok_illumina_pe { String? tbprofiler_sub_lineage = merlin_magic.tbprofiler_sub_lineage String? tbprofiler_dr_type = merlin_magic.tbprofiler_dr_type String? tbprofiler_resistance_genes = merlin_magic.tbprofiler_resistance_genes - File? tbprofiler_lims_report_csv = merlin_magic.tbprofiler_lims_report_csv - File? tbprofiler_looker_csv = merlin_magic.tbprofiler_looker_csv - File? tbprofiler_laboratorian_report_csv = merlin_magic.tbprofiler_laboratorian_report_csv - File? tbprofiler_resistance_genes_percent_coverage = merlin_magic.tb_resistance_genes_percent_coverage + File? tbp_parser_lims_report_csv = merlin_magic.tbp_parser_lims_report_csv + File? tbp_parser_looker_report_csv = merlin_magic.tbp_parser_looker_report_csv + File? tbp_parser_laboratorian_report_csv = merlin_magic.tbp_parser_laboratorian_report_csv + File? tbp_parser_coverage_report = merlin_magic.tbp_parser_coverage_report + Float? tbp_parser_genome_percent_coverage = merlin_magic.tbp_parser_genome_percent_coverage # Legionella pneumophila typing File? legsta_results = merlin_magic.legsta_results String? 
legsta_predicted_sbt = merlin_magic.legsta_predicted_sbt diff --git a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl index 4f2025bd6..a827303a5 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl @@ -751,10 +751,11 @@ workflow theiaprok_illumina_se { String? tbprofiler_sub_lineage = merlin_magic.tbprofiler_sub_lineage String? tbprofiler_dr_type = merlin_magic.tbprofiler_dr_type String? tbprofiler_resistance_genes = merlin_magic.tbprofiler_resistance_genes - File? tbprofiler_lims_report_csv = merlin_magic.tbprofiler_lims_report_csv - File? tbprofiler_laboratorian_report_csv = merlin_magic.tbprofiler_laboratorian_report_csv - File? tbprofiler_looker_csv = merlin_magic.tbprofiler_looker_csv - File? tbprofiler_resistance_genes_percent_coverage = merlin_magic.tb_resistance_genes_percent_coverage + File? tbp_parser_lims_report_csv = merlin_magic.tbp_parser_lims_report_csv + File? tbp_parser_looker_report_csv = merlin_magic.tbp_parser_looker_report_csv + File? tbp_parser_laboratorian_report_csv = merlin_magic.tbp_parser_laboratorian_report_csv + File? tbp_parser_coverage_report = merlin_magic.tbp_parser_coverage_report + Float? tbp_parser_genome_percent_coverage = merlin_magic.tbp_parser_genome_percent_coverage # Legionella pneumophila typing File? legsta_results = merlin_magic.legsta_results String? legsta_predicted_sbt = merlin_magic.legsta_predicted_sbt diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 06628be0a..3741fc42c 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -713,11 +713,12 @@ workflow theiaprok_ont { String? tbprofiler_sub_lineage = merlin_magic.tbprofiler_sub_lineage String? tbprofiler_dr_type = merlin_magic.tbprofiler_dr_type String? tbprofiler_resistance_genes = merlin_magic.tbprofiler_resistance_genes - File? 
tbprofiler_laboratorian_report_csv = merlin_magic.tbprofiler_laboratorian_report_csv - File? tbprofiler_lims_report_csv = merlin_magic.tbprofiler_lims_report_csv - File? tbprofiler_looker_csv = merlin_magic.tbprofiler_looker_csv - File? tbprofiler_resistance_genes_percent_coverage = merlin_magic.tb_resistance_genes_percent_coverage - # Legionella pneumophila typing + File? tbp_parser_lims_report_csv = merlin_magic.tbp_parser_lims_report_csv + File? tbp_parser_looker_report_csv = merlin_magic.tbp_parser_looker_report_csv + File? tbp_parser_laboratorian_report_csv = merlin_magic.tbp_parser_laboratorian_report_csv + File? tbp_parser_coverage_report = merlin_magic.tbp_parser_coverage_report + Float? tbp_parser_genome_percent_coverage = merlin_magic.tbp_parser_genome_percent_coverage + # Legionella pneumophila typing File? legsta_results = merlin_magic.legsta_results String? legsta_predicted_sbt = merlin_magic.legsta_predicted_sbt String? legsta_version = merlin_magic.legsta_version diff --git a/workflows/utilities/wf_merlin_magic.wdl b/workflows/utilities/wf_merlin_magic.wdl index b6fd6e2ae..107a26df6 100644 --- a/workflows/utilities/wf_merlin_magic.wdl +++ b/workflows/utilities/wf_merlin_magic.wdl @@ -11,8 +11,7 @@ import "../../tasks/species_typing/task_sistr.wdl" as sistr_task import "../../tasks/species_typing/task_seqsero2.wdl" as seqsero2_task import "../../tasks/species_typing/task_kleborate.wdl" as kleborate_task import "../../tasks/species_typing/task_tbprofiler.wdl" as tbprofiler_task -import "../../tasks/species_typing/task_tbprofiler_output_parsing.wdl" as tbprofiler_output_parsing_task -import "../../tasks/species_typing/task_tb_gene_coverage.wdl" as tb_gene_coverage_task +import "../../tasks/species_typing/task_tbp_parser.wdl" as tbp_parser_task import "../../tasks/species_typing/task_legsta.wdl" as legsta_task import "../../tasks/species_typing/task_genotyphi.wdl" as genotyphi import "../../tasks/species_typing/task_kaptive.wdl" as kaptive_task @@ -63,8 
+62,12 @@ workflow merlin_magic { Boolean assembly_only = false Boolean theiaeuk = false Boolean tbprofiler_additional_outputs = false - String tbprofiler_output_seq_method_type = "WGS" - String tbprofiler_operator = "Default" + String tbp_parser_output_seq_method_type = "WGS" + String? tbp_parser_operator + Int? tbp_parser_min_depth + Int? tbp_parser_coverage_threshold + Boolean? tbp_parser_debug + String? tbp_parser_docker_image String? snippy_query_gene Int srst2_min_cov = 80 Int srst2_max_divergence = 20 @@ -239,18 +242,18 @@ workflow merlin_magic { ont_data = ont_data } if (tbprofiler_additional_outputs) { - call tbprofiler_output_parsing_task.tbprofiler_output_parsing{ + call tbp_parser_task.tbp_parser { input: - json = tbprofiler.tbprofiler_output_json, - output_seq_method_type = tbprofiler_output_seq_method_type, - operator = tbprofiler_operator, - samplename = samplename - } - call tb_gene_coverage_task.tb_gene_coverage { - input: - bamfile = tbprofiler.tbprofiler_output_bam, - bamindex = tbprofiler.tbprofiler_output_bai, - samplename = samplename + tbprofiler_json = tbprofiler.tbprofiler_output_json, + tbprofiler_bam = tbprofiler.tbprofiler_output_bam, + tbprofiler_bai = tbprofiler.tbprofiler_output_bai, + samplename = samplename, + sequencing_method = tbp_parser_output_seq_method_type, + operator = tbp_parser_operator, + min_depth = tbp_parser_min_depth, + coverage_threshold = tbp_parser_coverage_threshold, + tbp_parser_debug = tbp_parser_debug, + docker = tbp_parser_docker_image } } } @@ -569,10 +572,15 @@ workflow merlin_magic { String? tbprofiler_sub_lineage = tbprofiler.tbprofiler_sub_lineage String? tbprofiler_dr_type = tbprofiler.tbprofiler_dr_type String? tbprofiler_resistance_genes = tbprofiler.tbprofiler_resistance_genes - File? tbprofiler_lims_report_csv = tbprofiler_output_parsing.tbprofiler_lims_report_csv - File? tbprofiler_laboratorian_report_csv = tbprofiler_output_parsing.tbprofiler_laboratorian_report_csv - File? 
tbprofiler_looker_csv = tbprofiler_output_parsing.tbprofiler_looker_csv - File? tb_resistance_genes_percent_coverage = tb_gene_coverage.tb_resistance_genes_percent_coverage + Int? tbprofiler_median_coverage = tbprofiler.tbprofiler_median_coverage + Float? tbprofiler_pct_reads_mapped = tbprofiler.tbprofiler_pct_reads_mapped + String? tbp_parser_version = tbp_parser.tbp_parser_version + String? tbp_parser_docker = tbp_parser.tbp_parser_docker + File? tbp_parser_lims_report_csv = tbp_parser.tbp_parser_lims_report_csv + File? tbp_parser_laboratorian_report_csv = tbp_parser.tbp_parser_laboratorian_report_csv + File? tbp_parser_looker_report_csv = tbp_parser.tbp_parser_looker_report_csv + File? tbp_parser_coverage_report = tbp_parser.tbp_parser_coverage_report + Float? tbp_parser_genome_percent_coverage = tbp_parser.tbp_parser_genome_percent_coverage # Legionella pneumophila Typing File? legsta_results = legsta.legsta_results String? legsta_predicted_sbt = legsta.legsta_predicted_sbt From 5c769b655e827ef6c1b3128eb90cc3ae6657bb4e Mon Sep 17 00:00:00 2001 From: cimendes Date: Tue, 12 Sep 2023 09:19:31 +0000 Subject: [PATCH 02/23] add root --- tasks/species_typing/task_tbp_parser.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index ac6283d16..0d0d6893d 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -20,10 +20,10 @@ task tbp_parser { } command <<< # get version - python3 tbp-parser/tbp_parser/tbp_parser.py --version | tee VERSION + python3 /tbp-parser/tbp_parser/tbp_parser.py --version | tee VERSION # run tbp-parser - python3 tbp-parser/tbp_parser/tbp_parser.py ~{tbprofiler_json} ~{tbprofiler_bam} \ + python3 /tbp-parser/tbp_parser/tbp_parser.py ~{tbprofiler_json} ~{tbprofiler_bam} \ ~{"--sequencing_method" + sequencing_method} \ ~{"--operator" + operator} \ ~{"--min_depth" + min_depth} \ From 
cc2d611079ad1e1774794af5b54d560547370d47 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 12 Sep 2023 13:37:46 +0000 Subject: [PATCH 03/23] update docker, add spaces --- tasks/species_typing/task_tbp_parser.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 0d0d6893d..5fab6d0c6 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.1" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.2" Int disk_size = 100 Int memory = 4 Int cpu = 1 @@ -24,10 +24,10 @@ task tbp_parser { # run tbp-parser python3 /tbp-parser/tbp_parser/tbp_parser.py ~{tbprofiler_json} ~{tbprofiler_bam} \ - ~{"--sequencing_method" + sequencing_method} \ - ~{"--operator" + operator} \ - ~{"--min_depth" + min_depth} \ - ~{"--coverage_threshold" + coverage_threshold} \ + ~{"--sequencing_method " + sequencing_method} \ + ~{"--operator " + operator} \ + ~{"--min_depth " + min_depth} \ + ~{"--coverage_threshold " + coverage_threshold} \ --output_prefix ~{samplename} \ ~{true="--debug" false="--verbose" tbp_parser_debug} From a743fefbd3a83896c89a9174bfd08cd1d55fa531 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 12 Sep 2023 15:57:03 +0000 Subject: [PATCH 04/23] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 5fab6d0c6..37da00c72 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = 
"us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.2" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.3" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 0065d251514d571cba7dd232d1fe93c5be3e154e Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 12 Sep 2023 17:55:38 +0000 Subject: [PATCH 05/23] update container --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 37da00c72..2093a8fa2 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.3" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.4" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 2f9e80a2464396152340590ec20d4e34499528fc Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 13 Sep 2023 15:26:10 +0000 Subject: [PATCH 06/23] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 2093a8fa2..f86302275 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.4" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.5" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 5227d77f7e95097f5b3e5d66048cdf2bbdc61c67 Mon Sep 17 00:00:00 2001 From: Ash O'Farrell Date: Thu, 14 Sep 2023 06:57:00 -0700 Subject: [PATCH 07/23] Add preemptible, shorter version string (#185) --- 
tasks/species_typing/task_tbp_parser.wdl | 3 ++- tasks/species_typing/task_tbprofiler.wdl | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index f86302275..2f5824979 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -50,6 +50,7 @@ task tbp_parser { cpu: cpu disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" - maxRetries: 3 + maxRetries: 3 + preemptible: 1 } } \ No newline at end of file diff --git a/tasks/species_typing/task_tbprofiler.wdl b/tasks/species_typing/task_tbprofiler.wdl index a3a881243..7b9cad530 100644 --- a/tasks/species_typing/task_tbprofiler.wdl +++ b/tasks/species_typing/task_tbprofiler.wdl @@ -22,7 +22,7 @@ task tbprofiler { date | tee DATE # Print and save version - tb-profiler version > VERSION && sed -i -e 's/^/TBProfiler version /' VERSION + tb-profiler version > VERSION && sed -i -e 's/TBProfiler version //' VERSION && sed -n -i '$p' VERSION if [ -z "~{read2}" ] ; then INPUT_READS="-1 ~{read1}" @@ -119,6 +119,7 @@ task tbprofiler { cpu: cpu disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" - maxRetries: 3 + maxRetries: 3 + preemptible: 1 } } \ No newline at end of file From 8fcfefa47339e0fd40a268d8dabf69f86453f317 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 15 Sep 2023 14:53:15 +0000 Subject: [PATCH 08/23] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 2f5824979..20a0a19d2 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.5" + String docker = 
"us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.6" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 7d00801bd2756922f7771dcbd54f4153e5cc6bb1 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 15 Sep 2023 16:02:24 +0000 Subject: [PATCH 09/23] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 20a0a19d2..aecf88b8b 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.6" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.7" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 51ddccdbc96936329dbbf3bf3ef1f1e3904b287f Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 15 Sep 2023 16:56:18 +0000 Subject: [PATCH 10/23] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index aecf88b8b..4f82830a8 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.7" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.8" Int disk_size = 100 Int memory = 4 Int cpu = 1 From c924d880beff6b9be7b69d18329465eb7e497cd5 Mon Sep 17 00:00:00 2001 From: frankambrosio3 Date: Mon, 18 Sep 2023 13:05:44 +0000 Subject: [PATCH 11/23] Added clockwork task to theiaprok illumina pe --- tasks/species_typing/task_clockwork.wdl | 49 +++++++++++++++++++ 
.../theiaprok/wf_theiaprok_illumina_pe.wdl | 3 ++ workflows/utilities/wf_merlin_magic.wdl | 17 +++++-- 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 tasks/species_typing/task_clockwork.wdl diff --git a/tasks/species_typing/task_clockwork.wdl b/tasks/species_typing/task_clockwork.wdl new file mode 100644 index 000000000..47d96666a --- /dev/null +++ b/tasks/species_typing/task_clockwork.wdl @@ -0,0 +1,49 @@ +version 1.0 + +task clockwork_decon_reads { + # Inputs + input { + File read1 + File? read2 + String samplename + Int disk_size = 200 + Int cpu = 16 + Int mem = 32 + } + + command <<< + # Print and save date + date | tee DATE + + # Print and save version + clockwork version > VERSION + + # Map reads to the clockwork reference + clockwork map_reads \ + --unsorted_sam ~{samplename} /varpipe_wgs/tools/clockwork-0.11.3/OUT/ref.fa \ + "~{samplename}.sam" \ + ~{read1} \ + ~{read2} + + # Remove contaminants (reads that map with high identity to non-MTB sequences) + clockwork remove_contam \ + /varpipe_wgs/tools/clockwork-0.11.3/OUT/remove_contam_metadata.tsv \ + "~{samplename}.sam" \ + "~{samplename}_outfile_read_counts" \ + "./clockwork_cleaned_~{samplename}_R1.fastq.gz" \ + "./clockwork_cleaned_~{samplename}_R2.fastq.gz" + + >>> + output { + File clockwork_cleaned_read1 = "./clockwork_cleaned_~{samplename}_R1.fastq.gz" + File clockwork_cleaned_read2 = "./clockwork_cleaned_~{samplename}_R2.fastq.gz" + } + runtime { + docker: "us-docker.pkg.dev/general-theiagen/cdcgov/varpipe_wgs_with_refs:2bc7234074bd53d9e92a1048b0485763cd9bbf6f4d12d5a1cc82bfec8ca7d75e" + memory: "~{mem} GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + } +} \ No newline at end of file diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index c0cf4e819..da8d00018 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ 
b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -804,6 +804,9 @@ workflow theiaprok_illumina_pe { File? tbp_parser_laboratorian_report_csv = merlin_magic.tbp_parser_laboratorian_report_csv File? tbp_parser_coverage_report = merlin_magic.tbp_parser_coverage_report Float? tbp_parser_genome_percent_coverage = merlin_magic.tbp_parser_genome_percent_coverage + File? clockwork_decontaminated_read1 = merlin_magic.clockwork_cleaned_read1 + File? clockwork_decontaminated_read2 = merlin_magic.clockwork_cleaned_read2 + # Legionella pneumophila typing File? legsta_results = merlin_magic.legsta_results String? legsta_predicted_sbt = merlin_magic.legsta_predicted_sbt diff --git a/workflows/utilities/wf_merlin_magic.wdl b/workflows/utilities/wf_merlin_magic.wdl index 107a26df6..03ee03856 100644 --- a/workflows/utilities/wf_merlin_magic.wdl +++ b/workflows/utilities/wf_merlin_magic.wdl @@ -10,6 +10,7 @@ import "../../tasks/species_typing/task_lissero.wdl" as lissero_task import "../../tasks/species_typing/task_sistr.wdl" as sistr_task import "../../tasks/species_typing/task_seqsero2.wdl" as seqsero2_task import "../../tasks/species_typing/task_kleborate.wdl" as kleborate_task +import "../../tasks/species_typing/task_clockwork.wdl" as clockwork_task import "../../tasks/species_typing/task_tbprofiler.wdl" as tbprofiler_task import "../../tasks/species_typing/task_tbp_parser.wdl" as tbp_parser_task import "../../tasks/species_typing/task_legsta.wdl" as legsta_task @@ -234,14 +235,19 @@ workflow merlin_magic { } if (merlin_tag == "Mycobacterium tuberculosis") { if (!assembly_only) { - call tbprofiler_task.tbprofiler { # needs testing + call clockwork_task.clockwork_decon_reads { input: read1 = select_first([read1]), read2 = read2, - samplename = samplename, - ont_data = ont_data + samplename = samplename } - if (tbprofiler_additional_outputs) { + call tbprofiler_task.tbprofiler { # needs testing + input: + read1 = clockwork_decon_reads.clockwork_cleaned_read1, + read2 = 
clockwork_decon_reads.clockwork_cleaned_read2, + samplename = samplename + } + if (tbprofiler_additional_outputs) { #needs ONT support call tbp_parser_task.tbp_parser { input: tbprofiler_json = tbprofiler.tbprofiler_output_json, @@ -581,6 +587,9 @@ workflow merlin_magic { File? tbp_parser_looker_report_csv = tbp_parser.tbp_parser_looker_report_csv File? tbp_parser_coverage_report = tbp_parser.tbp_parser_coverage_report Float? tbp_parser_genome_percent_coverage = tbp_parser.tbp_parser_genome_percent_coverage + File? clockwork_cleaned_read1 = clockwork_decon_reads.clockwork_cleaned_read1 + File? clockwork_cleaned_read2 = clockwork_decon_reads.clockwork_cleaned_read2 + # Legionella pneumophila Typing File? legsta_results = legsta.legsta_results String? legsta_predicted_sbt = legsta.legsta_predicted_sbt From 2c5fee974c5b02fc3a298404b475578f209a81c3 Mon Sep 17 00:00:00 2001 From: frankambrosio3 Date: Mon, 18 Sep 2023 15:54:14 +0000 Subject: [PATCH 12/23] 64gb ram --- tasks/species_typing/task_clockwork.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_clockwork.wdl b/tasks/species_typing/task_clockwork.wdl index 47d96666a..b296850f1 100644 --- a/tasks/species_typing/task_clockwork.wdl +++ b/tasks/species_typing/task_clockwork.wdl @@ -8,7 +8,7 @@ task clockwork_decon_reads { String samplename Int disk_size = 200 Int cpu = 16 - Int mem = 32 + Int mem = 64 } command <<< From 9d4235ed5aaf054c05420016022fdf072f07a8df Mon Sep 17 00:00:00 2001 From: cimendes Date: Mon, 18 Sep 2023 16:41:34 +0000 Subject: [PATCH 13/23] remove sam file at the end --- tasks/species_typing/task_clockwork.wdl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tasks/species_typing/task_clockwork.wdl b/tasks/species_typing/task_clockwork.wdl index b296850f1..8c7d2f680 100644 --- a/tasks/species_typing/task_clockwork.wdl +++ b/tasks/species_typing/task_clockwork.wdl @@ -9,6 +9,7 @@ task clockwork_decon_reads { Int disk_size = 200 Int 
cpu = 16 Int mem = 64 + String docker = "us-docker.pkg.dev/general-theiagen/cdcgov/varpipe_wgs_with_refs:2bc7234074bd53d9e92a1048b0485763cd9bbf6f4d12d5a1cc82bfec8ca7d75e" } command <<< @@ -33,13 +34,16 @@ task clockwork_decon_reads { "./clockwork_cleaned_~{samplename}_R1.fastq.gz" \ "./clockwork_cleaned_~{samplename}_R2.fastq.gz" + # Clean up files + rm "~{samplename}.sam" + >>> output { File clockwork_cleaned_read1 = "./clockwork_cleaned_~{samplename}_R1.fastq.gz" File clockwork_cleaned_read2 = "./clockwork_cleaned_~{samplename}_R2.fastq.gz" } runtime { - docker: "us-docker.pkg.dev/general-theiagen/cdcgov/varpipe_wgs_with_refs:2bc7234074bd53d9e92a1048b0485763cd9bbf6f4d12d5a1cc82bfec8ca7d75e" + docker: docker memory: "~{mem} GB" cpu: cpu disks: "local-disk " + disk_size + " SSD" From d617a9f09d838c1ca14ffb7b38c9a0915be680a4 Mon Sep 17 00:00:00 2001 From: frankambrosio3 Date: Tue, 19 Sep 2023 18:01:59 +0000 Subject: [PATCH 14/23] v8 --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 5fab6d0c6..af846b8f9 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.2" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.8" Int disk_size = 100 Int memory = 4 Int cpu = 1 From ac4fd39a48a5304dc6ea95b16748ccb545aee174 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 20 Sep 2023 14:55:37 +0000 Subject: [PATCH 15/23] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 4f82830a8..a4258914f 100644 --- 
a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.8" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.9" Int disk_size = 100 Int memory = 4 Int cpu = 1 From c84ce51cc75bf56af80c2749e5bf2cf39cd1b5b7 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 20 Sep 2023 16:07:15 +0000 Subject: [PATCH 16/23] apply style guidelines --- tasks/species_typing/task_clockwork.wdl | 37 +++++++++++-------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/tasks/species_typing/task_clockwork.wdl b/tasks/species_typing/task_clockwork.wdl index 8c7d2f680..6e0d027f9 100644 --- a/tasks/species_typing/task_clockwork.wdl +++ b/tasks/species_typing/task_clockwork.wdl @@ -1,53 +1,48 @@ version 1.0 task clockwork_decon_reads { - # Inputs input { File read1 - File? 
read2 + File read2 String samplename Int disk_size = 200 Int cpu = 16 Int mem = 64 String docker = "us-docker.pkg.dev/general-theiagen/cdcgov/varpipe_wgs_with_refs:2bc7234074bd53d9e92a1048b0485763cd9bbf6f4d12d5a1cc82bfec8ca7d75e" - } - + } command <<< - # Print and save date - date | tee DATE - # Print and save version clockwork version > VERSION # Map reads to the clockwork reference clockwork map_reads \ - --unsorted_sam ~{samplename} /varpipe_wgs/tools/clockwork-0.11.3/OUT/ref.fa \ - "~{samplename}.sam" \ - ~{read1} \ - ~{read2} + --unsorted_sam ~{samplename} /varpipe_wgs/tools/clockwork-0.11.3/OUT/ref.fa \ + "~{samplename}.sam" \ + ~{read1} \ + ~{read2} # Remove contaminants (reads that map with high identity to non-MTB sequences) clockwork remove_contam \ - /varpipe_wgs/tools/clockwork-0.11.3/OUT/remove_contam_metadata.tsv \ - "~{samplename}.sam" \ - "~{samplename}_outfile_read_counts" \ - "./clockwork_cleaned_~{samplename}_R1.fastq.gz" \ - "./clockwork_cleaned_~{samplename}_R2.fastq.gz" + /varpipe_wgs/tools/clockwork-0.11.3/OUT/remove_contam_metadata.tsv \ + "~{samplename}.sam" \ + "~{samplename}_outfile_read_counts" \ + "clockwork_cleaned_~{samplename}_R1.fastq.gz" \ + "clockwork_cleaned_~{samplename}_R2.fastq.gz" # Clean up files rm "~{samplename}.sam" - >>> output { - File clockwork_cleaned_read1 = "./clockwork_cleaned_~{samplename}_R1.fastq.gz" - File clockwork_cleaned_read2 = "./clockwork_cleaned_~{samplename}_R2.fastq.gz" + File clockwork_cleaned_read1 = "clockwork_cleaned_~{samplename}_R1.fastq.gz" + File clockwork_cleaned_read2 = "clockwork_cleaned_~{samplename}_R2.fastq.gz" } runtime { docker: docker - memory: "~{mem} GB" + memory: mem + " GB" cpu: cpu disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" - maxRetries: 3 + maxRetries: 3 + preemptible: 1 } } \ No newline at end of file From 7c1a4f194ef2bccaf57e4a9dd7788b29314bde42 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 20 Sep 2023 16:11:12 +0000 Subject: [PATCH 17/23] apply 
style guidelines --- tasks/species_typing/task_clockwork.wdl | 2 +- workflows/theiaprok/wf_theiaprok_illumina_pe.wdl | 1 - workflows/utilities/wf_merlin_magic.wdl | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tasks/species_typing/task_clockwork.wdl b/tasks/species_typing/task_clockwork.wdl index 6e0d027f9..bfe304864 100644 --- a/tasks/species_typing/task_clockwork.wdl +++ b/tasks/species_typing/task_clockwork.wdl @@ -3,7 +3,7 @@ version 1.0 task clockwork_decon_reads { input { File read1 - File read2 + File? read2 # only optional to not fail in merlin_magic String samplename Int disk_size = 200 Int cpu = 16 diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index da8d00018..c241e8f8e 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -806,7 +806,6 @@ workflow theiaprok_illumina_pe { Float? tbp_parser_genome_percent_coverage = merlin_magic.tbp_parser_genome_percent_coverage File? clockwork_decontaminated_read1 = merlin_magic.clockwork_cleaned_read1 File? clockwork_decontaminated_read2 = merlin_magic.clockwork_cleaned_read2 - # Legionella pneumophila typing File? legsta_results = merlin_magic.legsta_results String? 
legsta_predicted_sbt = merlin_magic.legsta_predicted_sbt diff --git a/workflows/utilities/wf_merlin_magic.wdl b/workflows/utilities/wf_merlin_magic.wdl index 03ee03856..5e9600f67 100644 --- a/workflows/utilities/wf_merlin_magic.wdl +++ b/workflows/utilities/wf_merlin_magic.wdl @@ -241,13 +241,13 @@ workflow merlin_magic { read2 = read2, samplename = samplename } - call tbprofiler_task.tbprofiler { # needs testing + call tbprofiler_task.tbprofiler { input: read1 = clockwork_decon_reads.clockwork_cleaned_read1, read2 = clockwork_decon_reads.clockwork_cleaned_read2, samplename = samplename } - if (tbprofiler_additional_outputs) { #needs ONT support + if (tbprofiler_additional_outputs) { call tbp_parser_task.tbp_parser { input: tbprofiler_json = tbprofiler.tbprofiler_output_json, From 759180c0527f88f19063a47b2640e788685f7a91 Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 21 Sep 2023 08:19:12 +0000 Subject: [PATCH 18/23] update md5sum --- tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml | 6 +++--- tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index 5fa38e056..6be0c145b 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -620,7 +620,7 @@ - path: miniwdl_run/wdl/tasks/species_typing/task_sonneityping.wdl md5sum: aeb12cf9a0db4e34f2aecbaba51c30fb - path: miniwdl_run/wdl/tasks/species_typing/task_tbprofiler.wdl - md5sum: b1676a1713bab967b3534da1790fa013 + md5sum: 27428d62762d736cd5fc3a034abcfeb9 - path: miniwdl_run/wdl/tasks/species_typing/task_ts_mlst.wdl md5sum: 550791ca5faf11a5f75b6be18739ae01 - path: miniwdl_run/wdl/tasks/task_versioning.wdl @@ -634,9 +634,9 @@ - path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl md5sum: 
0236363c7f0694cd3f96416aa43e2f91 - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl - md5sum: a76d59109075ce8b861e63ffe70d7c77 + md5sum: 15e86c99f08d8b8175b18646551d671c - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl - md5sum: 53555c2f3e144e55f362080c5e75e434 + md5sum: e9dbfd1ee9d3c2009963e2fbea352a4b - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl md5sum: 40d4e09a82030c8219b37f883cddaca4 - path: miniwdl_run/workflow.log diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index dcb4661b5..7056c728f 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -588,7 +588,7 @@ - path: miniwdl_run/wdl/tasks/species_typing/task_sonneityping.wdl md5sum: aeb12cf9a0db4e34f2aecbaba51c30fb - path: miniwdl_run/wdl/tasks/species_typing/task_tbprofiler.wdl - md5sum: b1676a1713bab967b3534da1790fa013 + md5sum: 27428d62762d736cd5fc3a034abcfeb9 - path: miniwdl_run/wdl/tasks/species_typing/task_ts_mlst.wdl md5sum: 550791ca5faf11a5f75b6be18739ae01 - path: miniwdl_run/wdl/tasks/task_versioning.wdl @@ -602,9 +602,9 @@ - path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl md5sum: 0236363c7f0694cd3f96416aa43e2f91 - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl - md5sum: 858d33eb64b9bda618a47a999f370df5 + md5sum: 128c159032400b1f275da24869892a82 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl - md5sum: 53555c2f3e144e55f362080c5e75e434 + md5sum: e9dbfd1ee9d3c2009963e2fbea352a4b - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl md5sum: 53d322d895837c0bcb049786572e944d - path: miniwdl_run/workflow.log From 57e3ef69d2fbb63177dcfbb6a48ed5066bfb32f2 Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 21 Sep 2023 12:11:59 +0000 Subject: [PATCH 19/23] only run clockwork is paired_end and not ont data --- 
workflows/utilities/wf_merlin_magic.wdl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/workflows/utilities/wf_merlin_magic.wdl b/workflows/utilities/wf_merlin_magic.wdl index 5e9600f67..22124be37 100644 --- a/workflows/utilities/wf_merlin_magic.wdl +++ b/workflows/utilities/wf_merlin_magic.wdl @@ -235,16 +235,18 @@ workflow merlin_magic { } if (merlin_tag == "Mycobacterium tuberculosis") { if (!assembly_only) { - call clockwork_task.clockwork_decon_reads { - input: - read1 = select_first([read1]), - read2 = read2, - samplename = samplename + if (paired_end && !ont_data) { + call clockwork_task.clockwork_decon_reads { + input: + read1 = select_first([read1]), + read2 = read2, + samplename = samplename + } } call tbprofiler_task.tbprofiler { input: - read1 = clockwork_decon_reads.clockwork_cleaned_read1, - read2 = clockwork_decon_reads.clockwork_cleaned_read2, + read1 = select_first([clockwork_decon_reads.clockwork_cleaned_read1,read1]), + read2 = select_first([clockwork_decon_reads.clockwork_cleaned_read2,read2]), samplename = samplename } if (tbprofiler_additional_outputs) { From 751621bb0f77c65bb550a50232e5904de17821cc Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 21 Sep 2023 12:40:41 +0000 Subject: [PATCH 20/23] try fix --- workflows/utilities/wf_merlin_magic.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/utilities/wf_merlin_magic.wdl b/workflows/utilities/wf_merlin_magic.wdl index 22124be37..84fd6ec09 100644 --- a/workflows/utilities/wf_merlin_magic.wdl +++ b/workflows/utilities/wf_merlin_magic.wdl @@ -246,7 +246,7 @@ workflow merlin_magic { call tbprofiler_task.tbprofiler { input: read1 = select_first([clockwork_decon_reads.clockwork_cleaned_read1,read1]), - read2 = select_first([clockwork_decon_reads.clockwork_cleaned_read2,read2]), + read2 = select_first([clockwork_decon_reads.clockwork_cleaned_read2,read2,'']), samplename = samplename } if (tbprofiler_additional_outputs) { From 
e279d1dd316f4a4251c749f51737b082578f2aec Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 21 Sep 2023 14:59:19 +0000 Subject: [PATCH 21/23] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index a4258914f..ab4e7ff80 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.9" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.10" Int disk_size = 100 Int memory = 4 Int cpu = 1 From d5cd4c7ca7e2667dd2100595392b0dac96d09949 Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 21 Sep 2023 14:59:19 +0000 Subject: [PATCH 22/23] add potential fix for the select_first null issue --- tasks/species_typing/task_tbprofiler.wdl | 3 ++- workflows/utilities/wf_merlin_magic.wdl | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tasks/species_typing/task_tbprofiler.wdl b/tasks/species_typing/task_tbprofiler.wdl index 7b9cad530..63c48897a 100644 --- a/tasks/species_typing/task_tbprofiler.wdl +++ b/tasks/species_typing/task_tbprofiler.wdl @@ -24,7 +24,8 @@ task tbprofiler { # Print and save version tb-profiler version > VERSION && sed -i -e 's/TBProfiler version //' VERSION && sed -n -i '$p' VERSION - if [ -z "~{read2}" ] ; then + # check if file is non existant or non empty + if [ -z "~{read2}" ] || [ ! 
-s "~{read2}" ] ; then INPUT_READS="-1 ~{read1}" else INPUT_READS="-1 ~{read1} -2 ~{read2}" diff --git a/workflows/utilities/wf_merlin_magic.wdl b/workflows/utilities/wf_merlin_magic.wdl index 84fd6ec09..f42db2bc0 100644 --- a/workflows/utilities/wf_merlin_magic.wdl +++ b/workflows/utilities/wf_merlin_magic.wdl @@ -245,8 +245,8 @@ workflow merlin_magic { } call tbprofiler_task.tbprofiler { input: - read1 = select_first([clockwork_decon_reads.clockwork_cleaned_read1,read1]), - read2 = select_first([clockwork_decon_reads.clockwork_cleaned_read2,read2,'']), + read1 = select_first([clockwork_decon_reads.clockwork_cleaned_read1, read1]), + read2 = select_first([clockwork_decon_reads.clockwork_cleaned_read2, read2, "gs://theiagen-public-files/terra/theiaprok-files/no-read2.txt"]), samplename = samplename } if (tbprofiler_additional_outputs) { From 74d11fd4ff2692771721628cc961a6e82de0787e Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 21 Sep 2023 15:39:44 +0000 Subject: [PATCH 23/23] update md5sum --- tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml | 4 ++-- tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index 6be0c145b..8946b69e4 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -620,7 +620,7 @@ - path: miniwdl_run/wdl/tasks/species_typing/task_sonneityping.wdl md5sum: aeb12cf9a0db4e34f2aecbaba51c30fb - path: miniwdl_run/wdl/tasks/species_typing/task_tbprofiler.wdl - md5sum: 27428d62762d736cd5fc3a034abcfeb9 + md5sum: f3c9cdca6d49878ab2be31aff128e1b4 - path: miniwdl_run/wdl/tasks/species_typing/task_ts_mlst.wdl md5sum: 550791ca5faf11a5f75b6be18739ae01 - path: miniwdl_run/wdl/tasks/task_versioning.wdl @@ -636,7 +636,7 @@ - path: 
miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl md5sum: 15e86c99f08d8b8175b18646551d671c - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl - md5sum: e9dbfd1ee9d3c2009963e2fbea352a4b + md5sum: c4861a59d49b13b67706631a0e1246c4 - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl md5sum: 40d4e09a82030c8219b37f883cddaca4 - path: miniwdl_run/workflow.log diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 7056c728f..933077097 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -588,7 +588,7 @@ - path: miniwdl_run/wdl/tasks/species_typing/task_sonneityping.wdl md5sum: aeb12cf9a0db4e34f2aecbaba51c30fb - path: miniwdl_run/wdl/tasks/species_typing/task_tbprofiler.wdl - md5sum: 27428d62762d736cd5fc3a034abcfeb9 + md5sum: f3c9cdca6d49878ab2be31aff128e1b4 - path: miniwdl_run/wdl/tasks/species_typing/task_ts_mlst.wdl md5sum: 550791ca5faf11a5f75b6be18739ae01 - path: miniwdl_run/wdl/tasks/task_versioning.wdl @@ -604,7 +604,7 @@ - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl md5sum: 128c159032400b1f275da24869892a82 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl - md5sum: e9dbfd1ee9d3c2009963e2fbea352a4b + md5sum: c4861a59d49b13b67706631a0e1246c4 - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl md5sum: 53d322d895837c0bcb049786572e944d - path: miniwdl_run/workflow.log