From b0c2a0e4e7479b681b1290df8bdde760b8127444 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 11 Sep 2023 20:17:39 +0000 Subject: [PATCH 01/12] add to theiaprok --- .../species_typing/task_tb_gene_coverage.wdl | 210 -------- tasks/species_typing/task_tbp_parser.wdl | 55 ++ tasks/species_typing/task_tbprofiler.wdl | 12 +- .../task_tbprofiler_output_parsing.wdl | 484 ------------------ .../theiaprok/wf_theiaprok_illumina_pe.wdl | 9 +- .../theiaprok/wf_theiaprok_illumina_se.wdl | 9 +- workflows/theiaprok/wf_theiaprok_ont.wdl | 11 +- workflows/utilities/wf_merlin_magic.wdl | 46 +- 8 files changed, 108 insertions(+), 728 deletions(-) delete mode 100644 tasks/species_typing/task_tb_gene_coverage.wdl create mode 100644 tasks/species_typing/task_tbp_parser.wdl delete mode 100644 tasks/species_typing/task_tbprofiler_output_parsing.wdl diff --git a/tasks/species_typing/task_tb_gene_coverage.wdl b/tasks/species_typing/task_tb_gene_coverage.wdl deleted file mode 100644 index 212214256..000000000 --- a/tasks/species_typing/task_tb_gene_coverage.wdl +++ /dev/null @@ -1,210 +0,0 @@ -version 1.0 - -task tb_gene_coverage { - input { - File bamfile - File bamindex - String samplename - Int min_depth = 10 - Int disk_size = 100 - String docker = "us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15" - } - command <<< - chr=$(samtools idxstats ~{bamfile} | cut -f 1 | head -1) - - # samtools outputs 3 columns; column 3 is the depth of coverage per nucleotide position, piped to awk to count the positions - # above min_depth, then wc -l counts them all - gyrB=$(samtools depth -J -r "${chr}:5040-7467" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - gyrA=$(samtools depth -J -r "${chr}:7102-10018" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fgd1=$(samtools depth -J -r "${chr}:490583-491993" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - mshA=$(samtools depth -J -r "${chr}:575148-576990" ~{bamfile} | awk -F 
"\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ccsA=$(samtools depth -J -r "${chr}:619691-621065" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rpoB=$(samtools depth -J -r "${chr}:759607-763525" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rpoC=$(samtools depth -J -r "${chr}:763170-767520" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - mmpL5=$(samtools depth -J -r "${chr}:775386-778680" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - mmpS5=$(samtools depth -J -r "${chr}:778277-779105" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - mmpR5=$(samtools depth -J -r "${chr}:778790-779687" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rpsL=$(samtools depth -J -r "${chr}:781360-782134" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rplC=$(samtools depth -J -r "${chr}:800609-801662" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fbiC=$(samtools depth -J -r "${chr}:1302731-1305701" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv1258c=$(samtools depth -J -r "${chr}:1405881-1407540" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - embR=$(samtools depth -J -r "${chr}:1415981-1417547" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - atpE=$(samtools depth -J -r "${chr}:1460845-1461490" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rrs=$(samtools depth -J -r "${chr}:1471646-1473582" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rrl=$(samtools depth -J -r "${chr}:1473458-1476995" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fabG1=$(samtools depth -J -r "${chr}:1673148-1674383" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - inhA=$(samtools depth -J -r "${chr}:1673848-1675211" ~{bamfile} | awk -F "\t" '{if ($3 > 
~{min_depth}) print;}' | wc -l ) - rpsA=$(samtools depth -J -r "${chr}:1833342-1835187" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - tlyA=$(samtools depth -J -r "${chr}:1917740-1918946" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ndh=$(samtools depth -J -r "${chr}:2101451-2103242" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - katG=$(samtools depth -J -r "${chr}:2153689-2156570" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - PPE35=$(samtools depth -J -r "${chr}:2167449-2170812" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv1979c=$(samtools depth -J -r "${chr}:2221519-2223364" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - pncA=$(samtools depth -J -r "${chr}:2288481-2290323" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - kasA=$(samtools depth -J -r "${chr}:2517915-2519565" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - eis=$(samtools depth -J -r "${chr}:2713924-2715586" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ahpC=$(samtools depth -J -r "${chr}:2725912-2726980" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - folC=$(samtools depth -J -r "${chr}:2745935-2747798" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - pepQ=$(samtools depth -J -r "${chr}:2859100-2860618" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ribD=$(samtools depth -J -r "${chr}:2986639-2987815" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv2752c=$(samtools depth -J -r "${chr}:3064315-3066391" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - thyX=$(samtools depth -J -r "${chr}:3066993-3068161" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - thyA=$(samtools depth -J -r "${chr}:3073480-3074671" ~{bamfile} | awk -F "\t" '{if ($3 > 
~{min_depth}) print;}' | wc -l ) - ald=$(samtools depth -J -r "${chr}:3086620-3088135" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fbiD=$(samtools depth -J -r "${chr}:3338918-3339962" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv3083=$(samtools depth -J -r "${chr}:3448304-3450191" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fprA=$(samtools depth -J -r "${chr}:3473807-3475577" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - whiB7=$(samtools depth -J -r "${chr}:3568201-3568879" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - Rv3236c=$(samtools depth -J -r "${chr}:3611759-3613316" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fbiA=$(samtools depth -J -r "${chr}:3640343-3641738" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - fbiB=$(samtools depth -J -r "${chr}:3641335-3643081" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - alr=$(samtools depth -J -r "${chr}:3839994-3841620" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - rpoA=$(samtools depth -J -r "${chr}:3877264-3878707" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ddn=$(samtools depth -J -r "${chr}:3986644-3987499" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - clpC1=$(samtools depth -J -r "${chr}:4037958-4040904" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - panD=$(samtools depth -J -r "${chr}:4043662-4044481" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - embC=$(samtools depth -J -r "${chr}:4239663-4243347" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - embA=$(samtools depth -J -r "${chr}:4243004-4246717" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - embB=$(samtools depth -J -r "${chr}:4246314-4250010" ~{bamfile} | awk -F "\t" '{if ($3 > 
~{min_depth}) print;}' | wc -l ) - aftB=$(samtools depth -J -r "${chr}:4266753-4269036" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ubiA=$(samtools depth -J -r "${chr}:4268725-4270033" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ethA=$(samtools depth -J -r "${chr}:4325804-4330174" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - ethR=$(samtools depth -J -r "${chr}:4327349-4328399" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - whiB6=$(samtools depth -J -r "${chr}:4337971-4338721" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - gid=$(samtools depth -J -r "${chr}:4407328-4408476" ~{bamfile} | awk -F "\t" '{if ($3 > ~{min_depth}) print;}' | wc -l ) - - # add one to gene lenth to compensate for subtraction - gyrB_pc=$(python3 -c "print ( round ( ($gyrB / 2428 ) * 100, 2 ) )") - gyrA_pc=$(python3 -c "print ( round( ($gyrA / 2917 ) * 100, 2 ) )") - fgd1_pc=$(python3 -c "print ( round( ($fgd1 / 1411 ) * 100, 2 ) )") - mshA_pc=$(python3 -c "print ( round( ($mshA / 1843 ) * 100, 2 ) )") - ccsA_pc=$(python3 -c "print ( round( ($ccsA / 1375 ) * 100, 2 ) )") - rpoB_pc=$(python3 -c "print ( round( ($rpoB / 3919 ) * 100, 2 ) )") - rpoC_pc=$(python3 -c "print ( round( ($rpoC / 4351 ) * 100, 2 ) )") - mmpL5_pc=$(python3 -c "print ( round( ($mmpL5 / 3295 ) * 100, 2 ) )") - mmpS5_pc=$(python3 -c "print ( round( ($mmpS5 / 829 ) * 100, 2 ) )") - mmpR5_pc=$(python3 -c "print ( round( ($mmpR5 / 898 ) * 100, 2 ) )") - rpsL_pc=$(python3 -c "print ( round( ($rpsL / 775 ) * 100, 2 ) )") - rplC_pc=$(python3 -c "print ( round( ($rplC / 1054 ) * 100, 2 ) )") - fbiC_pc=$(python3 -c "print ( round( ($fbiC / 2971 ) * 100, 2 ) )") - Rv1258c_pc=$(python3 -c "print ( round( ($Rv1258c / 1660 ) * 100, 2 ) )") - embR_pc=$(python3 -c "print ( round( ($embR / 1567 ) * 100, 2 ) )") - atpE_pc=$(python3 -c "print ( round( ($atpE / 646 ) * 100, 2 ) )") - rrs_pc=$(python3 -c "print ( 
round( ($rrs / 1937 ) * 100, 2 ) )") - rrl_pc=$(python3 -c "print ( round( ($rrl / 3538 ) * 100, 2 ) )") - fabG1_pc=$(python3 -c "print ( round( ($fabG1 / 1236 ) * 100, 2 ) )") - inhA_pc=$(python3 -c "print ( round( ($inhA / 1364 ) * 100, 2 ) )") - rpsA_pc=$(python3 -c "print ( round( ($rpsA / 1846 ) * 100, 2 ) )") - tlyA_pc=$(python3 -c "print ( round( ($tlyA / 1207 ) * 100, 2 ) )") - ndh_pc=$(python3 -c "print ( round( ($ndh / 1792 ) * 100, 2 ) )") - katG_pc=$(python3 -c "print ( round( ($katG / 2882 ) * 100, 2 ) )") - PPE35_pc=$(python3 -c "print ( round( ($PPE35 / 3364 ) * 100, 2 ) )") - Rv1979c_pc=$(python3 -c "print ( round( ($Rv1979c / 1846 ) * 100, 2 ) )") - pncA_pc=$(python3 -c "print ( round( ($pncA / 1843 ) * 100, 2 ) )") - kasA_pc=$(python3 -c "print ( round( ($kasA / 1651 ) * 100, 2 ) )") - eis_pc=$(python3 -c "print ( round( ($eis / 1663 ) * 100, 2 ) )") - ahpC_pc=$(python3 -c "print ( round( ($ahpC / 1069 ) * 100, 2 ) )") - folC_pc=$(python3 -c "print ( round( ($folC / 1864 ) * 100, 2 ) )") - pepQ_pc=$(python3 -c "print ( round( ($pepQ / 1519 ) * 100, 2 ) )") - ribD_pc=$(python3 -c "print ( round( ($ribD / 1177 ) * 100, 2 ) )") - Rv2752c_pc=$(python3 -c "print ( round( ($Rv2752c / 2077 ) * 100, 2 ) )") - thyX_pc=$(python3 -c "print ( round( ($thyX / 1169 ) * 100, 2 ) )") - thyA_pc=$(python3 -c "print ( round( ($thyA / 1192 ) * 100, 2 ) )") - ald_pc=$(python3 -c "print ( round( ($ald / 1516 ) * 100, 2 ) )") - fbiD_pc=$(python3 -c "print ( round( ($fbiD / 1045 ) * 100, 2 ) )") - Rv3083_pc=$(python3 -c "print ( round( ($Rv3083 / 1888 ) * 100, 2 ) )") - fprA_pc=$(python3 -c "print ( round( ($fprA / 1771 ) * 100, 2 ) )") - whiB7_pc=$(python3 -c "print ( round( ($whiB7 / 679 ) * 100, 2 ) )") - Rv3236c_pc=$(python3 -c "print ( round( ($Rv3236c / 1558 ) * 100, 2 ) )") - fbiA_pc=$(python3 -c "print ( round( ($fbiA / 1396 ) * 100, 2 ) )") - fbiB_pc=$(python3 -c "print ( round( ($fbiB / 1747 ) * 100, 2 ) )") - alr_pc=$(python3 -c "print ( round( ($alr / 1627 ) 
* 100, 2 ) )") - rpoA_pc=$(python3 -c "print ( round( ($rpoA / 1444 ) * 100, 2 ) )") - ddn_pc=$(python3 -c "print ( round( ($ddn / 856 ) * 100, 2 ) )") - clpC1_pc=$(python3 -c "print ( round( ($clpC1 / 2947 ) * 100, 2 ) )") - panD_pc=$(python3 -c "print ( round( ($panD / 820 ) * 100, 2 ) )") - embC_pc=$(python3 -c "print ( round( ($embC / 3685 ) * 100, 2 ) )") - embA_pc=$(python3 -c "print ( round( ($embA / 3714 ) * 100, 2 ) )") - embB_pc=$(python3 -c "print ( round( ($embB / 3697 ) * 100, 2 ) )") - aftB_pc=$(python3 -c "print ( round( ($aftB / 2284 ) * 100, 2 ) )") - ubiA_pc=$(python3 -c "print ( round( ($ubiA / 1309 ) * 100, 2 ) )") - ethA_pc=$(python3 -c "print ( round( ($ethA / 4371 ) * 100, 2 ) )") - ethR_pc=$(python3 -c "print ( round( ($ethR / 1051 ) * 100, 2 ) )") - whiB6_pc=$(python3 -c "print ( round( ($whiB6 / 751 ) * 100, 2 ) )") - gid_pc=$(python3 -c "print ( round( ($gid / 1149 ) * 100, 2 ) )") - - echo -e "#NOTE: THE VALUES BELOW ASSUME TBPROFILER (H37Rv) REFERENCE GENOME" > ~{samplename}.percent_gene_coverage.tsv - echo -e "Gene\tPercent_Coverage" >> ~{samplename}.percent_gene_coverage.tsv - echo -e "gyrB\t"$gyrB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "gyrA\t"$gyrA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fgd1\t"$fgd1_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "mshA\t"$mshA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ccsA\t"$ccsA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpoB\t"$rpoB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpoC\t"$rpoC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "mmpL5\t"$mmpL5_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "mmpS5\t"$mmpS5_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "mmpR5\t"$mmpR5_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpsL\t"$rpsL_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rplC\t"$rplC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fbiC\t"$fbiC_pc >> 
~{samplename}.percent_gene_coverage.tsv - echo -e "Rv1258c\t"$Rv1258c_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "embR\t"$embR_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "atpE\t"$atpE_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rrs\t"$rrs_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rrl\t"$rrl_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fabG1\t"$fabG1_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "inhA\t"$inhA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpsA\t"$rpsA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "tlyA\t"$tlyA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ndh\t"$ndh_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "katG\t"$katG_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "PPE35\t"$PPE35_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "Rv1979c\t"$Rv1979c_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "pncA\t"$pncA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "kasA\t"$kasA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "eis\t"$eis_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ahpC\t"$ahpC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "folC\t"$folC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "pepQ\t"$pepQ_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ribD\t"$ribD_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "Rv2752c\t"$Rv2752c_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "thyX\t"$thyX_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "thyA\t"$thyA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ald\t"$ald_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fbiD\t"$fbiD_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "Rv3083\t"$Rv3083_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fprA\t"$fprA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "whiB7\t"$whiB7_pc >> 
~{samplename}.percent_gene_coverage.tsv - echo -e "Rv3236c\t"$Rv3236c_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fbiA\t"$fbiA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "fbiB\t"$fbiB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "alr\t"$alr_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "rpoA\t"$rpoA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ddn\t"$ddn_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "clpC1\t"$clpC1_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "panD\t"$panD_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "embC\t"$embC_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "embA\t"$embA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "embB\t"$embB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "aftB\t"$aftB_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ubiA\t"$ubiA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ethA\t"$ethA_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "ethR\t"$ethR_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "whiB6\t"$whiB6_pc >> ~{samplename}.percent_gene_coverage.tsv - echo -e "gid\t"$gid_pc >> ~{samplename}.percent_gene_coverage.tsv - - >>> - output { - File tb_resistance_genes_percent_coverage = "~{samplename}.percent_gene_coverage.tsv" - } - runtime { - docker: docker - memory: "8 GB" - cpu: 2 - disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" - preemptible: 0 - maxRetries: 3 - } -} \ No newline at end of file diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl new file mode 100644 index 000000000..ac6283d16 --- /dev/null +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -0,0 +1,55 @@ +version 1.0 + +task tbp_parser { + input { + File tbprofiler_json + File tbprofiler_bam + File tbprofiler_bai + String samplename + + String? sequencing_method + String? 
operator + Int min_depth = 10 + Int coverage_threshold = 100 + Boolean tbp_parser_debug = false + + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.1" + Int disk_size = 100 + Int memory = 4 + Int cpu = 1 + } + command <<< + # get version + python3 tbp-parser/tbp_parser/tbp_parser.py --version | tee VERSION + + # run tbp-parser; the trailing space inside each quoted flag separates it from the interpolated value + python3 tbp-parser/tbp_parser/tbp_parser.py ~{tbprofiler_json} ~{tbprofiler_bam} \ + ~{"--sequencing_method " + sequencing_method} \ + ~{"--operator " + operator} \ + ~{"--min_depth " + min_depth} \ + ~{"--coverage_threshold " + coverage_threshold} \ + --output_prefix ~{samplename} \ + ~{true="--debug" false="--verbose" tbp_parser_debug} + + # get genome percent coverage for the entire reference genome length over min_depth + genome=$(samtools depth -J ~{tbprofiler_bam} | awk -F "\t" '{if ($3 >= ~{min_depth}) print;}' | wc -l ) + python3 -c "print ( ($genome / 4411532 ) * 100 )" | tee GENOME_PC + >>> + output { + File tbp_parser_looker_report_csv = "~{samplename}.looker_report.csv" + File tbp_parser_laboratorian_report_csv = "~{samplename}.laboratorian_report.csv" + File tbp_parser_lims_report_csv = "~{samplename}.lims_report.csv" + File tbp_parser_coverage_report = "~{samplename}.percent_gene_coverage.csv" + Float tbp_parser_genome_percent_coverage = read_float("GENOME_PC") + String tbp_parser_version = read_string("VERSION") + String tbp_parser_docker = docker + } + runtime { + docker: docker + memory: memory + " GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + } +} \ No newline at end of file diff --git a/tasks/species_typing/task_tbprofiler.wdl b/tasks/species_typing/task_tbprofiler.wdl index 1464cc1a1..a3a881243 100644 --- a/tasks/species_typing/task_tbprofiler.wdl +++ b/tasks/species_typing/task_tbprofiler.wdl @@ -9,7 +9,7 @@ task tbprofiler { String tbprofiler_docker_image = "us-docker.pkg.dev/general-theiagen/staphb/tbprofiler:4.4.2" Int disk_size = 100 
String mapper = "bwa" - String caller = "bcftools" + String caller = "freebayes" Int min_depth = 10 Float min_af = 0.1 Float min_af_pred = 0.1 @@ -22,7 +22,7 @@ task tbprofiler { date | tee DATE # Print and save version - tb-profiler --version > VERSION && sed -i -e 's/^/TBProfiler version /' VERSION + tb-profiler version > VERSION && sed -i -e 's/^/TBProfiler version /' VERSION if [ -z "~{read2}" ] ; then INPUT_READS="-1 ~{read1}" @@ -89,6 +89,12 @@ task tbprofiler { res_genes.append(tsv_dict[i]) res_genes_string=';'.join(res_genes) Resistance_Genes.write(res_genes_string) + with open ("MEDIAN_COVERAGE", 'wt') as Median_Coverage: + median_coverage=tsv_dict['median_coverage'] + Median_Coverage.write(median_coverage) + with open ("PCT_READS_MAPPED", 'wt') as Pct_Reads_Mapped: + pct_reads_mapped=tsv_dict['pct_reads_mapped'] + Pct_Reads_Mapped.write(pct_reads_mapped) CODE >>> output { @@ -104,6 +110,8 @@ task tbprofiler { String tbprofiler_num_dr_variants = read_string("NUM_DR_VARIANTS") String tbprofiler_num_other_variants = read_string("NUM_OTHER_VARIANTS") String tbprofiler_resistance_genes = read_string("RESISTANCE_GENES") + Int tbprofiler_median_coverage = read_int("MEDIAN_COVERAGE") + Float tbprofiler_pct_reads_mapped = read_float("PCT_READS_MAPPED") } runtime { docker: "~{tbprofiler_docker_image}" diff --git a/tasks/species_typing/task_tbprofiler_output_parsing.wdl b/tasks/species_typing/task_tbprofiler_output_parsing.wdl deleted file mode 100644 index b49739828..000000000 --- a/tasks/species_typing/task_tbprofiler_output_parsing.wdl +++ /dev/null @@ -1,484 +0,0 @@ -version 1.0 - -task tbprofiler_output_parsing { - input { - File json - String output_seq_method_type - String operator - String samplename - Int min_depth = 10 - } - command <<< - python3 < 0: - confidences.append(confidence.array[0]) - else: - confidences.append("No annotation") - if len(frequency) > 0: - frequencies.append(frequency.array[0]) - else: - frequencies.append("1") - return 
confidences, frequencies - - ## Main Parsing Functions ## - - def parse_json_lab_report(json_file): - """ - This function recieved the tbprofiler output json file and - writes the Laboratorian report that includes the following information - per mutation: - - sample_id: inclides sample name - - tbprofiler_gene_name: gene name - - tbprofiler_locus_tag: locus tag - - tbprofiler_variant_substitution_type: variant substitution type (missense_variant, upstream_gene_variant...) - - tbprofiler_variant_substitution_nt: nucleotide substitution (c.1349C>G) - - tbprofiler_variant_substitution_aa: aminoacid substitution (p.Ser450Trp) - - confidence: tbprofiler annotation regarding resistance (Not assoc w R, Uncertain significance...) - - antimicrobial: antimicrobial the mutation is confering resistance to (streptomycin, rifampicin...) - - looker_interpretation: interpretation of resistance for Looker report (R, S, U, R-interim) - - mdl_interpretation: MDL interpretation of resistance (R,S,U) - - depth: depth of coverage at the mutation site (100) - - frequency: frequency of mutation at the site (1) - - read_support: number of reads supporting the mutation (100, depth*frequency) - - rationale: rationale for resistance calling (WHO classification, Expert rule) - - warning: column reserved for warnings such as low depth of coverage - """ - - df_laboratorian = pd.DataFrame(columns = ["sample_id","tbprofiler_gene_name","tbprofiler_locus_tag", - "tbprofiler_variant_substitution_type","tbprofiler_variant_substitution_nt", - "tbprofiler_variant_substitution_aa","confidence","antimicrobial", - "looker_interpretation","mdl_interpretation","depth","frequency", - "read_support","rationale","warning"]) - - row_list = [] - genes_reported = [] - - with open(json_file) as results_json_fh: - results_json = json.load(results_json_fh) - - # reported mutation by tb-profiler, all confering resistance by WHO criteria - for dr_variant in results_json["dr_variants"]: - if "annotation" in dr_variant: 
- try: # sometimes annotation is an empty list - if dr_variant["annotation"][0]["who_confidence"] == "": - confidence = "No WHO annotation" - else: - confidence = dr_variant["annotation"][0]["who_confidence"] - except: - confidence = "No WHO annotation" - else: - confidence = "No WHO annotation" - row = {} - row["sample_id"] = "~{samplename}" - row["tbprofiler_gene_name"] = dr_variant["gene"] - row["tbprofiler_locus_tag"] = dr_variant["locus_tag"] - row["tbprofiler_variant_substitution_type"] = dr_variant["type"] - row["tbprofiler_variant_substitution_nt"] = dr_variant["nucleotide_change"] - row["tbprofiler_variant_substitution_aa"] = dr_variant["protein_change"] if dr_variant["protein_change"] != "" else "NA" - row["confidence"] = confidence - row["antimicrobial"] = ",".join(dr_variant["gene_associated_drugs"]) - row["looker_interpretation"] = decipher_looker(row["confidence"]) - row["mdl_interpretation"] = decipher_MDL(row["confidence"]) - row["depth"] = int(dr_variant["depth"] or 0) - row["frequency"] = dr_variant["freq"] - row["read_support"] = row["depth"]*row["frequency"] - row["rationale"] = "WHO classification" - row["warning"] = "Low depth coverage" if row["depth"] < int('~{min_depth}') else "" - genes_reported.append(dr_variant["gene"]) - row_list.append(row) - - # mutations not reported by tb-profiler - application of expert rules to determine resistance - for other_variant in results_json["other_variants"]: - - # report only mutations that are NOT synonymous - if other_variant["type"] != "synonymous_variant": - - # Expert rule: mutations in katG, pncA, ethA or gid, classify as resistant - if other_variant["gene"] == "katG" or other_variant["gene"] == "pncA" or other_variant["gene"] == "ethA" or other_variant["gene"] == "gid": - if "annotation" in other_variant: - try: # sometimes annotation is an empty list - if other_variant["annotation"][0]["who_confidence"] == "": - confidence = "No WHO annotation" - else: - confidence = 
other_variant["annotation"][0]["who_confidence"] - except: - confidence = "No WHO annotation" - else: - confidence = "No WHO annotation" - row = {} - row["sample_id"] = "~{samplename}" - row["tbprofiler_gene_name"] = other_variant["gene"] - row["tbprofiler_locus_tag"] = other_variant["locus_tag"] - row["tbprofiler_variant_substitution_type"] = other_variant["type"] - row["tbprofiler_variant_substitution_nt"] = other_variant["nucleotide_change"] - row["tbprofiler_variant_substitution_aa"] = other_variant["protein_change"] if other_variant["protein_change"] != "" else "NA" - row["confidence"] = confidence - row["antimicrobial"] = ",".join(other_variant["gene_associated_drugs"]) - row["looker_interpretation"] = decipher_looker(row["confidence"]) - row["mdl_interpretation"] = decipher_MDL(row["confidence"]) - row["depth"] = int(other_variant["depth"] or 0) - row["frequency"] = other_variant["freq"] - row["read_support"] = row["depth"]*row["frequency"] - row["rationale"] = "Resistant based on expert rule" - row["warning"] = "Low depth coverage" if row["depth"] < int('~{min_depth}') else "" - genes_reported.append(other_variant["gene"]) - row_list.append(row) - - # Expert rule: in case mutation occurs between codons 426 and 452 of rpoB gene, classify as resistant - if other_variant["gene"] == "rpoB": - position = get_codon(other_variant["protein_change"]) - row = {} - row["sample_id"] = "~{samplename}" - row["tbprofiler_gene_name"] = other_variant["gene"] - row["tbprofiler_locus_tag"] = other_variant["locus_tag"] - row["tbprofiler_variant_substitution_type"] = other_variant["type"] - row["tbprofiler_variant_substitution_nt"] = other_variant["nucleotide_change"] - row["tbprofiler_variant_substitution_aa"] = other_variant["protein_change"] if other_variant["protein_change"] != "" else "NA" - row["confidence"] = "No WHO annotation" - row["antimicrobial"] = ",".join(other_variant["gene_associated_drugs"]) - row["looker_interpretation"] = decipher_looker(row["confidence"]) - 
row["mdl_interpretation"] = decipher_MDL(row["confidence"]) - row["depth"] = int(other_variant["depth"] or 0) - row["frequency"] = other_variant["freq"] - row["read_support"] = row["depth"]*row["frequency"] - row["rationale"] = "Resistant based on expert rule" if 426 <= position <= 452 else "Uncertain significance based on expert rule" - row["warning"] = "Low depth coverage" if row["depth"] < int('~{min_depth}') else "" - genes_reported.append(other_variant["gene"]) - row_list.append(row) - - for gene, resistance_list in gene_to_resistance.items(): - for resistance in resistance_list: - if gene not in genes_reported: - row = {} - row["sample_id"] = "~{samplename}" - row["tbprofiler_gene_name"] = gene - row["tbprofiler_locus_tag"] = gene_to_locus_tag[gene] - row["tbprofiler_variant_substitution_type"] = "WT" - row["tbprofiler_variant_substitution_nt"] = "NA" - row["tbprofiler_variant_substitution_aa"] = "NA" - row["confidence"] = "NA" - row["antimicrobial"] = resistance - row["looker_interpretation"] = "NA" - row["mdl_interpretation"] = "NA" - row["depth"] = "NA" - row["frequency"] = "NA" - row["read_support"] = "NA" - row["rationale"] = "NA" - row["warning"] = "NA" - row_list.append(row) - - df_laboratorian = df_laboratorian.append(row_list, ignore_index=True) - df_laboratorian.to_csv("tbprofiler_laboratorian_report.csv", index=False) - - def parse_json_lims_report(json_file, formatted_time): - """ - This function recieves the tbprofiler output json file and - writes the LIMS report that includes the following information - per sample: - - MDL sample accession numbers: includes sample name - - M_DST_A01_ID - includes lineage - - The set of information in gene_dict dictionary with target drug resistance information - in layman's terms, and the mutations responsible for the predicted phenotype - - Date of analysis in YYYY-MM-DD HH:SS format - - Operator information - """ - - lineage = get_lineage("~{json}") - mutations = parse_json_mutations("~{json}") - resistance = 
parse_json_resistance("~{json}") - df_lims = pd.DataFrame({"MDL sample accession numbers":"~{samplename}", "M_DST_A01_ID": lineage},index=[0]) - - for antimicrobial, genes in gene_dict.items(): - if antimicrobial_dict[antimicrobial] in resistance.keys(): - df_lims[antimicrobial] = translate(resistance[antimicrobial_dict[antimicrobial]], antimicrobial_dict[antimicrobial]) - else: - df_lims[antimicrobial] = "No resistance to {} detected".format(antimicrobial_dict[antimicrobial]) - for gene_name, gene_id in genes.items(): - if gene_name in mutations.keys(): - df_lims[gene_id] = mutations[gene_name] - else: - df_lims[gene_id] = "No mutations detected" - - df_lims["Analysis date"] = formatted_time - df_lims["Operator"] = "~{operator}" - df_lims.to_csv("tbprofiler_lims_report.csv", index=False) - - def parse_json_looker_report(json_file, current_time): - """ - This function recieves the tbprofiler output json file and - writes the Looker report that includes the following information - per sample: - - sample_id: includes sample name - - for each antimicrobial, indication if its resistant (R) or susceptible (S) - """ - resistance = parse_json_resistance("~{json}") - df_looker = pd.DataFrame({"sample_id":"~{samplename}", "output_seq_method_type": "~{output_seq_method_type}"},index=[0]) - - for antimicrobial in antimicrobial_list: - if antimicrobial in resistance.keys(): - df_looker[antimicrobial] = decipher_looker(resistance[antimicrobial]) - else: - df_looker[antimicrobial] = "S" - - df_looker["analysis_date"] = current_time - df_looker["operator"] = "~{operator}" - - df_looker.to_csv("tbprofiler_looker.csv", index=False) - - ### Report Generation ### - - # get timestamp in YYYY-MM-DD HH:MM format - current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') - - # Laboratorian report generation - parse_json_lab_report("~{json}") - - # LIMS report generation - parse_json_lims_report("~{json}", current_time) - - # LOOKER report generation - 
parse_json_looker_report("~{json}", current_time) - - CODE - >>> - output { - File tbprofiler_looker_csv = "tbprofiler_looker.csv" - File tbprofiler_laboratorian_report_csv = "tbprofiler_laboratorian_report.csv" - File tbprofiler_lims_report_csv = "tbprofiler_lims_report.csv" - } - runtime { - docker: "us-docker.pkg.dev/general-theiagen/theiagen/utility:1.2" - memory: "4 GB" - cpu: 1 - disks: "local-disk " + 10 + " SSD" - disk: 10 + " GB" - maxRetries: 0 - } -} \ No newline at end of file diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index e960a7eb1..c0cf4e819 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -799,10 +799,11 @@ workflow theiaprok_illumina_pe { String? tbprofiler_sub_lineage = merlin_magic.tbprofiler_sub_lineage String? tbprofiler_dr_type = merlin_magic.tbprofiler_dr_type String? tbprofiler_resistance_genes = merlin_magic.tbprofiler_resistance_genes - File? tbprofiler_lims_report_csv = merlin_magic.tbprofiler_lims_report_csv - File? tbprofiler_looker_csv = merlin_magic.tbprofiler_looker_csv - File? tbprofiler_laboratorian_report_csv = merlin_magic.tbprofiler_laboratorian_report_csv - File? tbprofiler_resistance_genes_percent_coverage = merlin_magic.tb_resistance_genes_percent_coverage + File? tbp_parser_lims_report_csv = merlin_magic.tbp_parser_lims_report_csv + File? tbp_parser_looker_report_csv = merlin_magic.tbp_parser_looker_report_csv + File? tbp_parser_laboratorian_report_csv = merlin_magic.tbp_parser_laboratorian_report_csv + File? tbp_parser_coverage_report = merlin_magic.tbp_parser_coverage_report + Float? tbp_parser_genome_percent_coverage = merlin_magic.tbp_parser_genome_percent_coverage # Legionella pneumophila typing File? legsta_results = merlin_magic.legsta_results String? 
legsta_predicted_sbt = merlin_magic.legsta_predicted_sbt diff --git a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl index 4f2025bd6..a827303a5 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl @@ -751,10 +751,11 @@ workflow theiaprok_illumina_se { String? tbprofiler_sub_lineage = merlin_magic.tbprofiler_sub_lineage String? tbprofiler_dr_type = merlin_magic.tbprofiler_dr_type String? tbprofiler_resistance_genes = merlin_magic.tbprofiler_resistance_genes - File? tbprofiler_lims_report_csv = merlin_magic.tbprofiler_lims_report_csv - File? tbprofiler_laboratorian_report_csv = merlin_magic.tbprofiler_laboratorian_report_csv - File? tbprofiler_looker_csv = merlin_magic.tbprofiler_looker_csv - File? tbprofiler_resistance_genes_percent_coverage = merlin_magic.tb_resistance_genes_percent_coverage + File? tbp_parser_lims_report_csv = merlin_magic.tbp_parser_lims_report_csv + File? tbp_parser_looker_report_csv = merlin_magic.tbp_parser_looker_report_csv + File? tbp_parser_laboratorian_report_csv = merlin_magic.tbp_parser_laboratorian_report_csv + File? tbp_parser_coverage_report = merlin_magic.tbp_parser_coverage_report + Float? tbp_parser_genome_percent_coverage = merlin_magic.tbp_parser_genome_percent_coverage # Legionella pneumophila typing File? legsta_results = merlin_magic.legsta_results String? legsta_predicted_sbt = merlin_magic.legsta_predicted_sbt diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 06628be0a..3741fc42c 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -713,11 +713,12 @@ workflow theiaprok_ont { String? tbprofiler_sub_lineage = merlin_magic.tbprofiler_sub_lineage String? tbprofiler_dr_type = merlin_magic.tbprofiler_dr_type String? tbprofiler_resistance_genes = merlin_magic.tbprofiler_resistance_genes - File? 
tbprofiler_laboratorian_report_csv = merlin_magic.tbprofiler_laboratorian_report_csv - File? tbprofiler_lims_report_csv = merlin_magic.tbprofiler_lims_report_csv - File? tbprofiler_looker_csv = merlin_magic.tbprofiler_looker_csv - File? tbprofiler_resistance_genes_percent_coverage = merlin_magic.tb_resistance_genes_percent_coverage - # Legionella pneumophila typing + File? tbp_parser_lims_report_csv = merlin_magic.tbp_parser_lims_report_csv + File? tbp_parser_looker_report_csv = merlin_magic.tbp_parser_looker_report_csv + File? tbp_parser_laboratorian_report_csv = merlin_magic.tbp_parser_laboratorian_report_csv + File? tbp_parser_coverage_report = merlin_magic.tbp_parser_coverage_report + Float? tbp_parser_genome_percent_coverage = merlin_magic.tbp_parser_genome_percent_coverage + # Legionella pneumophila typing File? legsta_results = merlin_magic.legsta_results String? legsta_predicted_sbt = merlin_magic.legsta_predicted_sbt String? legsta_version = merlin_magic.legsta_version diff --git a/workflows/utilities/wf_merlin_magic.wdl b/workflows/utilities/wf_merlin_magic.wdl index b6fd6e2ae..107a26df6 100644 --- a/workflows/utilities/wf_merlin_magic.wdl +++ b/workflows/utilities/wf_merlin_magic.wdl @@ -11,8 +11,7 @@ import "../../tasks/species_typing/task_sistr.wdl" as sistr_task import "../../tasks/species_typing/task_seqsero2.wdl" as seqsero2_task import "../../tasks/species_typing/task_kleborate.wdl" as kleborate_task import "../../tasks/species_typing/task_tbprofiler.wdl" as tbprofiler_task -import "../../tasks/species_typing/task_tbprofiler_output_parsing.wdl" as tbprofiler_output_parsing_task -import "../../tasks/species_typing/task_tb_gene_coverage.wdl" as tb_gene_coverage_task +import "../../tasks/species_typing/task_tbp_parser.wdl" as tbp_parser_task import "../../tasks/species_typing/task_legsta.wdl" as legsta_task import "../../tasks/species_typing/task_genotyphi.wdl" as genotyphi import "../../tasks/species_typing/task_kaptive.wdl" as kaptive_task @@ -63,8 
+62,12 @@ workflow merlin_magic { Boolean assembly_only = false Boolean theiaeuk = false Boolean tbprofiler_additional_outputs = false - String tbprofiler_output_seq_method_type = "WGS" - String tbprofiler_operator = "Default" + String tbp_parser_output_seq_method_type = "WGS" + String? tbp_parser_operator + Int? tbp_parser_min_depth + Int? tbp_parser_coverage_threshold + Boolean? tbp_parser_debug + String? tbp_parser_docker_image String? snippy_query_gene Int srst2_min_cov = 80 Int srst2_max_divergence = 20 @@ -239,18 +242,18 @@ workflow merlin_magic { ont_data = ont_data } if (tbprofiler_additional_outputs) { - call tbprofiler_output_parsing_task.tbprofiler_output_parsing{ + call tbp_parser_task.tbp_parser { input: - json = tbprofiler.tbprofiler_output_json, - output_seq_method_type = tbprofiler_output_seq_method_type, - operator = tbprofiler_operator, - samplename = samplename - } - call tb_gene_coverage_task.tb_gene_coverage { - input: - bamfile = tbprofiler.tbprofiler_output_bam, - bamindex = tbprofiler.tbprofiler_output_bai, - samplename = samplename + tbprofiler_json = tbprofiler.tbprofiler_output_json, + tbprofiler_bam = tbprofiler.tbprofiler_output_bam, + tbprofiler_bai = tbprofiler.tbprofiler_output_bai, + samplename = samplename, + sequencing_method = tbp_parser_output_seq_method_type, + operator = tbp_parser_operator, + min_depth = tbp_parser_min_depth, + coverage_threshold = tbp_parser_coverage_threshold, + tbp_parser_debug = tbp_parser_debug, + docker = tbp_parser_docker_image } } } @@ -569,10 +572,15 @@ workflow merlin_magic { String? tbprofiler_sub_lineage = tbprofiler.tbprofiler_sub_lineage String? tbprofiler_dr_type = tbprofiler.tbprofiler_dr_type String? tbprofiler_resistance_genes = tbprofiler.tbprofiler_resistance_genes - File? tbprofiler_lims_report_csv = tbprofiler_output_parsing.tbprofiler_lims_report_csv - File? tbprofiler_laboratorian_report_csv = tbprofiler_output_parsing.tbprofiler_laboratorian_report_csv - File? 
tbprofiler_looker_csv = tbprofiler_output_parsing.tbprofiler_looker_csv - File? tb_resistance_genes_percent_coverage = tb_gene_coverage.tb_resistance_genes_percent_coverage + Int? tbprofiler_median_coverage = tbprofiler.tbprofiler_median_coverage + Float? tbprofiler_pct_reads_mapped = tbprofiler.tbprofiler_pct_reads_mapped + String? tbp_parser_version = tbp_parser.tbp_parser_version + String? tbp_parser_docker = tbp_parser.tbp_parser_docker + File? tbp_parser_lims_report_csv = tbp_parser.tbp_parser_lims_report_csv + File? tbp_parser_laboratorian_report_csv = tbp_parser.tbp_parser_laboratorian_report_csv + File? tbp_parser_looker_report_csv = tbp_parser.tbp_parser_looker_report_csv + File? tbp_parser_coverage_report = tbp_parser.tbp_parser_coverage_report + Float? tbp_parser_genome_percent_coverage = tbp_parser.tbp_parser_genome_percent_coverage # Legionella pneumophila Typing File? legsta_results = legsta.legsta_results String? legsta_predicted_sbt = legsta.legsta_predicted_sbt From 5c769b655e827ef6c1b3128eb90cc3ae6657bb4e Mon Sep 17 00:00:00 2001 From: cimendes Date: Tue, 12 Sep 2023 09:19:31 +0000 Subject: [PATCH 02/12] add root --- tasks/species_typing/task_tbp_parser.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index ac6283d16..0d0d6893d 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -20,10 +20,10 @@ task tbp_parser { } command <<< # get version - python3 tbp-parser/tbp_parser/tbp_parser.py --version | tee VERSION + python3 /tbp-parser/tbp_parser/tbp_parser.py --version | tee VERSION # run tbp-parser - python3 tbp-parser/tbp_parser/tbp_parser.py ~{tbprofiler_json} ~{tbprofiler_bam} \ + python3 /tbp-parser/tbp_parser/tbp_parser.py ~{tbprofiler_json} ~{tbprofiler_bam} \ ~{"--sequencing_method" + sequencing_method} \ ~{"--operator" + operator} \ ~{"--min_depth" + min_depth} \ From 
cc2d611079ad1e1774794af5b54d560547370d47 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 12 Sep 2023 13:37:46 +0000 Subject: [PATCH 03/12] update docker, add spaces --- tasks/species_typing/task_tbp_parser.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 0d0d6893d..5fab6d0c6 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.1" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.2" Int disk_size = 100 Int memory = 4 Int cpu = 1 @@ -24,10 +24,10 @@ task tbp_parser { # run tbp-parser python3 /tbp-parser/tbp_parser/tbp_parser.py ~{tbprofiler_json} ~{tbprofiler_bam} \ - ~{"--sequencing_method" + sequencing_method} \ - ~{"--operator" + operator} \ - ~{"--min_depth" + min_depth} \ - ~{"--coverage_threshold" + coverage_threshold} \ + ~{"--sequencing_method " + sequencing_method} \ + ~{"--operator " + operator} \ + ~{"--min_depth " + min_depth} \ + ~{"--coverage_threshold " + coverage_threshold} \ --output_prefix ~{samplename} \ ~{true="--debug" false="--verbose" tbp_parser_debug} From a743fefbd3a83896c89a9174bfd08cd1d55fa531 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 12 Sep 2023 15:57:03 +0000 Subject: [PATCH 04/12] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 5fab6d0c6..37da00c72 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = 
"us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.2" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.3" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 0065d251514d571cba7dd232d1fe93c5be3e154e Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 12 Sep 2023 17:55:38 +0000 Subject: [PATCH 05/12] update container --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 37da00c72..2093a8fa2 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.3" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.4" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 2f9e80a2464396152340590ec20d4e34499528fc Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 13 Sep 2023 15:26:10 +0000 Subject: [PATCH 06/12] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 2093a8fa2..f86302275 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.4" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.5" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 5227d77f7e95097f5b3e5d66048cdf2bbdc61c67 Mon Sep 17 00:00:00 2001 From: Ash O'Farrell Date: Thu, 14 Sep 2023 06:57:00 -0700 Subject: [PATCH 07/12] Add preemptible, shorter version string (#185) --- 
tasks/species_typing/task_tbp_parser.wdl | 3 ++- tasks/species_typing/task_tbprofiler.wdl | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index f86302275..2f5824979 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -50,6 +50,7 @@ task tbp_parser { cpu: cpu disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" - maxRetries: 3 + maxRetries: 3 + preemptible: 1 } } \ No newline at end of file diff --git a/tasks/species_typing/task_tbprofiler.wdl b/tasks/species_typing/task_tbprofiler.wdl index a3a881243..7b9cad530 100644 --- a/tasks/species_typing/task_tbprofiler.wdl +++ b/tasks/species_typing/task_tbprofiler.wdl @@ -22,7 +22,7 @@ task tbprofiler { date | tee DATE # Print and save version - tb-profiler version > VERSION && sed -i -e 's/^/TBProfiler version /' VERSION + tb-profiler version > VERSION && sed -i -e 's/TBProfiler version //' VERSION && sed -n -i '$p' VERSION if [ -z "~{read2}" ] ; then INPUT_READS="-1 ~{read1}" @@ -119,6 +119,7 @@ task tbprofiler { cpu: cpu disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" - maxRetries: 3 + maxRetries: 3 + preemptible: 1 } } \ No newline at end of file From 8fcfefa47339e0fd40a268d8dabf69f86453f317 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 15 Sep 2023 14:53:15 +0000 Subject: [PATCH 08/12] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 2f5824979..20a0a19d2 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.5" + String docker = 
"us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.6" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 7d00801bd2756922f7771dcbd54f4153e5cc6bb1 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 15 Sep 2023 16:02:24 +0000 Subject: [PATCH 09/12] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 20a0a19d2..aecf88b8b 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.6" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.7" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 51ddccdbc96936329dbbf3bf3ef1f1e3904b287f Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 15 Sep 2023 16:56:18 +0000 Subject: [PATCH 10/12] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index aecf88b8b..4f82830a8 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.7" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.8" Int disk_size = 100 Int memory = 4 Int cpu = 1 From 9311d6a5e9da5100dabf1d1cf79ecf12c2019ca2 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 19 Sep 2023 18:33:59 +0000 Subject: [PATCH 11/12] update readme; fix broken link --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 
499546885..e548d01f5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Bioinformatics workflows for characterization, epidemiology and sharing of patho **More information about the steps undertaken in these workflows is available via the [Theiagen Public Resources Documentation](https://theiagen.notion.site/Theiagen-Public-Health-Resources-a4bd134b0c5c4fe39870e21029a30566).** -Support for running these workflows can be sought by raising a [GitHub issue](https://github.com/theiagen/public_health_bioinformatics/issues/new) or by contacting Theiagen at support@theiagen.com. +Support for running these workflows can be sought by raising a [GitHub issue](https://github.com/theiagen/public_health_bioinformatics/issues/new/choose) or by contacting Theiagen at support@theiagen.com. These workflows are written in [WDL](https://github.com/openwdl/wdl), a language for specifying data processing workflows with a human-readable and writeable syntax. They have been developed by [Theiagen Genomics](https://theiagen.com/) to primarily run on the [Terra.bio](https://terra.bio/) platform but can be run locally or on an HPC system at the command-line with Cromwell or miniWDL. @@ -13,7 +13,7 @@ These workflows are written in [WDL](https://github.com/openwdl/wdl), a language * Workflows and task development influenced by The Broad's [Viral Pipes](https://github.com/broadinstitute/viral-pipelines) * TheiaCoV workflows for viral genomic characterization influenced by UPHL's [Cecret](https://github.com/UPHL-BioNGS/Cecret) & StaPH-B's [Monroe](https://staph-b.github.io/staphb_toolkit/workflow_docs/monroe/) * TheiaProk workflows for bacterial genomic characterization influenced by Robert Petit's [bactopia](https://github.com/bactopia/bactopia) -* The PHB workflow user community. To provide feedback, please raise a [GitHub issue](https://github.com/theiagen/public_health_vioinformatics/issues/new). +* The PHB workflow user community. 
To provide feedback, please raise a [GitHub issue](https://github.com/theiagen/public_health_bioinformatics/issues/new/choose). ### Contributing to the PHB workflows Contributions to the workflows contained in this repository are warmly welcomed. Our style guide may be found [here](https://theiagen.notion.site/Style-Guide-WDL-Workflow-Development-bb456f34322d4f4db699d4029050481c) for convenience of formatting. From 71fcb6c98f018ce502d6c76dc4bc380f6200c9e5 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 20 Sep 2023 14:55:53 +0000 Subject: [PATCH 12/12] update docker --- tasks/species_typing/task_tbp_parser.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/species_typing/task_tbp_parser.wdl b/tasks/species_typing/task_tbp_parser.wdl index 4f82830a8..a4258914f 100644 --- a/tasks/species_typing/task_tbp_parser.wdl +++ b/tasks/species_typing/task_tbp_parser.wdl @@ -13,7 +13,7 @@ task tbp_parser { Int coverage_threshold = 100 Boolean tbp_parser_debug = false - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.8" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:0.0.9" Int disk_size = 100 Int memory = 4 Int cpu = 1