From 99ae6968b06f0ff3f1f187c39a4086960d42a5ad Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Tue, 31 Oct 2023 14:10:36 -0400 Subject: [PATCH 1/7] big changes to BWA task: new optional inputs for memory and docker; memory default increased from 8 to 16; broke up one liner to produce intermediate SAM file for downstream usage; added 2 samtools view commands to generate separate BAMs for aligned and unaligned reads; updated cmds for FASTQ extraction from 2 BAMS; added indexing for unaligned BAM; added 4 new outputs: read1&2 unaligned, sorted_bam_unaligned, and sorted_bam_unaligned_bai --- tasks/alignment/task_bwa.wdl | 71 +++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index ad067ea90..e8abe636a 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -7,7 +7,9 @@ task bwa { String samplename File? reference_genome Int cpu = 6 + Int memory = 16 Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan" } command <<< # date and version control @@ -25,25 +27,62 @@ task bwa { ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" fi - # Map with BWA MEM - echo "Running bwa mem -t ~{cpu} ${ref_genome} ~{read1} ~{read2} | samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam " + # Map with BWA MEM; pipe to samtools sort to write sorted SAM file bwa mem \ - -t ~{cpu} \ - "${ref_genome}" \ - ~{read1} ~{read2} |\ - samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam + -t ~{cpu} \ + "${ref_genome}" \ + ~{read1} \ + ~{read2} | \ + samtools sort -@ ~{cpu} > ~{samplename}.sorted.sam + + # convert SAM to BAM that only includes aligned reads + samtools view \ + -@ ~{cpu} \ + -F 4 \ + -b \ + -o ~{samplename}.sorted.bam \ + ~{samplename}.sorted.sam + # convert SAM to BAM that only includes unaligned reads + samtools view \ + -@ ~{cpu} \ + -f 4 \ + -b \ + -o ~{samplename}.sorted.unaligned-reads.bam \ + ~{samplename}.sorted.sam + + # if read2 was provided by user, extract both read1 and read2 from aligned and unaligned BAMs if [[ ! -z "~{read2}" ]]; then - echo "processing paired reads" - samtools fastq -F4 -1 ~{samplename}_R1.fastq.gz -2 ~{samplename}_R2.fastq.gz ~{samplename}.sorted.bam + echo "Generating FASTQs for aligned and unaligned paired reads" + samtools fastq \ + -F 4 \ + -1 ~{samplename}_R1.fastq.gz \ + -2 ~{samplename}_R2.fastq.gz \ + ~{samplename}.sorted.bam + # note the lowercase 'f' here is imporant + samtools fastq \ + -f 4 \ + -1 ~{samplename}_unaligned_R1.fastq.gz \ + -2 ~{samplename}_unaligned_R2.fastq.gz \ + ~{samplename}.sorted.unaligned-reads.bam else - echo "processing single-end reads" - samtools fastq -F4 ~{samplename}.sorted.bam | gzip > ~{samplename}_R1.fastq.gz + echo "Generating FASTQs for aligned and unaligned single-end reads" + samtools fastq \ + -@ ~{cpu} \ + -F 4 \ + -1 ~{samplename}_R1.fastq.gz \ + ~{samplename}.sorted.bam + # again, lowercase 'f' is important for getting all unaligned reads + samtools fastq \ + -@ ~{cpu} \ + -f 4 \ + -1 ~{samplename}_unaligned_R1.fastq.gz \ + ~{samplename}.sorted.unaligned-reads.bam fi - # index BAMs - samtools index ~{samplename}.sorted.bam + samtools index ~{samplename}.sorted.bam + samtools index ~{samplename}.sorted.unaligned-reads.bam >>> output { String bwa_version = read_string("BWA_VERSION") @@ -52,10 +91,14 @@ task bwa { File sorted_bai = "${samplename}.sorted.bam.bai" File read1_aligned = "~{samplename}_R1.fastq.gz" File? read2_aligned = "~{samplename}_R2.fastq.gz" + File read1_unaligned = "~{samplename}_unaligned_R1.fastq.gz" + File? read2_unaligned = "~{samplename}_unaligned_R2.fastq.gz" + File sorted_bam_unaligned = "~{samplename}.sorted.unaligned-reads.bam" + File sorted_bam_unaligned_bai = "~{samplename}.sorted.unaligned-reads.bam.bai" } runtime { - docker: "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan" - memory: "8 GB" + docker: docker + memory: memory + " GB" cpu: cpu disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" # TES From 4abfd6127ff956af0d2898c51d4eccae99b9440b Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Mon, 13 Nov 2023 14:52:17 -0500 Subject: [PATCH 2/7] added 4 new file outputs to wf_ivar_consensus subworkflow and theiacov_illumina_pe workflow; both ran successfully with miniwdl --- workflows/theiacov/wf_theiacov_illumina_pe.wdl | 4 ++++ workflows/utilities/wf_ivar_consensus.wdl | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index 8dff1ee42..9e6d6afa6 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -299,6 +299,10 @@ workflow theiacov_illumina_pe { File? read2_aligned = ivar_consensus.read2_aligned String aligned_bam = select_first([ivar_consensus.aligned_bam, ""]) String aligned_bai = select_first([ivar_consensus.aligned_bai, ""]) + File? read1_unaligned = ivar_consensus.read1_unaligned + File? read2_unaligned = ivar_consensus.read2_unaligned + File? sorted_bam_unaligned = ivar_consensus.sorted_bam_unaligned + File? sorted_bam_unaligned_bai = ivar_consensus.sorted_bam_unaligned_bai # Read Alignment - primer trimming outputs Float? primer_trimmed_read_percent = ivar_consensus.primer_trimmed_read_percent String? ivar_version_primtrim = ivar_consensus.ivar_version_primtrim diff --git a/workflows/utilities/wf_ivar_consensus.wdl b/workflows/utilities/wf_ivar_consensus.wdl index e88ead2a1..1a902f901 100644 --- a/workflows/utilities/wf_ivar_consensus.wdl +++ b/workflows/utilities/wf_ivar_consensus.wdl @@ -75,6 +75,10 @@ workflow ivar_consensus { File? read2_aligned = bwa.read2_aligned File aligned_bam = select_first([primer_trim.trim_sorted_bam, bwa.sorted_bam, ""]) # set default values for select_first() to avoid workflow failures File aligned_bai = select_first([primer_trim.trim_sorted_bai, bwa.sorted_bai, ""]) + File read1_unaligned = bwa.read1_unaligned + File? read2_unaligned = bwa.read2_unaligned + File sorted_bam_unaligned = bwa.sorted_bam_unaligned + File sorted_bam_unaligned_bai = bwa.sorted_bam_unaligned_bai # primer trimming outputs Float? primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent From 3d26d0d98d08fbdf724552c4accaa8d06d695234 Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Mon, 13 Nov 2023 15:02:27 -0500 Subject: [PATCH 3/7] added 3 new optional outputs to theiacov_illumina_se: read1_unaligned, sorted_bam_unaligned, sorted_bam_unaligned_bai. tested fine w miniwdl --- workflows/theiacov/wf_theiacov_illumina_se.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index 349251177..96f297152 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -212,6 +212,9 @@ workflow theiacov_illumina_se { String? assembly_method = ivar_consensus.assembly_method_nonflu File? aligned_bam = ivar_consensus.aligned_bam File? aligned_bai = ivar_consensus.aligned_bai + File? read1_unaligned = ivar_consensus.read1_unaligned + File? sorted_bam_unaligned = ivar_consensus.sorted_bam_unaligned + File? sorted_bam_unaligned_bai = ivar_consensus.sorted_bam_unaligned_bai # Read Alignment - primer trimming outputs Float? primer_trimmed_read_percent = ivar_consensus.primer_trimmed_read_percent String? ivar_version_primtrim = ivar_consensus.ivar_version_primtrim From 88e1d69533ba95f0fcc50c7d85ea5b3802ae3787 Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Tue, 14 Nov 2023 11:48:37 -0500 Subject: [PATCH 4/7] added in cpu option for samtools fastq commands. still need to figure out why so much data is being streamed to STDOUT and not directly to a file --- tasks/alignment/task_bwa.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index e8abe636a..425e69e51 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -55,12 +55,14 @@ task bwa { if [[ ! -z "~{read2}" ]]; then echo "Generating FASTQs for aligned and unaligned paired reads" samtools fastq \ + -@ ~{cpu} \ -F 4 \ -1 ~{samplename}_R1.fastq.gz \ -2 ~{samplename}_R2.fastq.gz \ ~{samplename}.sorted.bam # note the lowercase 'f' here is imporant samtools fastq \ + -@ ~{cpu} \ -f 4 \ -1 ~{samplename}_unaligned_R1.fastq.gz \ -2 ~{samplename}_unaligned_R2.fastq.gz \ From b5b41dd66c55b3c2c78d04afd75de1102ea6716f Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Thu, 7 Dec 2023 12:14:51 -0500 Subject: [PATCH 5/7] updates to bwa task, ran successfully w miniwdl. Added echo statements to count reads before and after alignment & re-production of FASTQs; broke samtools sort into multiple lines for readability; added helpful echo statements throughout; fixed the samtools fastq commands that were not appropriately outputting FASTQ files --- tasks/alignment/task_bwa.wdl | 39 ++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index 425e69e51..cfec703d8 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -27,13 +27,18 @@ task bwa { ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" fi + echo "input R1 has $(zcat ~{read1} | grep -c '^@') reads as input" + echo "input R2 has $(zcat ~{read2} | grep -c '^@') reads as input" + # Map with BWA MEM; pipe to samtools sort to write sorted SAM file bwa mem \ -t ~{cpu} \ "${ref_genome}" \ ~{read1} \ ~{read2} | \ - samtools sort -@ ~{cpu} > ~{samplename}.sorted.sam + samtools sort \ + -@ ~{cpu} - \ + > ~{samplename}.sorted.sam # convert SAM to BAM that only includes aligned reads samtools view \ @@ -51,15 +56,21 @@ task bwa { -o ~{samplename}.sorted.unaligned-reads.bam \ ~{samplename}.sorted.sam + # see here for "samtools fastq" options: https://www.htslib.org/doc/samtools-fasta.html + # TL;DR is that "samtools fastq -1 R1.fastq -2 R2.fastq" works with paired-end inputs and will output R1 and R2 reads to separate files due to tags in the SAM & BAM file + + # AFAIK for single end alignments w/ bwa mem, the output SAM/BAM files do not have tags to differentiate between R1 and R2 reads, so the "samtools fastq -0 R1.fastq" command is used to output all reads to a single file + # if read2 was provided by user, extract both read1 and read2 from aligned and unaligned BAMs if [[ ! -z "~{read2}" ]]; then - echo "Generating FASTQs for aligned and unaligned paired reads" + echo "Generating FASTQs for aligned reads" samtools fastq \ -@ ~{cpu} \ -F 4 \ -1 ~{samplename}_R1.fastq.gz \ -2 ~{samplename}_R2.fastq.gz \ ~{samplename}.sorted.bam + echo "Generating FASTQs for unaligned reads" # note the lowercase 'f' here is imporant samtools fastq \ -@ ~{cpu} \ @@ -68,23 +79,39 @@ task bwa { -2 ~{samplename}_unaligned_R2.fastq.gz \ ~{samplename}.sorted.unaligned-reads.bam else - echo "Generating FASTQs for aligned and unaligned single-end reads" + echo "Generating FASTQs for aligned single-end reads" samtools fastq \ -@ ~{cpu} \ -F 4 \ - -1 ~{samplename}_R1.fastq.gz \ - ~{samplename}.sorted.bam + -0 ~{samplename}_R1.fastq.gz \ + ~{samplename}.sorted.bam + echo "Generating FASTQs for unaligned single-end reads" # again, lowercase 'f' is important for getting all unaligned reads samtools fastq \ -@ ~{cpu} \ -f 4 \ - -1 ~{samplename}_unaligned_R1.fastq.gz \ + -0 ~{samplename}_unaligned_R1.fastq.gz \ ~{samplename}.sorted.unaligned-reads.bam fi # index BAMs samtools index ~{samplename}.sorted.bam samtools index ~{samplename}.sorted.unaligned-reads.bam + + # count output reads to ensure we are outputting all reads, regardless if the aligned or not + # if read2 does exist as input, count both R1 and R2 + if [[ ! -z "~{read2}" ]]; then + echo "output R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input" + echo "output R2_aligned has $(zcat ~{samplename}_R2.fastq.gz | grep -c '^@') reads as input" + echo + echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input" + echo "output R2_unaligned has $(zcat ~{samplename}_unaligned_R2.fastq.gz | grep -c '^@') reads as input" + # else = if read2 does not exist as input, only count R1 + else + echo "output R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input" + echo + echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input" + fi >>> output { String bwa_version = read_string("BWA_VERSION") From 714931ca344241c283479a2cffcfc3aead1fbf2e Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Fri, 8 Dec 2023 16:12:30 -0500 Subject: [PATCH 6/7] turn on memory retry feature for bwa WDL task --- tasks/alignment/task_bwa.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index cfec703d8..0448a74c5 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -132,6 +132,6 @@ task bwa { disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" # TES preemptible: 0 - #maxRetries: 3 + maxRetries: 3 } } \ No newline at end of file From c107e8135d8159881bd246f61a5a323a006ae0af Mon Sep 17 00:00:00 2001 From: jrotieno Date: Wed, 7 Feb 2024 17:13:04 +0000 Subject: [PATCH 7/7] add a check on read compression and end of line formatting --- tasks/alignment/task_bwa.wdl | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index 0448a74c5..eeecbf0eb 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -27,8 +27,15 @@ task bwa { ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" fi - echo "input R1 has $(zcat ~{read1} | grep -c '^@') reads as input" - echo "input R2 has $(zcat ~{read2} | grep -c '^@') reads as input" + # set cat command based on compression + if [[ "~{read1}" == *".gz" ]] ; then + cat_reads="zcat" + else + cat_reads="cat" + fi + + echo -e "\ninput R1 has $(${cat_reads} ~{read1} | grep -c '^@') reads as input" + echo "input R2 has $(${cat_reads} ~{read2} | grep -c '^@') reads as input" # Map with BWA MEM; pipe to samtools sort to write sorted SAM file bwa mem \ @@ -63,7 +70,7 @@ task bwa { # if read2 was provided by user, extract both read1 and read2 from aligned and unaligned BAMs if [[ ! -z "~{read2}" ]]; then - echo "Generating FASTQs for aligned reads" + echo -e "\nGenerating FASTQs for aligned reads" samtools fastq \ -@ ~{cpu} \ -F 4 \ @@ -79,13 +86,13 @@ task bwa { -2 ~{samplename}_unaligned_R2.fastq.gz \ ~{samplename}.sorted.unaligned-reads.bam else - echo "Generating FASTQs for aligned single-end reads" + echo -e "\nGenerating FASTQs for aligned single-end reads\n" samtools fastq \ -@ ~{cpu} \ -F 4 \ -0 ~{samplename}_R1.fastq.gz \ ~{samplename}.sorted.bam - echo "Generating FASTQs for unaligned single-end reads" + echo -e "Generating FASTQs for unaligned single-end reads\n" # again, lowercase 'f' is important for getting all unaligned reads samtools fastq \ -@ ~{cpu} \ @@ -101,7 +108,7 @@ task bwa { # count output reads to ensure we are outputting all reads, regardless if the aligned or not # if read2 does exist as input, count both R1 and R2 if [[ ! -z "~{read2}" ]]; then - echo "output R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input" + echo -e "\noutput R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input" echo "output R2_aligned has $(zcat ~{samplename}_R2.fastq.gz | grep -c '^@') reads as input" echo echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input"